Add voice control, working but needs more work

This commit is contained in:
cristhian aguilera
2026-01-31 11:41:50 -03:00
parent 380c466170
commit b9798a2f46
21 changed files with 3101 additions and 0 deletions

22
config.toml Normal file
View File

@@ -0,0 +1,22 @@
[roi]
roi_top_left = [500, 230]
roi_bottom_right = [775, 510]
[bucket_positions]
blue_bucket_pos = [400, 90]
red_bucket_pos = [550, 90]
yellow_bucket_pos = [550, 630]
white_bucket_pos = [400, 630]
[dominant_colors]
blue = [255, 0, 0]
red = [0, 0, 255]
yellow = [0, 255, 255]
white = [255, 255, 255]
[object_parameters]
size_threshold = 4200
big_height = 125.9
small_height = 106.0
bottom_height = 68.0
normal_height = 220.0

View File

@@ -0,0 +1,133 @@
nodes:
- id: zed_camera_cpp
build: bash -lc "cmake -S dora_zed_cpp -B dora_zed_cpp/build && cmake --build dora_zed_cpp/build"
path: dora_zed_cpp/build/dora_zed_cpp
env:
ZED_RESOLUTION: "720"
ZED_FPS: "15"
ZED_DEPTH_MODE: "NEURAL"
ZED_DEPTH_MIN_MM: "10"
ZED_DEPTH_MAX_MM: "600"
ZED_DEPTH_FILL: "false"
ZED_FLIP: "ON"
ZED_WARMUP_FRAMES: "30"
inputs:
tick: dora/timer/millis/100
outputs:
- image_bgr
- camera_info
- point_cloud
- id: ulite6
build: uv pip install -e dora_ulite6
path: dora_ulite6/dora_ulite6/main.py
inputs:
tick: dora/timer/millis/10
command: voice/robot_cmd
outputs:
- tcp_pose
- status
env:
ROBOT_IP: "192.168.1.192"
DEFAULT_SPEED: "30"
DEFAULT_UNITS: "mm"
API_HOST: "0.0.0.0"
API_PORT: "9000"
VACUUM_ENABLED: "true"
# Initial position on startup: "home", "pose", or "none"
# Set to "none" - voice control handles initial positioning
INIT_MODE: "none"
- id: iobridge
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_iobridge
path: dora_iobridge/dora_iobridge/main.py
env:
VIRTUAL_ENV: ./.venv
VOICE_HOST: "0.0.0.0"
VOICE_PORT: "8765"
VOICE_IN_OUTPUT: "voice_in"
VOICE_OUT_INPUT: "voice_out"
SCENE_INPUT: "scene_update"
inputs:
voice_out: voice/voice_out
scene_update: voice/scene_update
tick: dora/timer/millis/100
outputs:
- voice_in
- id: detector
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_detector
path: dora_detector/dora_detector/main.py
env:
VIRTUAL_ENV: ./.venv
IMAGE_INPUT: "image_bgr"
POINT_CLOUD_INPUT: "point_cloud"
POSE_INPUT: "tcp_pose"
OBJECTS_OUTPUT: "objects"
IMAGE_OUTPUT: "image_annotated"
CALIBRATION_FILE: "calibration_ulite6_zed.npz"
DETECTOR_WEIGHTS: "trained_models/yolo8n.pt"
CONFIG_FILE: "config.toml"
ROI_TOP_LEFT: "500,230"
ROI_BOTTOM_RIGHT: "775,510"
SIZE_THRESHOLD: "4200"
DETECT_EVERY_N: "3"
MIN_DEPTH_MM: "10"
MAX_DEPTH_MM: "600"
inputs:
image_bgr: zed_camera_cpp/image_bgr
point_cloud: zed_camera_cpp/point_cloud
tcp_pose: ulite6/tcp_pose
tick: dora/timer/millis/100
outputs:
- objects
- image_annotated
- id: voice
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_voice_control
path: dora_voice_control/dora_voice_control/main.py
env:
VIRTUAL_ENV: ./.venv
OBJECTS_INPUT: "objects"
POSE_INPUT: "tcp_pose"
STATUS_INPUT: "status"
COMMAND_OUTPUT: "robot_cmd"
CONFIG_FILE: "config.toml"
# Map Spanish command names to detector class names
CLASS_MAP: '{"cilindro": "cylinder", "cubo": "cube", "estrella": "star", "caja": "box", "amarillo": "yellow", "rojo": "red", "azul": "blue", "blanco": "white", "grande": "big", "pequeno": "small"}'
VOICE_IN_INPUT: "voice_in"
VOICE_OUT_OUTPUT: "voice_out"
SCENE_OUTPUT: "scene_update"
TCP_OFFSET_MM: "63.0"
APPROACH_OFFSET_MM: "50.0"
STEP_MM: "20.0"
DEFAULT_ROLL: "180.0"
DEFAULT_PITCH: "0.0"
DEFAULT_YAW: "0.0"
DRY_RUN: "false"
# Initial position (used on startup and reset command)
INIT_ON_START: "true"
INIT_X: "300.0"
INIT_Y: "0.0"
INIT_Z: "350.0"
INIT_ROLL: "180.0"
INIT_PITCH: "0.0"
INIT_YAW: "0.0"
IMAGE_INPUT: "image_annotated"
IMAGE_WIDTH: "1280"
IMAGE_HEIGHT: "720"
API_ENABLED: "true"
API_PORT: "8080"
inputs:
objects: detector/objects
tcp_pose: ulite6/tcp_pose
status: ulite6/status
voice_in: iobridge/voice_in
image_annotated: detector/image_annotated
tick: dora/timer/millis/100
outputs:
- robot_cmd
- voice_out
- scene_update

View File

@@ -0,0 +1 @@
"""Dora detector node package."""

View File

@@ -0,0 +1,513 @@
"""Dora node for YOLO detection and base-frame object localization."""
from __future__ import annotations
import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
from ultralytics import YOLO
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
@dataclass
class DetectionConfig:
    """Tunable parameters for YOLO detection, ROI filtering and color naming.

    Built once at startup by ``_build_config`` from ``config.toml`` values,
    each overridable via environment variables.
    """

    imgsz: int  # YOLO inference image size (pixels)
    conf: float  # YOLO confidence threshold
    iou: float  # YOLO NMS IoU threshold
    size_threshold: int  # bbox pixel area separating "big" from "small"
    roi_top_left: Tuple[int, int]  # ROI upper-left corner (x, y) in pixels
    roi_bottom_right: Tuple[int, int]  # ROI lower-right corner (x, y) in pixels
    use_roi: bool  # when True, detections outside the ROI are discarded
    detect_every_n: int  # run detection only on every N-th image frame
    min_depth_mm: float  # minimum accepted depth for point-cloud samples
    max_depth_mm: float  # maximum accepted depth for point-cloud samples
    color_blue: Tuple[int, int, int]  # reference BGR value for "blue"
    color_red: Tuple[int, int, int]  # reference BGR value for "red"
    color_yellow: Tuple[int, int, int]  # reference BGR value for "yellow"
    color_white: Tuple[int, int, int]  # reference BGR value for "white"
def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return int(parts[0]), int(parts[1])
except Exception:
pass
return default
def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return float(parts[0]), float(parts[1])
except Exception:
pass
return default
def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 3:
return int(parts[0]), int(parts[1]), int(parts[2])
except Exception:
pass
return default
def _rotation_matrix_xyz(
roll_deg: float, pitch_deg: float, yaw_deg: float
) -> np.ndarray:
roll = np.deg2rad(roll_deg)
pitch = np.deg2rad(pitch_deg)
yaw = np.deg2rad(yaw_deg)
cx, sx = np.cos(roll), np.sin(roll)
cy, sy = np.cos(pitch), np.sin(pitch)
cz, sz = np.cos(yaw), np.sin(yaw)
rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]])
rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]])
rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]])
return rot_z @ rot_y @ rot_x
def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray:
    """Build a 4x4 homogeneous transform from ``[x, y, z, roll, pitch, yaw]``.

    Translation is given in millimetres (converted to metres here); angles
    are in degrees and delegated to ``_rotation_matrix_xyz``.
    """
    x_mm, y_mm, z_mm, roll, pitch, yaw = tcp_pose_mm_deg
    transform = np.eye(4)
    transform[:3, :3] = _rotation_matrix_xyz(roll, pitch, yaw)
    transform[:3, 3] = np.array([x_mm, y_mm, z_mm], dtype=np.float64) / 1000.0
    return transform
def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
encoding = str(metadata.get("encoding", "bgr8")).lower()
width = metadata.get("width")
height = metadata.get("height")
if (width is None or height is None) and "shape" in metadata:
shape = metadata.get("shape")
if isinstance(shape, (list, tuple)) and len(shape) >= 2:
height = height if height is not None else int(shape[0])
width = width if width is not None else int(shape[1])
if width is None or height is None:
raise KeyError("width/height (or shape) missing from metadata")
if encoding == "bgr8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return frame.copy()
if encoding == "rgb8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}:
frame = storage.to_numpy().astype(np.uint8)
return cv2.imdecode(frame, cv2.IMREAD_COLOR)
if encoding == "yuv420":
yuv = storage.to_numpy().astype(np.uint8)
yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width))
return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR)
raise RuntimeError(f"Unsupported image encoding: {encoding}")
def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
dtype_str = str(metadata.get("dtype", "float32"))
if dtype_str != "float32":
raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}")
shape = metadata.get("shape")
if not isinstance(shape, (list, tuple)) or len(shape) < 3:
raise KeyError("point cloud shape missing from metadata")
height, width, channels = [int(v) for v in shape[:3]]
if channels < 3:
raise ValueError("point cloud requires at least 3 channels")
raw = storage.to_numpy().astype(np.uint8).tobytes()
values = np.frombuffer(raw, dtype=np.float32)
return values.reshape((height, width, channels))
def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool:
if not np.all(np.isfinite(point_xyz)):
return False
z = float(point_xyz[2])
if z <= 0:
return False
if z < cfg.min_depth_mm or z > cfg.max_depth_mm:
return False
return True
def _sample_point(
    point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig
) -> Optional[np.ndarray]:
    """Read the 3D point at pixel (x, y), with a median-of-neighbors fallback.

    Returns the point directly when it passes ``_valid_point``; otherwise
    the component-wise median of every valid point in the surrounding 7x7
    window, or ``None`` when the pixel is out of bounds or no valid
    neighbor exists.
    """
    rows, cols = point_cloud.shape[:2]
    if not (0 <= x < cols and 0 <= y < rows):
        return None
    center = point_cloud[y, x, :3].astype(np.float64)
    if _valid_point(center, cfg):
        return center
    radius = 3
    neighbors = []
    for yy in range(max(0, y - radius), min(rows, y + radius + 1)):
        for xx in range(max(0, x - radius), min(cols, x + radius + 1)):
            candidate = point_cloud[yy, xx, :3].astype(np.float64)
            if _valid_point(candidate, cfg):
                neighbors.append(candidate)
    if not neighbors:
        return None
    return np.median(np.stack(neighbors, axis=0), axis=0)
def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]:
x1, y1, x2, y2 = bbox
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(image.shape[1], x2)
y2 = min(image.shape[0], y2)
roi = image[y1:y2, x1:x2]
if roi.size == 0:
return (0, 0, 0)
color = np.median(roi, axis=(0, 1)).astype(int)
return int(color[0]), int(color[1]), int(color[2])
def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str:
colors = {
"blue": np.array(cfg.color_blue, dtype=np.float64),
"red": np.array(cfg.color_red, dtype=np.float64),
"yellow": np.array(cfg.color_yellow, dtype=np.float64),
"white": np.array(cfg.color_white, dtype=np.float64),
}
color_vec = np.array(color, dtype=np.float64)
best_name = "unknown"
best_dist = float("inf")
for name, value in colors.items():
dist = np.linalg.norm(color_vec - value)
if dist < best_dist:
best_name = name
best_dist = dist
return best_name
def _load_calibration(calibration_file: str) -> np.ndarray:
calib = np.load(calibration_file, allow_pickle=True)
t_cam2gripper = calib["T_cam2gripper"]
return t_cam2gripper
def _load_config_file(path: str) -> Dict[str, Any]:
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception:
return {}
def _build_config(config_path: str) -> DetectionConfig:
    """Assemble a ``DetectionConfig`` from the TOML file plus env overrides.

    Precedence per setting: environment variable > config.toml value >
    hard-coded default.
    """
    file_cfg = _load_config_file(config_path)
    roi_cfg = file_cfg.get("roi", {})
    colors_cfg = file_cfg.get("dominant_colors", {})
    obj_cfg = file_cfg.get("object_parameters", {})

    def pair_setting(env_name, key, fallback):
        # Env var wins; otherwise serialize the TOML value as "a,b" and parse.
        default_raw = ",".join(str(v) for v in roi_cfg.get(key, list(fallback)))
        return _parse_int_pair(os.getenv(env_name, default_raw), fallback)

    def color_setting(env_name, key, fallback):
        # Same precedence for the "b,g,r" reference colors.
        default_raw = ",".join(str(v) for v in colors_cfg.get(key, list(fallback)))
        return _parse_color(os.getenv(env_name, default_raw), fallback)

    return DetectionConfig(
        imgsz=int(os.getenv("YOLO_IMGSZ", "640")),
        conf=float(os.getenv("YOLO_CONF", "0.25")),
        iou=float(os.getenv("YOLO_IOU", "0.45")),
        size_threshold=int(
            os.getenv("SIZE_THRESHOLD", str(obj_cfg.get("size_threshold", 4200)))
        ),
        roi_top_left=pair_setting("ROI_TOP_LEFT", "roi_top_left", (500, 230)),
        roi_bottom_right=pair_setting("ROI_BOTTOM_RIGHT", "roi_bottom_right", (775, 510)),
        use_roi=os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes"),
        detect_every_n=int(os.getenv("DETECT_EVERY_N", "3")),
        min_depth_mm=float(os.getenv("MIN_DEPTH_MM", "10")),
        max_depth_mm=float(os.getenv("MAX_DEPTH_MM", "600")),
        color_blue=color_setting("COLOR_BLUE", "blue", (255, 0, 0)),
        color_red=color_setting("COLOR_RED", "red", (0, 0, 255)),
        color_yellow=color_setting("COLOR_YELLOW", "yellow", (0, 255, 255)),
        color_white=color_setting("COLOR_WHITE", "white", (255, 255, 255)),
    )
def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool:
if not cfg.use_roi:
return True
x1, y1, x2, y2 = bbox
rx1, ry1 = cfg.roi_top_left
rx2, ry2 = cfg.roi_bottom_right
return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2
def _draw_detections(
    frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig
) -> np.ndarray:
    """Draw bounding boxes and labels on frame.

    Args:
        frame: BGR image to annotate; not modified (a copy is returned).
        objects: detection dicts as assembled in ``main`` (keys used here:
            bbox, color, object_type, size, position_mm).
        cfg: detection config providing the ROI corners and ``use_roi`` flag.

    Returns:
        A copy of *frame* with the ROI outline, one rectangle per object, a
        "type color size" label, and the base-frame position in mm.
    """
    annotated = frame.copy()
    # Draw ROI rectangle (always visible): green when ROI filtering is
    # active, grey when disabled.
    cv2.rectangle(
        annotated,
        cfg.roi_top_left,
        cfg.roi_bottom_right,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )
    # Label the ROI
    cv2.putText(
        annotated,
        "ROI",
        (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )
    # Color mapping for visualization (BGR box colors keyed by color name)
    color_map = {
        "blue": (255, 100, 0),
        "red": (0, 0, 255),
        "yellow": (0, 255, 255),
        "white": (200, 200, 200),
        "unknown": (128, 128, 128),
    }
    for obj in objects:
        bbox = obj.get("bbox", [0, 0, 0, 0])
        color_name = obj.get("color", "unknown")
        obj_type = obj.get("object_type", "?")
        size = obj.get("size", "?")
        pos = obj.get("position_mm", [0, 0, 0])
        color = color_map.get(color_name, (128, 128, 128))
        # Draw bounding box
        cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
        # Draw label background (filled rectangle sized to the text, above the box)
        label = f"{obj_type} {color_name} {size}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(
            annotated,
            (bbox[0], bbox[1] - th - 8),
            (bbox[0] + tw + 4, bbox[1]),
            color,
            -1,
        )
        cv2.putText(
            annotated,
            label,
            (bbox[0] + 2, bbox[1] - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
        )
        # Draw position (base-frame x/y/z in millimetres, below the box)
        pos_label = f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]"
        cv2.putText(
            annotated,
            pos_label,
            (bbox[0], bbox[3] + 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            color,
            1,
        )
    return annotated
def main() -> None:
    """Detector node entry point: YOLO inference plus base-frame localization.

    Event loop over Dora inputs: caches the latest TCP pose and point cloud,
    runs detection on every N-th image frame, converts each detection's
    camera-frame 3D point into the robot base frame via the hand-eye
    calibration, then publishes a JSON object list and an annotated image.
    """
    image_input = os.getenv("IMAGE_INPUT", "image_bgr")
    point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    objects_output = os.getenv("OBJECTS_OUTPUT", "objects")
    image_output = os.getenv("IMAGE_OUTPUT", "image_annotated")
    calibration_file = os.getenv("CALIBRATION_FILE", "calibration.npz")
    weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS)
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    cfg = _build_config(config_file)
    model = YOLO(weights)
    # Prefer a copy of the calibration under ./config/ when one exists.
    if calibration_file and not os.path.isabs(calibration_file):
        config_path = os.path.join("config", calibration_file)
        calibration_file = config_path if os.path.exists(config_path) else calibration_file
    t_cam2gripper = _load_calibration(calibration_file)
    node = Node()
    # Latest-known state from sibling nodes; detection waits for both.
    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None  # NOTE(review): written, never read
    latest_point_cloud: Optional[np.ndarray] = None
    latest_pc_at: Optional[float] = None  # NOTE(review): written, never read
    frame_count = 0
    for event in node:
        if event["type"] != "INPUT":
            continue
        now = time.monotonic()
        if event["id"] == pose_input:
            # Cache the robot TCP pose [x, y, z, roll, pitch, yaw].
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                latest_pose = tcp_pose[:6].tolist()
                latest_pose_at = now
            continue
        if event["id"] == point_cloud_input:
            latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {}))
            latest_pc_at = now
            continue
        if event["id"] != image_input:
            continue
        # Throttle: run YOLO only on every N-th image frame.
        frame_count += 1
        if frame_count % max(1, cfg.detect_every_n) != 0:
            continue
        if latest_pose is None or latest_point_cloud is None:
            continue
        frame = _decode_image(event["value"], event.get("metadata", {}))
        results = model.predict(
            frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False
        )[0]
        base_T_flange = _pose_to_matrix(latest_pose)
        objects: List[Dict[str, Any]] = []
        for r in results.boxes:
            bbox = [int(x) for x in r.xyxy[0]]
            if not _within_roi(bbox, cfg):
                continue
            # Sample the 3D point at the bbox center (with neighbor fallback).
            cx = int((bbox[0] + bbox[2]) / 2)
            cy = int((bbox[1] + bbox[3]) / 2)
            point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg)
            if point_cam_mm is None:
                continue
            # Homogeneous camera-frame point in metres, then chain the
            # hand-eye calibration and the current flange pose to express
            # the point in the robot base frame.
            point_cam_m = np.array(
                [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0],
                dtype=np.float64,
            )
            point_cam_m[:3] /= 1000.0
            point_base = base_T_flange @ t_cam2gripper @ point_cam_m
            point_base_mm = point_base[:3] * 1000.0
            dominant = _dominant_color(frame, bbox)
            color_name = _closest_color(dominant, cfg)
            # Coarse size class from the bbox pixel area.
            area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
            size_label = "big" if area >= cfg.size_threshold else "small"
            objects.append(
                {
                    "object_type": results.names[int(r.cls.item())],
                    "confidence": float(r.conf.item()),
                    "color": color_name,
                    "size": size_label,
                    "bbox": bbox,
                    "center_px": [cx, cy],
                    "position_mm": [
                        float(point_base_mm[0]),
                        float(point_base_mm[1]),
                        float(point_base_mm[2]),
                    ],
                    "timestamp_ns": time.time_ns(),
                }
            )
        payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()})
        node.send_output(
            objects_output,
            pa.array([payload]),
            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        # Send annotated image
        # NOTE(review): ravel().tolist() allocates millions of Python ints per
        # frame; pa.array(annotated.ravel()) would avoid that but changes the
        # Arrow dtype (int64 -> uint8) — confirm downstream consumers first.
        annotated = _draw_detections(frame, objects, cfg)
        h, w = annotated.shape[:2]
        node.send_output(
            image_output,
            pa.array(annotated.ravel().tolist()),
            metadata={
                "encoding": "bgr8",
                "width": w,
                "height": h,
                "timestamp_ns": time.time_ns(),
            },
        )

View File

@@ -0,0 +1,19 @@
[project]
name = "dora-detector"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for YOLO-based object detection with ZED point cloud"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"opencv-python >= 4.1.1",
"pyarrow >= 12.0.0",
"ultralytics >= 8.0.0",
]
[project.scripts]
dora-detector = "dora_detector.main:main"

178
dora_iobridge/README.md Normal file
View File

@@ -0,0 +1,178 @@
# Dora IOBridge Node
A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates.
## Inputs/Outputs
| Input | Type | Description |
|----------------|--------|---------------------------------------|
| `voice_out` | JSON | Response from voice control node |
| `scene_update` | JSON | Scene objects from voice control |
| Output | Type | Description |
|----------------|--------|---------------------------------------|
| `voice_in` | string | Voice commands forwarded to Dora |
## Environment Variables
```bash
VOICE_HOST=0.0.0.0 # Bind address
VOICE_PORT=8765 # Listen port
```
## Installation
```bash
cd dora_iobridge
pip install -e .
```
## Testing
### Test with WebSocket (wscat)
```bash
# Install wscat
npm install -g wscat
# Connect to the server
wscat -c ws://localhost:8765
```
### Test with websocat
```bash
# Install websocat
# Ubuntu: sudo apt install websocat
# macOS: brew install websocat
# Send a ping
echo '{"type": "ping"}' | websocat ws://localhost:8765
# Response: {"type": "pong"}
# Send a voice command
echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765
# Request scene refresh
echo '{"type": "scene_refresh"}' | websocat ws://localhost:8765
```
### Test with Python
```python
import asyncio
import websockets
import json
async def test_iobridge():
uri = "ws://localhost:8765"
async with websockets.connect(uri) as ws:
# Test ping
await ws.send(json.dumps({"type": "ping"}))
response = await ws.recv()
print(f"Ping response: {response}")
# Send command
await ws.send(json.dumps({
"type": "command",
"text": "agarra el cubo rojo"
}))
# Listen for responses
async for message in ws:
data = json.loads(message)
print(f"Received: {data}")
asyncio.run(test_iobridge())
```
### Test with curl (HTTP upgrade not supported directly)
Since WebSocket requires an upgrade handshake, use this shell script:
```bash
#!/bin/bash
# test_iobridge.sh
# Using websocat for interactive testing
websocat ws://localhost:8765 <<EOF
{"type": "ping"}
{"type": "command", "text": "sube"}
{"type": "scene_refresh"}
EOF
```
## WebSocket Message Types
### Client -> Server
**Command (voice input)**
```json
{"type": "command", "text": "agarra el cubo rojo"}
```
**Ping (health check)**
```json
{"type": "ping"}
```
Response: `{"type": "pong"}`
**Scene Refresh**
```json
{"type": "scene_refresh"}
```
### Server -> Client (Broadcasts)
**Command Response**
```json
{
"type": "response",
"text": "Ok, voy a tomar",
"status": "ok"
}
```
**Scene Update**
```json
{
"type": "scene_updated",
"objects": [
{
"object_type": "cubo",
"color": "rojo",
"size": "big",
"position_mm": [150.0, 200.0, 280.0],
"source": "detection"
}
]
}
```
## Dora Dataflow Configuration
```yaml
nodes:
- id: iobridge
build: pip install -e ./dora_iobridge
path: dora_iobridge
inputs:
voice_out: voice_control/voice_out
scene_update: voice_control/scene_update
outputs:
- voice_in
env:
VOICE_HOST: "0.0.0.0"
VOICE_PORT: "8765"
```
```bash
dora up
dora start dataflow.yml
```
## Dependencies
- dora-rs >= 0.3.9
- pyarrow >= 12.0.0
- websockets >= 12.0

View File

@@ -0,0 +1 @@
"""Dora IO bridge node package."""

View File

@@ -0,0 +1,145 @@
"""Dora node bridging WebSocket IO to Dora topics."""
from __future__ import annotations
import asyncio
import json
import os
import threading
import time
from typing import Any, Dict, Optional, Set
import pyarrow as pa
from dora import Node
from websockets.server import serve, WebSocketServerProtocol
class IoBridgeServer:
    """WebSocket server bridging browser clients and the Dora dataflow.

    Incoming JSON messages are routed by their ``type`` field; replies and
    broadcasts are JSON-encoded before sending. The ``command_handler`` and
    ``scene_refresh_handler`` attributes are pluggable callables set by
    ``main`` and may be either plain functions or coroutine functions.
    """

    def __init__(self, host: str, port: int):
        self.host = host
        self.port = port
        # Currently connected clients; the broadcast target set.
        self.clients: Set[WebSocketServerProtocol] = set()
        # Callable(text) invoked for "command" messages; sync or async.
        self.command_handler = None
        # Callable() returning the scene objects list; sync or async.
        self.scene_refresh_handler = None

    async def handler(self, websocket: WebSocketServerProtocol):
        """Per-connection loop: register the client, route messages, unregister."""
        self.clients.add(websocket)
        try:
            async for message in websocket:
                try:
                    data = json.loads(message)
                except json.JSONDecodeError:
                    await websocket.send(
                        json.dumps({"type": "error", "text": "Invalid JSON message"})
                    )
                    continue
                response = await self._route_message(data, websocket)
                if response:
                    await websocket.send(json.dumps(response))
        finally:
            self.clients.discard(websocket)

    @staticmethod
    async def _maybe_await(result):
        """Await *result* when a handler returned a coroutine; pass through otherwise."""
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def _route_message(
        self, data: Dict[str, Any], websocket: WebSocketServerProtocol
    ) -> Optional[Dict[str, Any]]:
        """Dispatch one parsed message; return a reply dict or ``None``.

        BUG FIX: handlers registered by ``main`` may be plain (sync)
        functions. The previous code unconditionally ``await``-ed the
        handler's return value, which raised ``TypeError`` for sync
        handlers (awaiting ``None``). Results are now awaited only when
        they are coroutines.
        """
        msg_type = data.get("type")
        if msg_type == "command":
            text = data.get("text", "")
            if self.command_handler:
                await self._maybe_await(self.command_handler(text))
                return None
            return {"type": "error", "text": "No command handler registered"}
        if msg_type == "ping":
            return {"type": "pong"}
        if msg_type == "scene_refresh":
            if self.scene_refresh_handler:
                objects = await self._maybe_await(self.scene_refresh_handler())
                return {"type": "scene_updated", "objects": objects}
            return {"type": "error", "text": "No scene handler registered"}
        return {"type": "error", "text": f"Unknown message type: {msg_type}"}

    async def broadcast(self, message: Dict[str, Any]):
        """Send *message* to every connected client, ignoring per-client errors."""
        if not self.clients:
            return
        payload = json.dumps(message)
        await asyncio.gather(
            *[client.send(payload) for client in self.clients], return_exceptions=True
        )

    async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol):
        """Send *message* to a single client."""
        await websocket.send(json.dumps(message))

    async def start(self):
        """Serve until cancelled (the awaited Future never completes)."""
        async with serve(self.handler, self.host, self.port):
            await asyncio.Future()
def main() -> None:
    """Run the WebSocket bridge alongside the Dora event loop.

    A daemon thread runs the asyncio WebSocket server; the main thread
    consumes Dora events and re-broadcasts ``voice_out`` / ``scene_update``
    payloads to every connected client.
    """
    host = os.getenv("VOICE_HOST", "0.0.0.0")
    port = int(os.getenv("VOICE_PORT", "8765"))
    input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in")
    response_input = os.getenv("VOICE_OUT_INPUT", "voice_out")
    scene_input = os.getenv("SCENE_INPUT", "scene_update")
    node = Node()
    server = IoBridgeServer(host, port)
    loop = asyncio.new_event_loop()

    def push_command(text: str) -> None:
        """Forward a client command string into the Dora dataflow."""
        node.send_output(
            input_topic,
            pa.array([text]),
            metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()},
        )

    async def command_handler(text: str) -> None:
        # BUG FIX: IoBridgeServer awaits this handler's return value, so it
        # must be a coroutine function. The previous sync version returned
        # None, and `await None` raised TypeError on every command.
        push_command(text)

    async def handle_scene_refresh():
        # No scene cache here; scene updates are broadcast separately when
        # the voice node publishes them on `scene_update`.
        return []

    server.command_handler = command_handler
    server.scene_refresh_handler = handle_scene_refresh

    def run_server():
        # The server owns its own event loop in a daemon thread so the Dora
        # event iteration below can block freely.
        asyncio.set_event_loop(loop)
        loop.run_until_complete(server.start())

    threading.Thread(target=run_server, daemon=True).start()
    for event in node:
        if event["type"] != "INPUT":
            continue
        if event["id"] == response_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                message = {
                    "type": "response",
                    "text": payload.get("text", ""),
                    "status": payload.get("status", "ok"),
                }
            except Exception:
                # Non-JSON payloads are forwarded verbatim as the text.
                message = {"type": "response", "text": raw, "status": "ok"}
            # Schedule the broadcast on the server's loop from this thread.
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
            continue
        if event["id"] == scene_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                objects = payload.get("objects", [])
                message = {"type": "scene_updated", "objects": objects}
            except Exception:
                message = {"type": "scene_updated", "objects": []}
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)

View File

@@ -0,0 +1,17 @@
[project]
name = "dora-iobridge"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node bridging WebSocket IO to Dora topics"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"pyarrow >= 12.0.0",
"websockets >= 12.0",
]
[project.scripts]
dora-iobridge = "dora_iobridge.main:main"

View File

@@ -781,6 +781,12 @@ def _status_snapshot(helper: ULite6Helper) -> Dict[str, Any]:
}
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[ulite6 {timestamp}] {msg}", flush=True)
def main() -> None:
node = Node()
@@ -791,7 +797,42 @@ def main() -> None:
api_port = int(os.getenv("API_PORT", "8080"))
vacuum_enabled = os.getenv("VACUUM_ENABLED", "false").lower() in ("true", "1", "yes")
# Initial position settings
init_mode = os.getenv("INIT_MODE", "none").lower() # "home", "pose", or "none"
init_x = float(os.getenv("INIT_X", "300.0"))
init_y = float(os.getenv("INIT_Y", "0.0"))
init_z = float(os.getenv("INIT_Z", "250.0"))
init_roll = float(os.getenv("INIT_ROLL", "180.0"))
init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
init_yaw = float(os.getenv("INIT_YAW", "0.0"))
init_speed = float(os.getenv("INIT_SPEED", "50.0"))
_log(f"Connecting to robot at {robot_ip}...")
helper = ULite6Helper(robot_ip)
_log("Robot connected")
# Move to initial position on startup
if init_mode == "home":
_log("Moving to home position...")
code = helper.go_home()
if code == 0:
_log("Home position reached")
else:
_log(f"Home failed with code {code}")
elif init_mode == "pose":
_log(f"Moving to initial pose: [{init_x}, {init_y}, {init_z}] roll={init_roll} pitch={init_pitch} yaw={init_yaw}")
code = helper.move_to_pose(
init_x, init_y, init_z,
init_roll, init_pitch, init_yaw,
speed=init_speed,
units="mm",
)
if code == 0:
_log("Initial pose reached")
else:
_log(f"Move to initial pose failed with code {code}")
else:
_log("Skipping initial position (INIT_MODE=none)")
# Create and start FastAPI server in background thread
app = create_api(helper, default_speed, default_units, vacuum_enabled)
@@ -895,6 +936,28 @@ def main() -> None:
code=code,
status=_status_snapshot(helper),
)
elif action in ("vacuum_on", "vacuum_off"):
if not vacuum_enabled:
_send_command_status(
node,
command_id=command_id,
action=action,
ok=False,
message="Vacuum gripper not enabled",
status=_status_snapshot(helper),
)
continue
vacuum_on = action == "vacuum_on"
code = helper.set_vacuum_gripper(vacuum_on)
_send_command_status(
node,
command_id=command_id,
action=action,
ok=code == 0,
message="Vacuum command executed",
code=code,
status=_status_snapshot(helper),
)
else:
_send_command_status(
node,

View File

@@ -0,0 +1,211 @@
# Dora Voice Control Node
A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface.
## Features
- Spanish voice command parsing (rule-based or Gemini LLM)
- Real-time web debug interface
- Command queue management
- Workspace bounds validation
- Object detection integration
## File Structure
```
dora_voice_control/
├── __init__.py
├── main.py # Main Dora node entry point
├── api.py # FastAPI web server
├── config.py # Configuration management
├── models.py # Pydantic request/response models
├── parser.py # Voice command parsing logic
├── state.py # Shared state management
└── templates.py # HTML template for web interface
```
## Web Debug Interface
Access the debug interface at `http://localhost:8080` (default).
Features:
- Real-time status monitoring (pose, objects, queue)
- Send manual voice commands
- Quick command buttons
- View parse results
- Command history
- Clear queue
## Inputs/Outputs
| Input | Type | Description |
|---------------|--------|------------------------------------------|
| `voice_in` | string | Text transcription of voice command |
| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] |
| `objects` | JSON | Detected objects from vision system |
| `status` | JSON | Command execution status from robot |
| Output | Type | Description |
|---------------|--------|------------------------------------------|
| `robot_cmd` | JSON | Robot command with action and payload |
| `voice_out` | JSON | Response confirmation to user |
| `scene_update`| JSON | Updated scene with all visible objects |
## Supported Commands (Spanish)
| Command | Action | Example |
|---------------|----------------|--------------------------------|
| `subir` | Move up | "sube" |
| `bajar` | Move down | "baja" |
| `tomar` | Grab object | "agarra el cubo rojo" |
| `soltar` | Release object | "suelta en la caja azul" |
| `ir` | Go to object | "ve al cilindro" |
| `reiniciar` | Reset | "reinicia" |
## Environment Variables
```bash
# Web API Server
API_ENABLED=true # Enable/disable web interface
API_HOST=0.0.0.0 # Bind address
API_PORT=8080 # Listen port
# TCP Parameters
TCP_OFFSET_MM=63.0 # Z-offset to object surface
APPROACH_OFFSET_MM=50.0 # Safe approach distance above object
STEP_MM=20.0 # Distance for up/down increments
# LLM Configuration (optional)
LLM_PROVIDER=rules # "rules" or "gemini"
GOOGLE_API_KEY=your_key # Required if using gemini
GEMINI_MODEL=gemini-2.0-flash
# Workspace Safety (optional)
WORKSPACE_MIN_X=-300
WORKSPACE_MAX_X=300
WORKSPACE_MIN_Y=-300
WORKSPACE_MAX_Y=300
WORKSPACE_MIN_Z=0
WORKSPACE_MAX_Z=500
# Misc
DRY_RUN=false # Skip sending robot commands
```
## Installation
```bash
cd dora_voice_control
pip install -e .
# With LLM support
pip install -e ".[llm]"
```
## Testing
### Web Interface
```bash
# Start the node (standalone for testing)
python -m dora_voice_control.main
# Open in browser
open http://localhost:8080
```
### API Endpoints
```bash
# Get status
curl http://localhost:8080/api/status
# Get objects
curl http://localhost:8080/api/objects
# Get queue
curl http://localhost:8080/api/queue
# Send command
curl -X POST http://localhost:8080/api/command \
-H "Content-Type: application/json" \
-d '{"text": "sube"}'
# Clear queue
curl -X POST http://localhost:8080/api/queue/clear
```
### Python Test
```python
from dora_voice_control.parser import rule_parse, normalize
# Test command parsing
text = "agarra el cubo rojo grande"
result = rule_parse(normalize(text))
print(result)
# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'}
```
## Dora Dataflow Configuration
```yaml
nodes:
- id: voice_control
build: pip install -e ./dora_voice_control
path: dora_voice_control
inputs:
voice_in: iobridge/voice_in
tcp_pose: robot/tcp_pose
objects: detector/objects
status: robot/status
outputs:
- robot_cmd
- voice_out
- scene_update
env:
API_ENABLED: "true"
API_PORT: "8080"
DRY_RUN: "false"
```
## Message Examples
### Input: voice_in
```
"sube"
"agarra el cubo rojo"
"suelta en la caja azul"
```
### Output: robot_cmd
```json
{
"id": "550e8400-e29b-41d4-a716-446655440000",
"action": "move_to_pose",
"payload": {
"x": 150.0,
"y": 200.0,
"z": 280.0,
"roll": 180.0,
"pitch": 0.0,
"yaw": 0.0
}
}
```
### Output: voice_out
```json
{"text": "Ok, voy a subir", "status": "ok"}
{"text": "No entendi el comando", "status": "error"}
```
## Dependencies
- dora-rs >= 0.3.9
- numpy < 2.0.0
- pyarrow >= 12.0.0
- fastapi >= 0.109.0
- uvicorn >= 0.27.0
- pydantic >= 2.0.0
- google-genai (optional, for Gemini mode)

View File

@@ -0,0 +1 @@
"""Dora voice control node package."""

View File

@@ -0,0 +1,162 @@
"""FastAPI application for the voice control web interface."""
from __future__ import annotations
import os
import sys
import threading
from typing import Any
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, Response
# Handle both package and direct script execution
# __package__ is None when run as script, '' when imported from a script
if not __package__:
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
if _pkg_dir not in sys.path:
sys.path.insert(0, _pkg_dir)
from models import CommandRequest, CommandResponse
from state import SharedState
from templates import HTML_TEMPLATE
else:
from .models import CommandRequest, CommandResponse
from .state import SharedState
from .templates import HTML_TEMPLATE
def create_api(state: SharedState) -> FastAPI:
    """Create FastAPI application with voice control endpoints.

    All endpoints read through the thread-safe ``SharedState`` accessors;
    handler failures are surfaced as HTTP 500 so the debug UI keeps working.
    """
    app = FastAPI(
        title="Voice Control Debug API",
        description="Debug interface for the voice control node",
        version="0.1.0",
    )
    @app.get("/", response_class=HTMLResponse)
    def index() -> str:
        """Serve the web interface (single self-contained HTML page)."""
        return HTML_TEMPLATE
    @app.get("/api/status")
    def get_status() -> dict:
        """Get current status (pose, queue size, last voice input, ...)."""
        try:
            return state.get_status()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/objects")
    def get_objects() -> dict:
        """Get detected and static objects."""
        try:
            return state.get_objects()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/queue")
    def get_queue() -> list:
        """Get the command queue."""
        try:
            return state.get_queue()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.post("/api/queue/clear")
    def clear_queue() -> dict:
        """Clear the command queue."""
        try:
            # NOTE(review): reaches into SharedState internals (_lock /
            # voice_state); a SharedState.clear_queue() accessor would keep
            # the locking discipline in one place.
            with state._lock:
                state.voice_state.queue.clear()
            return {"ok": True}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/history")
    def get_history() -> list:
        """Get command history (most recent entries)."""
        try:
            return state.get_history()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/errors")
    def get_errors() -> list:
        """Get error log."""
        try:
            return state.get_errors()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.post("/api/command", response_model=CommandResponse)
    def send_command(request: CommandRequest) -> CommandResponse:
        """Send a voice command through the registered command callback."""
        try:
            callback = state.get_command_callback()
            if callback is None:
                return CommandResponse(ok=False, text="No command handler available", status="error")
            result = callback(request.text)
            return CommandResponse(
                ok=result.get("status") == "ok",
                text=result.get("text", ""),
                status=result.get("status", "error"),
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/image")
    def get_image() -> Response:
        """Get the latest camera image as JPEG."""
        try:
            image_data = state.get_image()
            if image_data is None:
                # No frame yet: respond 204 No Content with an empty body.
                return Response(
                    content=b"",
                    media_type="image/jpeg",
                    status_code=204,
                )
            return Response(
                content=image_data,
                media_type="image/jpeg",
                headers={"Cache-Control": "no-cache, no-store, must-revalidate"},
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/image/info")
    def get_image_info() -> dict:
        """Get image metadata (presence and age in milliseconds)."""
        try:
            return {
                "has_image": state.get_image() is not None,
                "age_ms": state.get_image_age_ms(),
            }
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    return app
def run_uvicorn(app: FastAPI, host: str, port: int) -> None:
    """Blocking uvicorn runner; intended as a background-thread target."""
    settings = uvicorn.Config(app, host=host, port=port, log_level="warning")
    uvicorn.Server(settings).run()
def start_api_server(state: SharedState, config: Any) -> threading.Thread:
    """Launch the web API in a daemon thread and return that thread."""
    import time as _time

    api_thread = threading.Thread(
        target=run_uvicorn,
        args=(create_api(state), config.host, config.port),
        daemon=True,
    )
    api_thread.start()
    timestamp = _time.strftime("%H:%M:%S")
    print(f"[voice_control {timestamp}] Web interface at http://{config.host}:{config.port}", flush=True)
    return api_thread

View File

@@ -0,0 +1,95 @@
"""Configuration for the voice control node."""
from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
@dataclass
class VoiceConfig:
    """Configuration for voice control."""

    # NOTE(review): host/port are filled with ""/0 by load_voice_config and
    # appear unused by the voice logic — confirm before relying on them.
    host: str
    port: int
    # Vertical tool-center-point offset added above an object's Z (mm).
    tcp_offset_mm: float
    # Extra clearance above the grasp pose for the approach move (mm).
    approach_offset_mm: float
    # Z increment used by the "subir"/"bajar" (up/down) commands (mm).
    step_mm: float
    # Default tool orientation used for every queued pose (180/0/0 defaults).
    default_roll: float
    default_pitch: float
    default_yaw: float
    # When True, commands are logged but never sent to the robot.
    dry_run: bool
    # Optional per-axis workspace limits (mm); None disables that bound.
    workspace_min: Tuple[Optional[float], Optional[float], Optional[float]]
    workspace_max: Tuple[Optional[float], Optional[float], Optional[float]]
    # Maps parsed Spanish tokens to detector class names (CLASS_MAP JSON).
    class_map: Dict[str, str]
@dataclass
class ApiConfig:
    """Configuration for the web API server."""

    # Bind address for the debug web server (API_HOST, default "0.0.0.0").
    host: str
    # TCP port for the debug web server (API_PORT, default 8080).
    port: int
    # Whether to start the server at all (API_ENABLED).
    enabled: bool
def _parse_float_env(name: str) -> Optional[float]:
"""Parse an optional float from environment variable."""
raw = os.getenv(name)
if raw is None or raw == "":
return None
try:
return float(raw)
except ValueError:
return None
def _parse_class_map(raw: str) -> Dict[str, str]:
"""Parse JSON class mapping from string."""
import json
if not raw:
return {}
try:
data = json.loads(raw)
if isinstance(data, dict):
return {str(k): str(v) for k, v in data.items()}
except Exception:
pass
return {}
def load_voice_config() -> VoiceConfig:
    """Load voice configuration from environment variables."""
    env = os.getenv
    truthy = ("true", "1", "yes")
    min_bounds = (
        _parse_float_env("WORKSPACE_MIN_X"),
        _parse_float_env("WORKSPACE_MIN_Y"),
        _parse_float_env("WORKSPACE_MIN_Z"),
    )
    max_bounds = (
        _parse_float_env("WORKSPACE_MAX_X"),
        _parse_float_env("WORKSPACE_MAX_Y"),
        _parse_float_env("WORKSPACE_MAX_Z"),
    )
    return VoiceConfig(
        host="",
        port=0,
        tcp_offset_mm=float(env("TCP_OFFSET_MM", "63.0")),
        approach_offset_mm=float(env("APPROACH_OFFSET_MM", "50.0")),
        step_mm=float(env("STEP_MM", "20.0")),
        default_roll=float(env("DEFAULT_ROLL", "180.0")),
        default_pitch=float(env("DEFAULT_PITCH", "0.0")),
        default_yaw=float(env("DEFAULT_YAW", "0.0")),
        dry_run=env("DRY_RUN", "false").lower() in truthy,
        workspace_min=min_bounds,
        workspace_max=max_bounds,
        class_map=_parse_class_map(env("CLASS_MAP", "")),
    )
def load_api_config() -> ApiConfig:
    """Load API server configuration from environment variables."""
    truthy = ("true", "1", "yes")
    return ApiConfig(
        host=os.getenv("API_HOST", "0.0.0.0"),
        port=int(os.getenv("API_PORT", "8080")),
        enabled=os.getenv("API_ENABLED", "true").lower() in truthy,
    )

View File

@@ -0,0 +1,501 @@
"""Dora node for voice control with safe robot commands."""
from __future__ import annotations
import json
import os
import sys
import time
import uuid
from collections import deque
from typing import Any, Deque, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib
# Handle both package and direct script execution
# __package__ is None when run as script, '' when imported from a script
_RUNNING_AS_SCRIPT = not __package__
if _RUNNING_AS_SCRIPT:
# Running as script - use absolute imports
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
if _pkg_dir not in sys.path:
sys.path.insert(0, _pkg_dir)
from config import VoiceConfig, load_api_config, load_voice_config
from parser import normalize, parse_command
from state import RobotStep, SharedState
from api import start_api_server
else:
# Running as package - use relative imports
from .config import VoiceConfig, load_api_config, load_voice_config
from .parser import normalize, parse_command
from .state import RobotStep, SharedState
from .api import start_api_server
def _within_bounds(
point_mm: np.ndarray,
min_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
max_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
) -> bool:
"""Check if point is within workspace bounds."""
x, y, z = point_mm.tolist()
min_x, min_y, min_z = min_xyz
max_x, max_y, max_z = max_xyz
if min_x is not None and x < min_x:
return False
if max_x is not None and x > max_x:
return False
if min_y is not None and y < min_y:
return False
if max_y is not None and y > max_y:
return False
if min_z is not None and z < min_z:
return False
if max_z is not None and z > max_z:
return False
return True
def _translate_target(token: str, mapping: Dict[str, str]) -> str:
"""Translate object name using class map."""
if token in mapping:
return mapping[token]
return token
def _load_config_file(path: str) -> Dict[str, Any]:
"""Load TOML configuration file."""
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception:
return {}
def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]:
    """Build static bucket "objects" from [bucket_positions] in the config.

    Each configured bucket becomes a big box of its color at the config's
    ``normal_height`` Z; entries with missing/short positions are skipped.
    """
    cfg = _load_config_file(config_path)
    buckets = cfg.get("bucket_positions", {})
    obj_cfg = cfg.get("object_parameters", {})
    base_z = float(obj_cfg.get("normal_height", 220.0))
    bucket_keys = (
        ("blue_bucket_pos", "blue"),
        ("red_bucket_pos", "red"),
        ("yellow_bucket_pos", "yellow"),
        ("white_bucket_pos", "white"),
    )
    objects: List[Dict[str, Any]] = []
    for key, color in bucket_keys:
        pos = buckets.get(key)
        if isinstance(pos, list) and len(pos) >= 2:
            objects.append(
                {
                    "object_type": "box",
                    "color": color,
                    "size": "big",
                    "position_mm": [float(pos[0]), float(pos[1]), base_z],
                    "source": "config",
                }
            )
    return objects
def _send_dora_command(
    node: Node, output_name: str, action: str, payload: Dict[str, Any]
) -> str:
    """Publish one robot command as JSON on *output_name*; return its UUID."""
    command_id = str(uuid.uuid4())
    body = json.dumps({"id": command_id, "action": action, "payload": payload})
    node.send_output(
        output_name,
        pa.array([body]),
        metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
    )
    return command_id
def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
"""Parse status payload from robot."""
if len(value) == 0:
return None
raw = value[0].as_py()
if not raw:
return None
try:
return json.loads(raw)
except Exception:
return None
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[voice_control {timestamp}] {msg}", flush=True)
def main() -> None:
    """Main entry point for the voice control node.

    Event-loop responsibilities:
    - receive voice transcripts, robot pose/status, detections and camera
      frames from the Dora dataflow;
    - translate parsed intents into queued ``RobotStep`` commands;
    - dispatch queued steps one at a time, waiting for a matching status
      acknowledgement before sending the next.
    """
    _log("Starting voice control node...")
    # Load configuration
    cfg = load_voice_config()
    api_cfg = load_api_config()
    # Environment variables for I/O topics
    objects_input = os.getenv("OBJECTS_INPUT", "objects")
    voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in")
    voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out")
    scene_output = os.getenv("SCENE_OUTPUT", "scene_update")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    status_input = os.getenv("STATUS_INPUT", "status")
    command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd")
    image_input = os.getenv("IMAGE_INPUT", "image_annotated")
    llm_provider = os.getenv("LLM_PROVIDER", "rules").lower()
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    # Image dimensions used to reshape incoming raw frames.
    # NOTE(review): these are fixed from the environment, not "detected from
    # the first frame" as previously commented — a mismatch with the camera
    # output will make the reshape below fail.
    image_width = int(os.getenv("IMAGE_WIDTH", "1280"))
    image_height = int(os.getenv("IMAGE_HEIGHT", "720"))
    # Initial/home position for reset command
    init_x = float(os.getenv("INIT_X", "300.0"))
    init_y = float(os.getenv("INIT_Y", "0.0"))
    init_z = float(os.getenv("INIT_Z", "250.0"))
    init_roll = float(os.getenv("INIT_ROLL", "180.0"))
    init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
    init_yaw = float(os.getenv("INIT_YAW", "0.0"))
    _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm")
    _log(f"Initial position: [{init_x}, {init_y}, {init_z}]")
    _log(f"LLM provider: {llm_provider}")
    _log(f"Dry run: {cfg.dry_run}")
    # Initialize shared state
    shared_state = SharedState()
    state = shared_state.voice_state
    state.static_objects = _load_bucket_objects(config_file)
    pending_intents: Deque[Dict[str, Any]] = deque()
    _log(f"Loaded {len(state.static_objects)} static objects from config")
    # Queue initial position movement on startup (same as reiniciar)
    init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes")
    send_init_scene_reset = init_on_start  # Flag to send scene reset after node starts
    if init_on_start:
        _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
        # Clear detected objects
        state.latest_objects = []
        state.latest_objects_at = None
        # Queue vacuum off and move to home
        state.queue.append(RobotStep(action="vacuum_off", payload={}))
        state.queue.append(
            RobotStep(
                action="move_to_pose",
                payload={
                    "x": init_x,
                    "y": init_y,
                    "z": init_z,
                    "roll": init_roll,
                    "pitch": init_pitch,
                    "yaw": init_yaw,
                },
            )
        )
    def command_handler(transcript: str) -> Dict[str, str]:
        """Handle voice command and return response.

        Parses the transcript, records it for the debug UI, and appends the
        resulting intent to ``pending_intents`` for the event loop to act on.
        Called from the Dora loop AND from the web API thread.
        """
        _log(f"Voice input received: \"{transcript}\"")
        llm_result = parse_command(transcript, llm_provider)
        _log(f"Parse result: {llm_result}")
        # Update debug state
        shared_state.update_voice_input(transcript, llm_result, time.monotonic())
        if llm_result.get("resultado") != "ok":
            _log("Command not understood")
            return {"text": "No entendi el comando", "status": "error"}
        action = llm_result.get("accion", "error")
        obj = llm_result.get("objeto", "no especificado")
        color = llm_result.get("color", "no especificado")
        size = llm_result.get("tamano", "no especificado")
        _log(f"Intent: action={action}, object={obj}, color={color}, size={size}")
        pending_intents.append(
            {"action": action, "obj": obj, "color": color, "size": size}
        )
        # Add to history
        shared_state.add_to_history({
            "timestamp": time.time(),
            "input": transcript,
            "action": action,
            "object": obj,
            "color": color,
            "size": size,
        })
        return {"text": f"Ok, voy a {action}", "status": "ok"}
    # Set command callback for web interface
    shared_state.set_command_callback(command_handler)
    # Start web API server if enabled
    if api_cfg.enabled:
        start_api_server(shared_state, api_cfg)
    # Create Dora node
    node = Node()
    _log("Dora node created, waiting for events...")
    first_event = True
    for event in node:
        # Send scene reset on first event (startup)
        if first_event and send_init_scene_reset:
            first_event = False
            scene_payload = json.dumps(
                {"objects": list(state.static_objects), "reset": True}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            _log("Sent initial scene reset notification")
        if event["type"] != "INPUT":
            continue
        # Handle voice input
        if event["id"] == voice_in_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            response = command_handler(raw)
            node.send_output(
                voice_out_output,
                pa.array([json.dumps(response)]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            continue
        # Handle pose updates
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                state.latest_pose = tcp_pose[:6].tolist()
                state.latest_pose_at = time.monotonic()
            continue
        # Handle object detection updates
        if event["id"] == objects_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if raw:
                try:
                    payload = json.loads(raw)
                    objects = payload.get("objects", [])
                except Exception:
                    objects = []
                state.latest_objects = objects
                state.latest_objects_at = time.monotonic()
            continue
        # Handle camera image
        if event["id"] == image_input:
            try:
                # Get raw image data
                img_data = event["value"].to_numpy()
                # Reshape to image (assuming BGR format)
                img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8)
                # Encode to JPEG
                _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
                shared_state.update_image(jpeg_data.tobytes(), time.monotonic())
            except Exception as e:
                # Log error but don't crash
                # NOTE(review): 'e' is swallowed silently — consider logging
                # it so reshape/encoding failures become visible.
                pass
            continue
        # Handle robot status updates
        if event["id"] == status_input:
            payload = _parse_status_payload(event["value"])
            if payload and state.pending_command:
                # Only a status ack matching the in-flight command id
                # unblocks the queue dispatcher below.
                if payload.get("command_id") == state.pending_command.get("id"):
                    _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})")
                    state.pending_command = None
            continue
        # Process pending intents
        # Reached only for inputs not handled above (e.g. the timer tick),
        # since every handler above ends with `continue`.
        if pending_intents:
            intent = pending_intents.popleft()
            action = intent["action"]
            obj = intent["obj"]
            color = intent["color"]
            size = intent["size"]
            _log(f"Processing intent: {action} {obj} {color} {size}")
            latest_pose = state.latest_pose
            objects = list(state.latest_objects) + list(state.static_objects)
            _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static")
            if action in ("subir", "bajar") and latest_pose:
                delta = cfg.step_mm if action == "subir" else -cfg.step_mm
                target = np.array(latest_pose[:3], dtype=np.float64)
                target[2] += delta
                if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                    step = RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": float(target[0]),
                            "y": float(target[1]),
                            "z": float(target[2]),
                            "roll": cfg.default_roll,
                            "pitch": cfg.default_pitch,
                            "yaw": cfg.default_yaw,
                        },
                    )
                    state.queue.append(step)
                    _log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})")
                else:
                    _log(f"Target {target.tolist()} out of bounds, skipping")
            elif action in ("ir", "tomar", "soltar"):
                target_obj = None
                if obj != "no especificado":
                    target_name = _translate_target(obj, cfg.class_map)
                    target_color = _translate_target(color, cfg.class_map)
                    _log(f"Looking for: type={target_name}, color={target_color}")
                    # Log available objects for debugging
                    for o in objects:
                        _log(f"  -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}")
                    # First object matching type (and color/size when given) wins.
                    for o in objects:
                        if o.get("object_type") == target_name:
                            if color == "no especificado" or o.get("color") == target_color:
                                if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map):
                                    target_obj = o
                                    break
                if target_obj:
                    _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}")
                    pos = np.array(target_obj["position_mm"], dtype=np.float64)
                    # Approach hovers approach_offset_mm above the grasp pose.
                    approach = pos.copy()
                    approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm
                    target = pos.copy()
                    target[2] += cfg.tcp_offset_mm
                    if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(approach[0]),
                                    "y": float(approach[1]),
                                    "z": float(approach[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: approach pose at Z={approach[2]:.1f}mm")
                    if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(target[0]),
                                    "y": float(target[1]),
                                    "z": float(target[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: target pose at Z={target[2]:.1f}mm")
                    if action == "tomar":
                        state.queue.append(RobotStep(action="vacuum_on", payload={}))
                        _log("Queued: vacuum_on")
                    elif action == "soltar":
                        state.queue.append(RobotStep(action="vacuum_off", payload={}))
                        _log("Queued: vacuum_off")
                else:
                    _log(f"Target object not found: {obj} {color}")
                    # NOTE(review): this continue also skips the queue-size
                    # log and the dispatch step below for this cycle.
                    continue
            elif action == "reiniciar":
                _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
                # Turn off vacuum first
                state.queue.append(RobotStep(action="vacuum_off", payload={}))
                # Clear current detected objects (will be refreshed by detector)
                state.latest_objects = []
                state.latest_objects_at = None
                _log("Cleared detected objects - waiting for fresh detection")
                # Move to initial position
                state.queue.append(
                    RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": init_x,
                            "y": init_y,
                            "z": init_z,
                            "roll": init_roll,
                            "pitch": init_pitch,
                            "yaw": init_yaw,
                        },
                    )
                )
                _log(f"Queued: vacuum_off + move to home")
                # Send scene update to notify clients that scene was reset
                scene_payload = json.dumps(
                    {"objects": list(state.static_objects), "reset": True}
                )
                node.send_output(
                    scene_output,
                    pa.array([scene_payload]),
                    metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
                )
                _log("Sent scene reset notification")
            _log(f"Queue size: {len(state.queue)}")
        # Emit scene updates when objects change
        # NOTE(review): unreachable — the objects_input handler above ends
        # with `continue`, so this branch never runs for detection events.
        if event["id"] == objects_input:
            scene_payload = json.dumps(
                {"objects": list(state.latest_objects) + list(state.static_objects)}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
        # Send queued robot steps one at a time
        if state.pending_command is None and state.queue:
            step = state.queue.popleft()
            if cfg.dry_run:
                _log(f"[DRY RUN] Would send: {step.action} {step.payload}")
                state.pending_command = None
                continue
            cmd_id = _send_dora_command(node, command_output, step.action, step.payload)
            state.pending_command = {"id": cmd_id, "action": step.action}
            _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}")
            # Update debug state
            shared_state.update_robot_command(
                {"id": cmd_id, "action": step.action, "payload": step.payload},
                time.monotonic(),
            )
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,38 @@
"""Pydantic models for the voice control API."""
from __future__ import annotations
from typing import Optional
from pydantic import BaseModel
class CommandRequest(BaseModel):
    """Request to send a voice command."""

    # Raw transcript text, e.g. "agarra el cubo rojo".
    text: str
class CommandResponse(BaseModel):
    """Response from a voice command."""

    # True when the command was understood and queued (status == "ok").
    ok: bool
    # Human-readable feedback for the user.
    text: str
    # "ok" or "error", mirroring the parser result.
    status: str
class MoveRequest(BaseModel):
    """Request to move to a position."""

    # Target position — presumably millimetres, matching the robot command
    # payloads elsewhere in this package; confirm against the robot node.
    x: float
    y: float
    z: float
    # Tool orientation; defaults match the package-wide 180/0/0 pose.
    roll: Optional[float] = 180.0
    pitch: Optional[float] = 0.0
    yaw: Optional[float] = 0.0
class VacuumRequest(BaseModel):
    """Request to control the vacuum."""

    # True to switch the vacuum on, False to switch it off.
    on: bool

View File

@@ -0,0 +1,118 @@
"""Voice command parsing logic."""
from __future__ import annotations
import json
import os
import unicodedata
from typing import Dict
def normalize(text: str) -> str:
    """Lowercase, trim, and strip accents (NFKD + drop combining marks)."""
    lowered = unicodedata.normalize("NFKD", text.lower().strip())
    return "".join(c for c in lowered if not unicodedata.combining(c))
def rule_parse(transcript: str) -> Dict[str, str]:
    """Parse a (Spanish) voice command with keyword substring matching.

    Returns {"resultado": "error"} when no action keyword is present,
    otherwise the ok-dict with accion/objeto/color/tamano fields, each
    defaulting to "no especificado".
    """
    text = normalize(transcript)

    # Ordered so that e.g. "reiniciar" wins before its substring "ir".
    action_table = (
        ("reiniciar", ("reiniciar", "reinicia", "reset")),
        ("subir", ("sube", "subir", "arriba")),
        ("bajar", ("baja", "bajar", "abajo")),
        ("soltar", ("soltar", "deja", "dejar")),
        ("tomar", ("tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar")),
        ("ir", ("ir", "ve", "mover", "muevete", "acercar")),
    )
    action = "error"
    for label, keywords in action_table:
        if any(word in text for word in keywords):
            action = label
            break
    if action == "error":
        return {"resultado": "error"}

    def first_match(table, default="no especificado"):
        # Return the first label whose keywords appear in the transcript.
        for label, keywords in table:
            if any(word in text for word in keywords):
                return label
        return default

    color = first_match((
        ("rojo", ("rojo",)),
        ("azul", ("azul",)),
        ("amarillo", ("amarillo",)),
        ("blanco", ("blanco",)),
    ))
    obj = first_match((
        ("estrella", ("estrella",)),
        ("cilindro", ("cilindro",)),
        ("cubo", ("cubo",)),
        ("caja", ("caja",)),
    ))
    size = first_match((
        ("grande", ("grande",)),
        ("pequeno", ("pequeno", "pequeño", "chico")),
    ))
    return {
        "resultado": "ok",
        "accion": action,
        "objeto": obj,
        "color": color,
        "tamano": size,
    }
def build_gemini_prompt(transcript: str) -> str:
    """Build prompt for Gemini LLM parsing.

    The prompt is deliberately in Spanish: it asks the model to map a
    child's spoken command onto the fixed action/object/color/size
    vocabulary and reply as JSON with keys 'resultado', 'accion', 'objeto',
    'color' and 'tamano' — the same shape rule_parse produces.
    """
    return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar
un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color
o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase,
responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones
posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo',
'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella,
cilindro, cubo y caja; cualquier otro objeto es error.
Comando: "{transcript}"
Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de
los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos
'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto.
"""
def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]:
    """Parse a voice command with the selected provider.

    Any provider other than "gemini" uses the rule-based parser. The
    Gemini path degrades to rules when the SDK is missing, the API key is
    unset, or the request fails; an unparseable model reply is an error.
    """
    if llm_provider != "gemini":
        return rule_parse(transcript)
    try:
        from google import genai
        from google.genai import types
    except Exception:
        # google-genai is an optional dependency.
        return rule_parse(transcript)
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        return rule_parse(transcript)
    try:
        client = genai.Client(api_key=api_key)
        prompt = build_gemini_prompt(transcript)
        reply = client.models.generate_content(
            model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"),
            contents=prompt,
            config=types.GenerateContentConfig(temperature=0.5),
        )
        # The model may wrap its JSON in a fenced code block; strip fences.
        raw = str(reply.text).replace("```json", "").replace("```", "")
        return json.loads(raw)
    except json.JSONDecodeError:
        return {"resultado": "error"}
    except Exception:
        return rule_parse(transcript)

View File

@@ -0,0 +1,158 @@
"""Shared state management for voice control node."""
from __future__ import annotations
import threading
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Deque, Dict, List, Optional
@dataclass
class RobotStep:
    """A single step in the robot command queue."""

    # Command verb for the robot node, e.g. "move_to_pose", "vacuum_on".
    action: str
    # JSON-serializable arguments for the action (pose fields, etc.).
    payload: Dict[str, Any]
@dataclass
class VoiceState:
    """Runtime state for voice control."""

    # Last robot TCP pose [x, y, z, roll, pitch, yaw] and its arrival time
    # (time.monotonic() seconds).
    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None
    # Most recent detector output and its arrival time.
    latest_objects: List[Dict[str, Any]] = field(default_factory=list)
    latest_objects_at: Optional[float] = None
    # Fixed bucket "objects" loaded from the TOML config at startup.
    static_objects: List[Dict[str, Any]] = field(default_factory=list)
    # Command awaiting a status ack ({"id", "action"}), if any.
    pending_command: Optional[Dict[str, Any]] = None
    # FIFO of RobotSteps still to be sent to the robot.
    queue: Deque[RobotStep] = field(default_factory=deque)
@dataclass
class DebugState:
    """Debug information for the web interface."""

    # Last transcript received, when it arrived, and how it parsed.
    last_voice_input: Optional[str] = None
    last_voice_input_at: Optional[float] = None
    last_parse_result: Optional[Dict[str, Any]] = None
    # Last command dispatched to the robot and when.
    last_robot_command: Optional[Dict[str, Any]] = None
    last_robot_command_at: Optional[float] = None
    # Rolling logs (capped by SharedState's mutators).
    command_history: List[Dict[str, Any]] = field(default_factory=list)
    error_log: List[Dict[str, Any]] = field(default_factory=list)
    # Latest camera frame as encoded JPEG bytes, plus its timestamp.
    latest_image: Optional[bytes] = None
    latest_image_at: Optional[float] = None
class SharedState:
    """Thread-safe shared state container.

    A single lock guards ``voice_state``, ``debug_state`` and the command
    callback; all accessors return copies or immutable data.
    NOTE(review): the main event loop also mutates ``voice_state`` directly
    without taking this lock, so web-API readers may observe partially
    updated state — confirm this is acceptable for a debug-only interface.
    """
    def __init__(self) -> None:
        self._lock = threading.Lock()
        self.voice_state = VoiceState()
        self.debug_state = DebugState()
        self._command_callback: Optional[Any] = None
    def set_command_callback(self, callback: Any) -> None:
        """Set callback for sending commands from web interface."""
        with self._lock:
            self._command_callback = callback
    def get_command_callback(self) -> Optional[Any]:
        """Get the command callback."""
        with self._lock:
            return self._command_callback
    def get_status(self) -> Dict[str, Any]:
        """Get current status for web interface."""
        with self._lock:
            vs = self.voice_state
            ds = self.debug_state
            return {
                "has_pose": vs.latest_pose is not None,
                "pose": vs.latest_pose,
                "pose_age_ms": _age_ms(vs.latest_pose_at),
                "object_count": len(vs.latest_objects),
                "static_object_count": len(vs.static_objects),
                "queue_size": len(vs.queue),
                "has_pending_command": vs.pending_command is not None,
                "pending_command": vs.pending_command,
                "last_voice_input": ds.last_voice_input,
                "last_voice_input_age_ms": _age_ms(ds.last_voice_input_at),
                "last_parse_result": ds.last_parse_result,
            }
    def get_objects(self) -> Dict[str, Any]:
        """Get detected and static objects (shallow copies)."""
        with self._lock:
            return {
                "detected": list(self.voice_state.latest_objects),
                "static": list(self.voice_state.static_objects),
            }
    def get_queue(self) -> List[Dict[str, Any]]:
        """Get the command queue as plain dicts."""
        with self._lock:
            return [{"action": s.action, "payload": s.payload} for s in self.voice_state.queue]
    def get_history(self) -> List[Dict[str, Any]]:
        """Get command history (only the 50 most recent entries)."""
        with self._lock:
            return list(self.debug_state.command_history[-50:])
    def get_errors(self) -> List[Dict[str, Any]]:
        """Get error log (only the 50 most recent entries)."""
        with self._lock:
            return list(self.debug_state.error_log[-50:])
    def add_to_history(self, entry: Dict[str, Any]) -> None:
        """Add entry to command history, capped at 100 entries."""
        with self._lock:
            self.debug_state.command_history.append(entry)
            if len(self.debug_state.command_history) > 100:
                self.debug_state.command_history = self.debug_state.command_history[-100:]
    def add_error(self, error: Dict[str, Any]) -> None:
        """Add entry to error log, capped at 100 entries."""
        with self._lock:
            self.debug_state.error_log.append(error)
            if len(self.debug_state.error_log) > 100:
                self.debug_state.error_log = self.debug_state.error_log[-100:]
    def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
        """Update last voice input info (timestamp is time.monotonic())."""
        with self._lock:
            self.debug_state.last_voice_input = text
            self.debug_state.last_voice_input_at = timestamp
            self.debug_state.last_parse_result = parse_result
    def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
        """Update last robot command info."""
        with self._lock:
            self.debug_state.last_robot_command = command
            self.debug_state.last_robot_command_at = timestamp
    def update_image(self, image_bytes: bytes, timestamp: float) -> None:
        """Update latest camera image (JPEG-encoded bytes)."""
        with self._lock:
            self.debug_state.latest_image = image_bytes
            self.debug_state.latest_image_at = timestamp
    def get_image(self) -> Optional[bytes]:
        """Get latest camera image, or None before the first frame."""
        with self._lock:
            return self.debug_state.latest_image
    def get_image_age_ms(self) -> Optional[int]:
        """Get age of latest image in milliseconds."""
        with self._lock:
            return _age_ms(self.debug_state.latest_image_at)
def _age_ms(timestamp: Optional[float]) -> Optional[int]:
"""Calculate age in milliseconds from monotonic timestamp."""
import time
if timestamp is None:
return None
return int((time.monotonic() - timestamp) * 1000)

View File

@@ -0,0 +1,700 @@
"""HTML templates for the voice control web interface."""
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Control Debug</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Segoe UI', system-ui, sans-serif;
background: #1a1a2e;
color: #eee;
min-height: 100vh;
padding: 20px;
}
.header {
text-align: center;
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px solid #333;
}
.header h1 { color: #00d4ff; font-size: 1.5em; }
.header .status {
margin-top: 8px;
font-size: 0.9em;
}
.status-dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 50%;
margin-right: 6px;
}
.status-dot.ok { background: #00ff88; }
.status-dot.warn { background: #ffaa00; }
.status-dot.error { background: #ff4444; }
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
gap: 15px;
max-width: 1400px;
margin: 0 auto;
}
.card {
background: #16213e;
border-radius: 8px;
padding: 15px;
border: 1px solid #0f3460;
}
.card h2 {
color: #00d4ff;
font-size: 1em;
margin-bottom: 12px;
padding-bottom: 8px;
border-bottom: 1px solid #0f3460;
}
/* Command Input */
.command-form {
display: flex;
gap: 10px;
margin-bottom: 15px;
}
.command-form input {
flex: 1;
padding: 10px 12px;
border: 1px solid #0f3460;
border-radius: 6px;
background: #1a1a2e;
color: #fff;
font-size: 14px;
}
.command-form input:focus {
outline: none;
border-color: #00d4ff;
}
.btn {
padding: 10px 20px;
border: none;
border-radius: 6px;
cursor: pointer;
font-weight: 500;
transition: all 0.2s;
}
.btn-primary {
background: #00d4ff;
color: #000;
}
.btn-primary:hover { background: #00b8e0; }
.btn-primary:disabled { background: #555; color: #888; cursor: not-allowed; }
.btn-secondary {
background: #333;
color: #fff;
}
.btn-secondary:hover { background: #444; }
.btn-danger {
background: #ff4444;
color: #fff;
}
.btn-danger:hover { background: #cc3333; }
.btn-success {
background: #00ff88;
color: #000;
}
.btn-success:hover { background: #00cc6a; }
/* Quick Commands */
.quick-commands {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.quick-btn {
padding: 8px 14px;
background: #0f3460;
border: 1px solid #1a4a7a;
border-radius: 20px;
color: #00d4ff;
cursor: pointer;
font-size: 13px;
transition: all 0.2s;
}
.quick-btn:hover {
background: #1a4a7a;
border-color: #00d4ff;
}
/* Status Grid */
.status-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 10px;
}
.status-item {
background: #1a1a2e;
padding: 10px;
border-radius: 6px;
}
.status-item .label {
font-size: 11px;
color: #888;
text-transform: uppercase;
margin-bottom: 4px;
}
.status-item .value {
font-size: 14px;
font-weight: 500;
}
.status-item .value.ok { color: #00ff88; }
.status-item .value.warn { color: #ffaa00; }
.status-item .value.error { color: #ff4444; }
/* Pose Display */
.pose-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 8px;
}
.pose-item {
background: #1a1a2e;
padding: 10px;
border-radius: 6px;
text-align: center;
}
.pose-item .label {
font-size: 11px;
color: #888;
margin-bottom: 4px;
}
.pose-item .value {
font-size: 16px;
font-weight: 600;
color: #00d4ff;
font-family: monospace;
}
/* Objects List */
.objects-list {
max-height: 300px;
overflow-y: auto;
}
.object-item {
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 10px;
background: #1a1a2e;
border-radius: 6px;
margin-bottom: 6px;
font-size: 13px;
}
.object-item .type { color: #00d4ff; font-weight: 500; }
.object-item .color-badge {
padding: 2px 8px;
border-radius: 10px;
font-size: 11px;
}
.color-badge.red { background: #ff4444; color: #fff; }
.color-badge.blue { background: #4488ff; color: #fff; }
.color-badge.yellow { background: #ffcc00; color: #000; }
.color-badge.white { background: #fff; color: #000; }
.object-item .pos {
font-family: monospace;
font-size: 11px;
color: #888;
}
/* Queue Display */
.queue-list {
max-height: 150px;
overflow-y: auto;
}
.queue-item {
display: flex;
align-items: center;
gap: 10px;
padding: 8px 10px;
background: #1a1a2e;
border-radius: 6px;
margin-bottom: 6px;
font-size: 13px;
}
.queue-item .index {
background: #0f3460;
color: #00d4ff;
padding: 2px 8px;
border-radius: 4px;
font-size: 11px;
}
.queue-item .action { color: #00ff88; font-weight: 500; }
.queue-item.pending { border-left: 3px solid #ffaa00; }
/* Log Display */
.log {
max-height: 250px;
overflow-y: auto;
font-family: monospace;
font-size: 12px;
}
.log-entry {
padding: 6px 10px;
border-bottom: 1px solid #0f3460;
}
.log-entry:last-child { border-bottom: none; }
.log-entry .time {
color: #666;
margin-right: 10px;
}
.log-entry.error { color: #ff4444; }
.log-entry.success { color: #00ff88; }
.log-entry.info { color: #00d4ff; }
/* Parse Result */
.parse-result {
background: #1a1a2e;
padding: 12px;
border-radius: 6px;
font-family: monospace;
font-size: 13px;
}
.parse-result .field {
display: flex;
justify-content: space-between;
padding: 4px 0;
border-bottom: 1px solid #0f3460;
}
.parse-result .field:last-child { border-bottom: none; }
.parse-result .key { color: #888; }
.parse-result .val { color: #00d4ff; }
/* Empty State */
.empty {
text-align: center;
color: #666;
padding: 20px;
font-style: italic;
}
/* Camera View */
.camera-container {
position: relative;
background: #000;
border-radius: 6px;
overflow: hidden;
min-height: 240px;
}
.camera-container img {
width: 100%;
height: auto;
display: block;
}
.camera-overlay {
position: absolute;
top: 10px;
right: 10px;
background: rgba(0,0,0,0.6);
padding: 4px 8px;
border-radius: 4px;
font-size: 11px;
}
.camera-overlay.ok { color: #00ff88; }
.camera-overlay.stale { color: #ffaa00; }
.camera-overlay.error { color: #ff4444; }
.no-image {
display: flex;
align-items: center;
justify-content: center;
height: 240px;
color: #666;
font-style: italic;
}
</style>
</head>
<body>
<div class="header">
<h1>Voice Control Debug Interface</h1>
<div class="status">
<span class="status-dot" id="status-dot"></span>
<span id="status-text">Connecting...</span>
</div>
</div>
<div class="grid">
<!-- Camera View -->
<div class="card" style="grid-column: span 2;">
<h2>Camera View <span id="camera-status" style="font-weight:normal;font-size:11px;color:#888;"></span></h2>
<div class="camera-container" id="camera-container">
<div class="no-image" id="no-image">No camera image available</div>
<img id="camera-img" style="display:none;" alt="Camera feed">
<div class="camera-overlay" id="camera-overlay"></div>
</div>
</div>
<!-- Command Input -->
<div class="card">
<h2>Send Command</h2>
<form class="command-form" id="command-form">
<input type="text" id="command-input" placeholder="Enter voice command (e.g., 'sube', 'agarra el cubo rojo')" autocomplete="off">
<button type="submit" class="btn btn-primary" id="btn-send">Send</button>
</form>
<div class="quick-commands">
<button class="quick-btn" onclick="sendQuick('sube')">Sube</button>
<button class="quick-btn" onclick="sendQuick('baja')">Baja</button>
<button class="quick-btn" onclick="sendQuick('agarra el cubo rojo')">Cubo Rojo</button>
<button class="quick-btn" onclick="sendQuick('agarra el cubo azul')">Cubo Azul</button>
<button class="quick-btn" onclick="sendQuick('suelta')">Soltar</button>
<button class="quick-btn" onclick="sendQuick('reinicia')">Reiniciar</button>
</div>
</div>
<!-- Status -->
<div class="card">
<h2>System Status</h2>
<div class="status-grid">
<div class="status-item">
<div class="label">Pose Available</div>
<div class="value" id="st-pose">--</div>
</div>
<div class="status-item">
<div class="label">Pose Age</div>
<div class="value" id="st-pose-age">--</div>
</div>
<div class="status-item">
<div class="label">Objects Detected</div>
<div class="value" id="st-objects">--</div>
</div>
<div class="status-item">
<div class="label">Static Objects</div>
<div class="value" id="st-static">--</div>
</div>
<div class="status-item">
<div class="label">Queue Size</div>
<div class="value" id="st-queue">--</div>
</div>
<div class="status-item">
<div class="label">Pending Command</div>
<div class="value" id="st-pending">--</div>
</div>
</div>
</div>
<!-- TCP Pose -->
<div class="card">
<h2>TCP Pose</h2>
<div class="pose-grid">
<div class="pose-item">
<div class="label">X (mm)</div>
<div class="value" id="pose-x">--</div>
</div>
<div class="pose-item">
<div class="label">Y (mm)</div>
<div class="value" id="pose-y">--</div>
</div>
<div class="pose-item">
<div class="label">Z (mm)</div>
<div class="value" id="pose-z">--</div>
</div>
<div class="pose-item">
<div class="label">Roll</div>
<div class="value" id="pose-roll">--</div>
</div>
<div class="pose-item">
<div class="label">Pitch</div>
<div class="value" id="pose-pitch">--</div>
</div>
<div class="pose-item">
<div class="label">Yaw</div>
<div class="value" id="pose-yaw">--</div>
</div>
</div>
</div>
<!-- Last Parse Result -->
<div class="card">
<h2>Last Parse Result</h2>
<div class="parse-result" id="parse-result">
<div class="empty">No command parsed yet</div>
</div>
<div style="margin-top: 10px; font-size: 12px; color: #888;">
<span>Last input: </span><span id="last-input">--</span>
</div>
</div>
<!-- Detected Objects -->
<div class="card">
<h2>Detected Objects</h2>
<div class="objects-list" id="objects-list">
<div class="empty">No objects detected</div>
</div>
</div>
<!-- Command Queue -->
<div class="card">
<h2>Command Queue</h2>
<div class="queue-list" id="queue-list">
<div class="empty">Queue is empty</div>
</div>
<div style="margin-top: 10px;">
<button class="btn btn-danger btn-sm" onclick="clearQueue()">Clear Queue</button>
</div>
</div>
<!-- Log -->
<div class="card" style="grid-column: span 2;">
<h2>Activity Log</h2>
<div class="log" id="log"></div>
</div>
</div>
<script>
const $ = id => document.getElementById(id);
async function fetchJson(url, opts = {}) {
try {
const res = await fetch(url, opts);
return await res.json();
} catch (e) {
return { error: e.message };
}
}
function log(msg, type = 'info') {
const logEl = $('log');
const time = new Date().toLocaleTimeString();
const entry = document.createElement('div');
entry.className = 'log-entry ' + type;
entry.innerHTML = '<span class="time">' + time + '</span>' + msg;
logEl.insertBefore(entry, logEl.firstChild);
if (logEl.children.length > 100) logEl.removeChild(logEl.lastChild);
}
async function updateStatus() {
const data = await fetchJson('/api/status');
if (data.error) {
$('status-dot').className = 'status-dot error';
$('status-text').textContent = 'Error: ' + data.error;
return;
}
$('status-dot').className = 'status-dot ok';
$('status-text').textContent = 'Connected';
$('st-pose').textContent = data.has_pose ? 'Yes' : 'No';
$('st-pose').className = 'value ' + (data.has_pose ? 'ok' : 'warn');
$('st-pose-age').textContent = data.pose_age_ms !== null ? data.pose_age_ms + 'ms' : '--';
$('st-pose-age').className = 'value ' + (data.pose_age_ms < 1000 ? 'ok' : 'warn');
$('st-objects').textContent = data.object_count;
$('st-static').textContent = data.static_object_count;
$('st-queue').textContent = data.queue_size;
$('st-queue').className = 'value ' + (data.queue_size > 0 ? 'warn' : 'ok');
$('st-pending').textContent = data.has_pending_command ? 'Yes' : 'No';
$('st-pending').className = 'value ' + (data.has_pending_command ? 'warn' : 'ok');
// Update pose
if (data.pose) {
$('pose-x').textContent = data.pose[0].toFixed(1);
$('pose-y').textContent = data.pose[1].toFixed(1);
$('pose-z').textContent = data.pose[2].toFixed(1);
$('pose-roll').textContent = data.pose[3].toFixed(1);
$('pose-pitch').textContent = data.pose[4].toFixed(1);
$('pose-yaw').textContent = data.pose[5].toFixed(1);
}
// Update last input
$('last-input').textContent = data.last_voice_input || '--';
// Update parse result
if (data.last_parse_result) {
let html = '';
for (const [k, v] of Object.entries(data.last_parse_result)) {
html += '<div class="field"><span class="key">' + k + '</span><span class="val">' + v + '</span></div>';
}
$('parse-result').innerHTML = html;
}
}
async function updateObjects() {
const data = await fetchJson('/api/objects');
if (data.error) return;
const list = $('objects-list');
const detected = data.detected || [];
const staticObjs = data.static || [];
if (detected.length === 0 && staticObjs.length === 0) {
list.innerHTML = '<div class="empty">No objects detected</div>';
return;
}
let html = '';
// Detected objects
if (detected.length > 0) {
html += '<div style="font-size:11px;color:#00d4ff;margin-bottom:6px;">Detected (' + detected.length + ')</div>';
html += detected.map(obj => {
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
const colorClass = obj.color || 'white';
const conf = obj.confidence ? (obj.confidence * 100).toFixed(0) + '%' : '';
const size = obj.size || '';
return '<div class="object-item">' +
'<span class="type">' + (obj.object_type || '?') + '</span>' +
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
'<span style="color:#888;font-size:10px;">' + size + '</span>' +
'<span style="color:#00ff88;font-size:10px;">' + conf + '</span>' +
'<span class="pos">[' + pos + ']</span>' +
'</div>';
}).join('');
}
// Static objects
if (staticObjs.length > 0) {
html += '<div style="font-size:11px;color:#888;margin:8px 0 6px 0;">Static (' + staticObjs.length + ')</div>';
html += staticObjs.map(obj => {
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
const colorClass = obj.color || 'white';
return '<div class="object-item" style="opacity:0.7;">' +
'<span class="type">' + (obj.object_type || '?') + '</span>' +
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
'<span class="pos">[' + pos + ']</span>' +
'</div>';
}).join('');
}
list.innerHTML = html;
}
async function updateQueue() {
const data = await fetchJson('/api/queue');
if (data.error) return;
const list = $('queue-list');
if (!data.length) {
list.innerHTML = '<div class="empty">Queue is empty</div>';
return;
}
list.innerHTML = data.map((item, i) => {
const payload = JSON.stringify(item.payload || {});
return '<div class="queue-item">' +
'<span class="index">' + (i + 1) + '</span>' +
'<span class="action">' + item.action + '</span>' +
'<span style="color:#888;font-size:11px">' + payload + '</span>' +
'</div>';
}).join('');
}
async function sendCommand(text) {
if (!text.trim()) return;
$('btn-send').disabled = true;
log('Sending: "' + text + '"', 'info');
const res = await fetchJson('/api/command', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text: text })
});
$('btn-send').disabled = false;
if (res.ok) {
log('Response: ' + res.text, 'success');
} else {
log('Error: ' + (res.text || res.detail || 'Unknown error'), 'error');
}
$('command-input').value = '';
updateStatus();
updateQueue();
}
function sendQuick(text) {
$('command-input').value = text;
sendCommand(text);
}
async function clearQueue() {
log('Clearing queue...', 'info');
const res = await fetchJson('/api/queue/clear', { method: 'POST' });
if (res.ok) {
log('Queue cleared', 'success');
} else {
log('Failed to clear queue', 'error');
}
updateQueue();
}
// Form submit
$('command-form').addEventListener('submit', e => {
e.preventDefault();
sendCommand($('command-input').value);
});
// Camera update
let cameraErrorCount = 0;
async function updateCamera() {
const info = await fetchJson('/api/image/info');
const overlay = $('camera-overlay');
const img = $('camera-img');
const noImage = $('no-image');
const status = $('camera-status');
if (info.error || !info.has_image) {
cameraErrorCount++;
if (cameraErrorCount > 3) {
img.style.display = 'none';
noImage.style.display = 'flex';
overlay.textContent = '';
status.textContent = '(no feed)';
}
return;
}
cameraErrorCount = 0;
noImage.style.display = 'none';
img.style.display = 'block';
// Update image with cache-busting
const newSrc = '/api/image?t=' + Date.now();
if (img.src !== newSrc) {
img.src = newSrc;
}
// Update overlay
const ageMs = info.age_ms || 0;
if (ageMs < 500) {
overlay.textContent = 'LIVE';
overlay.className = 'camera-overlay ok';
} else if (ageMs < 2000) {
overlay.textContent = ageMs + 'ms';
overlay.className = 'camera-overlay stale';
} else {
overlay.textContent = 'STALE ' + (ageMs/1000).toFixed(1) + 's';
overlay.className = 'camera-overlay error';
}
status.textContent = '';
}
// Auto-refresh
setInterval(updateStatus, 500);
setInterval(updateObjects, 1000);
setInterval(updateQueue, 500);
setInterval(updateCamera, 100);
// Initial load
updateStatus();
updateObjects();
updateQueue();
updateCamera();
log('Interface loaded', 'info');
</script>
</body>
</html>
"""

View File

@@ -0,0 +1,25 @@
[project]
name = "dora-voice-control"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for voice command control via WebSocket"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 12.0.0",
"websockets >= 12.0",
"fastapi >= 0.109.0",
"uvicorn >= 0.27.0",
"pydantic >= 2.0.0",
"opencv-python >= 4.8.0",
]
[project.optional-dependencies]
llm = ["google-genai"]
[project.scripts]
dora-voice-control = "dora_voice_control.main:main"

BIN
trained_models/yolo8n.pt Normal file

Binary file not shown.