Add voice control; working, but needs more work
This commit is contained in:
22
config.toml
Normal file
22
config.toml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Vision / pick-and-place configuration shared by the detector and voice nodes.

[roi]
# Image-space region of interest, (x, y) pixel corners.
roi_top_left = [500, 230]
roi_bottom_right = [775, 510]

[bucket_positions]
# Per-color drop-off positions. Units/frame not stated here — presumably
# robot-base XY in mm; confirm against the voice-control node.
blue_bucket_pos = [400, 90]
red_bucket_pos = [550, 90]
yellow_bucket_pos = [550, 630]
white_bucket_pos = [400, 630]

[dominant_colors]
# Reference colors in BGR channel order (blue == [255, 0, 0]), matching the
# OpenCV BGR frames the detector samples.
blue = [255, 0, 0]
red = [0, 0, 255]
yellow = [0, 255, 255]
white = [255, 255, 255]

[object_parameters]
# Pixel-area threshold separating "big" from "small" detections.
size_threshold = 4200
# Heights in mm — presumably per-size pick heights; confirm against consumers.
big_height = 125.9
small_height = 106.0
bottom_height = 68.0
normal_height = 220.0
|
||||||
133
dataflow_voice_control_ulite6_zed.yml
Normal file
133
dataflow_voice_control_ulite6_zed.yml
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
# Dora dataflow: ZED camera -> YOLO detector -> voice control -> uLite6 arm,
# with a WebSocket IO bridge for browser clients.
nodes:
  # ZED stereo camera driver (C++), publishes BGR frames and a point cloud.
  - id: zed_camera_cpp
    build: bash -lc "cmake -S dora_zed_cpp -B dora_zed_cpp/build && cmake --build dora_zed_cpp/build"
    path: dora_zed_cpp/build/dora_zed_cpp
    env:
      ZED_RESOLUTION: "720"
      ZED_FPS: "15"
      ZED_DEPTH_MODE: "NEURAL"
      ZED_DEPTH_MIN_MM: "10"
      ZED_DEPTH_MAX_MM: "600"
      ZED_DEPTH_FILL: "false"
      ZED_FLIP: "ON"
      ZED_WARMUP_FRAMES: "30"
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image_bgr
      - camera_info
      - point_cloud

  # uLite6 robot-arm driver; executes commands from the voice node.
  - id: ulite6
    build: uv pip install -e dora_ulite6
    path: dora_ulite6/dora_ulite6/main.py
    inputs:
      tick: dora/timer/millis/10
      command: voice/robot_cmd
    outputs:
      - tcp_pose
      - status
    env:
      ROBOT_IP: "192.168.1.192"
      DEFAULT_SPEED: "30"
      DEFAULT_UNITS: "mm"
      API_HOST: "0.0.0.0"
      API_PORT: "9000"
      VACUUM_ENABLED: "true"
      # Initial position on startup: "home", "pose", or "none"
      # Set to "none" - voice control handles initial positioning
      INIT_MODE: "none"

  # WebSocket bridge between web clients and the dataflow.
  - id: iobridge
    build: |
      uv venv -p 3.12 --seed --allow-existing
      uv pip install -e dora_iobridge
    path: dora_iobridge/dora_iobridge/main.py
    env:
      VIRTUAL_ENV: ./.venv
      VOICE_HOST: "0.0.0.0"
      VOICE_PORT: "8765"
      VOICE_IN_OUTPUT: "voice_in"
      VOICE_OUT_INPUT: "voice_out"
      SCENE_INPUT: "scene_update"
    inputs:
      voice_out: voice/voice_out
      scene_update: voice/scene_update
      tick: dora/timer/millis/100
    outputs:
      - voice_in

  # YOLO detector: localizes objects in the robot base frame.
  - id: detector
    build: |
      uv venv -p 3.12 --seed --allow-existing
      uv pip install -e dora_detector
    path: dora_detector/dora_detector/main.py
    env:
      VIRTUAL_ENV: ./.venv
      IMAGE_INPUT: "image_bgr"
      POINT_CLOUD_INPUT: "point_cloud"
      POSE_INPUT: "tcp_pose"
      OBJECTS_OUTPUT: "objects"
      IMAGE_OUTPUT: "image_annotated"
      CALIBRATION_FILE: "calibration_ulite6_zed.npz"
      DETECTOR_WEIGHTS: "trained_models/yolo8n.pt"
      CONFIG_FILE: "config.toml"
      ROI_TOP_LEFT: "500,230"
      ROI_BOTTOM_RIGHT: "775,510"
      SIZE_THRESHOLD: "4200"
      DETECT_EVERY_N: "3"
      MIN_DEPTH_MM: "10"
      MAX_DEPTH_MM: "600"
    inputs:
      image_bgr: zed_camera_cpp/image_bgr
      point_cloud: zed_camera_cpp/point_cloud
      tcp_pose: ulite6/tcp_pose
      tick: dora/timer/millis/100
    outputs:
      - objects
      - image_annotated

  # Voice-control brain: interprets commands, plans motions, tracks the scene.
  - id: voice
    build: |
      uv venv -p 3.12 --seed --allow-existing
      uv pip install -e dora_voice_control
    path: dora_voice_control/dora_voice_control/main.py
    env:
      VIRTUAL_ENV: ./.venv
      OBJECTS_INPUT: "objects"
      POSE_INPUT: "tcp_pose"
      STATUS_INPUT: "status"
      COMMAND_OUTPUT: "robot_cmd"
      CONFIG_FILE: "config.toml"
      # Map Spanish command names to detector class names
      CLASS_MAP: '{"cilindro": "cylinder", "cubo": "cube", "estrella": "star", "caja": "box", "amarillo": "yellow", "rojo": "red", "azul": "blue", "blanco": "white", "grande": "big", "pequeno": "small"}'
      VOICE_IN_INPUT: "voice_in"
      VOICE_OUT_OUTPUT: "voice_out"
      SCENE_OUTPUT: "scene_update"
      TCP_OFFSET_MM: "63.0"
      APPROACH_OFFSET_MM: "50.0"
      STEP_MM: "20.0"
      DEFAULT_ROLL: "180.0"
      DEFAULT_PITCH: "0.0"
      DEFAULT_YAW: "0.0"
      DRY_RUN: "false"
      # Initial position (used on startup and reset command)
      INIT_ON_START: "true"
      INIT_X: "300.0"
      INIT_Y: "0.0"
      INIT_Z: "350.0"
      INIT_ROLL: "180.0"
      INIT_PITCH: "0.0"
      INIT_YAW: "0.0"
      IMAGE_INPUT: "image_annotated"
      IMAGE_WIDTH: "1280"
      IMAGE_HEIGHT: "720"
      API_ENABLED: "true"
      API_PORT: "8080"
    inputs:
      objects: detector/objects
      tcp_pose: ulite6/tcp_pose
      status: ulite6/status
      voice_in: iobridge/voice_in
      image_annotated: detector/image_annotated
      tick: dora/timer/millis/100
    outputs:
      - robot_cmd
      - voice_out
      - scene_update
|
||||||
1
dora_detector/dora_detector/__init__.py
Normal file
1
dora_detector/dora_detector/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Dora detector node package."""
|
||||||
513
dora_detector/dora_detector/main.py
Normal file
513
dora_detector/dora_detector/main.py
Normal file
@@ -0,0 +1,513 @@
|
|||||||
|
"""Dora node for YOLO detection and base-frame object localization."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import pyarrow as pa
|
||||||
|
from dora import Node
|
||||||
|
from ultralytics import YOLO
|
||||||
|
|
||||||
|
try:
|
||||||
|
import tomllib
|
||||||
|
except ModuleNotFoundError: # pragma: no cover
|
||||||
|
import tomli as tomllib
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DetectionConfig:
    """Runtime configuration for detection, ROI filtering and color naming."""

    # YOLO inference parameters (input size, confidence and NMS IoU thresholds).
    imgsz: int
    conf: float
    iou: float
    # Pixel-area threshold separating "big" from "small" detections.
    size_threshold: int
    # Image-space region of interest, (x, y) pixel corners.
    roi_top_left: Tuple[int, int]
    roi_bottom_right: Tuple[int, int]
    use_roi: bool
    # Run detection only on every N-th received frame.
    detect_every_n: int
    # Valid depth range (mm) for point-cloud samples.
    min_depth_mm: float
    max_depth_mm: float
    # Reference colors, BGR order (frames are decoded as BGR; blue=(255,0,0)).
    color_blue: Tuple[int, int, int]
    color_red: Tuple[int, int, int]
    color_yellow: Tuple[int, int, int]
    color_white: Tuple[int, int, int]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
|
||||||
|
try:
|
||||||
|
parts = [p.strip() for p in raw.split(",")]
|
||||||
|
if len(parts) >= 2:
|
||||||
|
return int(parts[0]), int(parts[1])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]:
|
||||||
|
try:
|
||||||
|
parts = [p.strip() for p in raw.split(",")]
|
||||||
|
if len(parts) >= 2:
|
||||||
|
return float(parts[0]), float(parts[1])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]:
|
||||||
|
try:
|
||||||
|
parts = [p.strip() for p in raw.split(",")]
|
||||||
|
if len(parts) >= 3:
|
||||||
|
return int(parts[0]), int(parts[1]), int(parts[2])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _rotation_matrix_xyz(
|
||||||
|
roll_deg: float, pitch_deg: float, yaw_deg: float
|
||||||
|
) -> np.ndarray:
|
||||||
|
roll = np.deg2rad(roll_deg)
|
||||||
|
pitch = np.deg2rad(pitch_deg)
|
||||||
|
yaw = np.deg2rad(yaw_deg)
|
||||||
|
cx, sx = np.cos(roll), np.sin(roll)
|
||||||
|
cy, sy = np.cos(pitch), np.sin(pitch)
|
||||||
|
cz, sz = np.cos(yaw), np.sin(yaw)
|
||||||
|
rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]])
|
||||||
|
rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]])
|
||||||
|
rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]])
|
||||||
|
return rot_z @ rot_y @ rot_x
|
||||||
|
|
||||||
|
|
||||||
|
def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray:
    """Convert an [x, y, z, roll, pitch, yaw] pose (mm / degrees) into a 4x4
    homogeneous transform whose translation is expressed in metres."""
    x, y, z, roll, pitch, yaw = tcp_pose_mm_deg
    transform = np.eye(4)
    transform[:3, :3] = _rotation_matrix_xyz(roll, pitch, yaw)
    # mm -> m for the translation column.
    transform[:3, 3] = np.array([x, y, z], dtype=np.float64) / 1000.0
    return transform
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
|
||||||
|
encoding = str(metadata.get("encoding", "bgr8")).lower()
|
||||||
|
width = metadata.get("width")
|
||||||
|
height = metadata.get("height")
|
||||||
|
if (width is None or height is None) and "shape" in metadata:
|
||||||
|
shape = metadata.get("shape")
|
||||||
|
if isinstance(shape, (list, tuple)) and len(shape) >= 2:
|
||||||
|
height = height if height is not None else int(shape[0])
|
||||||
|
width = width if width is not None else int(shape[1])
|
||||||
|
if width is None or height is None:
|
||||||
|
raise KeyError("width/height (or shape) missing from metadata")
|
||||||
|
|
||||||
|
if encoding == "bgr8":
|
||||||
|
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
|
||||||
|
return frame.copy()
|
||||||
|
if encoding == "rgb8":
|
||||||
|
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
|
||||||
|
return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
|
||||||
|
if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}:
|
||||||
|
frame = storage.to_numpy().astype(np.uint8)
|
||||||
|
return cv2.imdecode(frame, cv2.IMREAD_COLOR)
|
||||||
|
if encoding == "yuv420":
|
||||||
|
yuv = storage.to_numpy().astype(np.uint8)
|
||||||
|
yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width))
|
||||||
|
return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR)
|
||||||
|
|
||||||
|
raise RuntimeError(f"Unsupported image encoding: {encoding}")
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
|
||||||
|
dtype_str = str(metadata.get("dtype", "float32"))
|
||||||
|
if dtype_str != "float32":
|
||||||
|
raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}")
|
||||||
|
shape = metadata.get("shape")
|
||||||
|
if not isinstance(shape, (list, tuple)) or len(shape) < 3:
|
||||||
|
raise KeyError("point cloud shape missing from metadata")
|
||||||
|
height, width, channels = [int(v) for v in shape[:3]]
|
||||||
|
if channels < 3:
|
||||||
|
raise ValueError("point cloud requires at least 3 channels")
|
||||||
|
raw = storage.to_numpy().astype(np.uint8).tobytes()
|
||||||
|
values = np.frombuffer(raw, dtype=np.float32)
|
||||||
|
return values.reshape((height, width, channels))
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool:
|
||||||
|
if not np.all(np.isfinite(point_xyz)):
|
||||||
|
return False
|
||||||
|
z = float(point_xyz[2])
|
||||||
|
if z <= 0:
|
||||||
|
return False
|
||||||
|
if z < cfg.min_depth_mm or z > cfg.max_depth_mm:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_point(
    point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig
) -> Optional[np.ndarray]:
    """Read a 3-D point at pixel (x, y) from the point cloud.

    When the centre sample is invalid, fall back to the per-axis median of
    all valid samples in a 7x7 neighbourhood. Returns None when (x, y) is out
    of bounds or no valid sample exists.
    """
    height, width = point_cloud.shape[:2]
    if not (0 <= x < width and 0 <= y < height):
        return None
    centre = point_cloud[y, x, :3].astype(np.float64)
    if _valid_point(centre, cfg):
        return centre
    window = 3
    neighbours = []
    for yy in range(y - window, y + window + 1):
        for xx in range(x - window, x + window + 1):
            if 0 <= xx < width and 0 <= yy < height:
                candidate = point_cloud[yy, xx, :3].astype(np.float64)
                if _valid_point(candidate, cfg):
                    neighbours.append(candidate)
    if neighbours:
        return np.median(np.stack(neighbours, axis=0), axis=0)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]:
|
||||||
|
x1, y1, x2, y2 = bbox
|
||||||
|
x1 = max(0, x1)
|
||||||
|
y1 = max(0, y1)
|
||||||
|
x2 = min(image.shape[1], x2)
|
||||||
|
y2 = min(image.shape[0], y2)
|
||||||
|
roi = image[y1:y2, x1:x2]
|
||||||
|
if roi.size == 0:
|
||||||
|
return (0, 0, 0)
|
||||||
|
color = np.median(roi, axis=(0, 1)).astype(int)
|
||||||
|
return int(color[0]), int(color[1]), int(color[2])
|
||||||
|
|
||||||
|
|
||||||
|
def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str:
|
||||||
|
colors = {
|
||||||
|
"blue": np.array(cfg.color_blue, dtype=np.float64),
|
||||||
|
"red": np.array(cfg.color_red, dtype=np.float64),
|
||||||
|
"yellow": np.array(cfg.color_yellow, dtype=np.float64),
|
||||||
|
"white": np.array(cfg.color_white, dtype=np.float64),
|
||||||
|
}
|
||||||
|
color_vec = np.array(color, dtype=np.float64)
|
||||||
|
best_name = "unknown"
|
||||||
|
best_dist = float("inf")
|
||||||
|
for name, value in colors.items():
|
||||||
|
dist = np.linalg.norm(color_vec - value)
|
||||||
|
if dist < best_dist:
|
||||||
|
best_name = name
|
||||||
|
best_dist = dist
|
||||||
|
return best_name
|
||||||
|
|
||||||
|
|
||||||
|
def _load_calibration(calibration_file: str) -> np.ndarray:
|
||||||
|
calib = np.load(calibration_file, allow_pickle=True)
|
||||||
|
t_cam2gripper = calib["T_cam2gripper"]
|
||||||
|
return t_cam2gripper
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config_file(path: str) -> Dict[str, Any]:
|
||||||
|
if not path or not os.path.exists(path):
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as handle:
|
||||||
|
return tomllib.load(handle)
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_config(config_path: str) -> DetectionConfig:
    """Assemble a DetectionConfig from the TOML file plus environment overrides.

    Precedence for each value: environment variable > config.toml entry >
    hard-coded default (the TOML value is injected as the env default string).
    """
    cfg_data = _load_config_file(config_path)
    roi_cfg = cfg_data.get("roi", {})
    colors_cfg = cfg_data.get("dominant_colors", {})
    obj_cfg = cfg_data.get("object_parameters", {})

    # YOLO inference parameters: env-only, no config.toml section for these.
    imgsz = int(os.getenv("YOLO_IMGSZ", "640"))
    conf = float(os.getenv("YOLO_CONF", "0.25"))
    iou = float(os.getenv("YOLO_IOU", "0.45"))
    size_threshold = int(
        os.getenv("SIZE_THRESHOLD", str(obj_cfg.get("size_threshold", 4200)))
    )
    # ROI corners: TOML lists are serialized back to "x,y" so the same parser
    # handles both the env override and the config value.
    roi_top_left = _parse_int_pair(
        os.getenv(
            "ROI_TOP_LEFT",
            ",".join([str(v) for v in roi_cfg.get("roi_top_left", [500, 230])]),
        ),
        (500, 230),
    )
    roi_bottom_right = _parse_int_pair(
        os.getenv(
            "ROI_BOTTOM_RIGHT",
            ",".join([str(v) for v in roi_cfg.get("roi_bottom_right", [775, 510])]),
        ),
        (775, 510),
    )
    use_roi = os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes")
    detect_every_n = int(os.getenv("DETECT_EVERY_N", "3"))
    min_depth_mm = float(os.getenv("MIN_DEPTH_MM", "10"))
    max_depth_mm = float(os.getenv("MAX_DEPTH_MM", "600"))
    # Reference colors — presumably BGR triples (blue default is [255, 0, 0],
    # which matches OpenCV BGR frames); confirm if ever fed RGB data.
    color_blue = _parse_color(
        os.getenv(
            "COLOR_BLUE",
            ",".join([str(v) for v in colors_cfg.get("blue", [255, 0, 0])]),
        ),
        (255, 0, 0),
    )
    color_red = _parse_color(
        os.getenv(
            "COLOR_RED",
            ",".join([str(v) for v in colors_cfg.get("red", [0, 0, 255])]),
        ),
        (0, 0, 255),
    )
    color_yellow = _parse_color(
        os.getenv(
            "COLOR_YELLOW",
            ",".join([str(v) for v in colors_cfg.get("yellow", [0, 255, 255])]),
        ),
        (0, 255, 255),
    )
    color_white = _parse_color(
        os.getenv(
            "COLOR_WHITE",
            ",".join([str(v) for v in colors_cfg.get("white", [255, 255, 255])]),
        ),
        (255, 255, 255),
    )
    return DetectionConfig(
        imgsz=imgsz,
        conf=conf,
        iou=iou,
        size_threshold=size_threshold,
        roi_top_left=roi_top_left,
        roi_bottom_right=roi_bottom_right,
        use_roi=use_roi,
        detect_every_n=detect_every_n,
        min_depth_mm=min_depth_mm,
        max_depth_mm=max_depth_mm,
        color_blue=color_blue,
        color_red=color_red,
        color_yellow=color_yellow,
        color_white=color_white,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool:
|
||||||
|
if not cfg.use_roi:
|
||||||
|
return True
|
||||||
|
x1, y1, x2, y2 = bbox
|
||||||
|
rx1, ry1 = cfg.roi_top_left
|
||||||
|
rx2, ry2 = cfg.roi_bottom_right
|
||||||
|
return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_detections(
    frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig
) -> np.ndarray:
    """Draw the ROI, per-object bounding boxes, labels and base-frame
    positions on a copy of *frame*; the input frame is not modified."""
    annotated = frame.copy()

    # Draw ROI rectangle (always visible; grey when ROI filtering is off)
    cv2.rectangle(
        annotated,
        cfg.roi_top_left,
        cfg.roi_bottom_right,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )
    # Label the ROI
    cv2.putText(
        annotated,
        "ROI",
        (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )

    # Color mapping for visualization (BGR draw colors, keyed by color name)
    color_map = {
        "blue": (255, 100, 0),
        "red": (0, 0, 255),
        "yellow": (0, 255, 255),
        "white": (200, 200, 200),
        "unknown": (128, 128, 128),
    }

    for obj in objects:
        bbox = obj.get("bbox", [0, 0, 0, 0])
        color_name = obj.get("color", "unknown")
        obj_type = obj.get("object_type", "?")
        size = obj.get("size", "?")
        pos = obj.get("position_mm", [0, 0, 0])

        color = color_map.get(color_name, (128, 128, 128))

        # Draw bounding box
        cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)

        # Draw label background sized to the rendered text
        label = f"{obj_type} {color_name} {size}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(
            annotated,
            (bbox[0], bbox[1] - th - 8),
            (bbox[0] + tw + 4, bbox[1]),
            color,
            -1,
        )
        cv2.putText(
            annotated,
            label,
            (bbox[0] + 2, bbox[1] - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
        )

        # Draw position (robot-base mm) just below the box
        pos_label = f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]"
        cv2.putText(
            annotated,
            pos_label,
            (bbox[0], bbox[3] + 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            color,
            1,
        )

    return annotated
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Detector node entry point.

    Caches the latest TCP pose and point cloud, runs YOLO on every N-th image
    frame, projects each detection's pixel into the robot base frame via the
    hand-eye calibration, then publishes a JSON object list plus an annotated
    BGR image.
    """
    image_input = os.getenv("IMAGE_INPUT", "image_bgr")
    point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    objects_output = os.getenv("OBJECTS_OUTPUT", "objects")
    image_output = os.getenv("IMAGE_OUTPUT", "image_annotated")
    calibration_file = os.getenv("CALIBRATION_FILE", "calibration.npz")
    weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS)
    config_file = os.getenv("CONFIG_FILE", "config.toml")

    cfg = _build_config(config_file)
    model = YOLO(weights)

    # Relative calibration paths are preferred from a local "config/" dir
    # when such a file exists there.
    if calibration_file and not os.path.isabs(calibration_file):
        config_path = os.path.join("config", calibration_file)
        calibration_file = config_path if os.path.exists(config_path) else calibration_file
    t_cam2gripper = _load_calibration(calibration_file)

    node = Node()
    # Latest-value caches; detection is skipped until both are populated.
    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None  # NOTE(review): set but never read
    latest_point_cloud: Optional[np.ndarray] = None
    latest_pc_at: Optional[float] = None  # NOTE(review): set but never read
    frame_count = 0

    for event in node:
        if event["type"] != "INPUT":
            continue

        now = time.monotonic()
        # Cache the newest 6-DOF TCP pose ([x, y, z, roll, pitch, yaw]).
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                latest_pose = tcp_pose[:6].tolist()
                latest_pose_at = now
            continue

        # Cache the newest point cloud.
        if event["id"] == point_cloud_input:
            latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {}))
            latest_pc_at = now
            continue

        if event["id"] != image_input:
            continue

        # Throttle: only run YOLO on every detect_every_n-th frame.
        frame_count += 1
        if frame_count % max(1, cfg.detect_every_n) != 0:
            continue

        # Need both a pose and a point cloud to localize detections.
        if latest_pose is None or latest_point_cloud is None:
            continue

        frame = _decode_image(event["value"], event.get("metadata", {}))

        results = model.predict(
            frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False
        )[0]

        base_T_flange = _pose_to_matrix(latest_pose)
        objects: List[Dict[str, Any]] = []
        for r in results.boxes:
            bbox = [int(x) for x in r.xyxy[0]]
            if not _within_roi(bbox, cfg):
                continue
            # Sample the point cloud at the bbox centre pixel.
            cx = int((bbox[0] + bbox[2]) / 2)
            cy = int((bbox[1] + bbox[3]) / 2)
            point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg)
            if point_cam_mm is None:
                continue
            # Homogeneous camera point, converted mm -> m, then chained
            # through camera->gripper and flange->base transforms.
            point_cam_m = np.array(
                [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0],
                dtype=np.float64,
            )
            point_cam_m[:3] /= 1000.0
            point_base = base_T_flange @ t_cam2gripper @ point_cam_m
            point_base_mm = point_base[:3] * 1000.0

            dominant = _dominant_color(frame, bbox)
            color_name = _closest_color(dominant, cfg)
            area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
            size_label = "big" if area >= cfg.size_threshold else "small"

            objects.append(
                {
                    "object_type": results.names[int(r.cls.item())],
                    "confidence": float(r.conf.item()),
                    "color": color_name,
                    "size": size_label,
                    "bbox": bbox,
                    "center_px": [cx, cy],
                    "position_mm": [
                        float(point_base_mm[0]),
                        float(point_base_mm[1]),
                        float(point_base_mm[2]),
                    ],
                    "timestamp_ns": time.time_ns(),
                }
            )

        # Publish the detections as a single JSON string.
        payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()})
        node.send_output(
            objects_output,
            pa.array([payload]),
            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
        )

        # Send annotated image
        annotated = _draw_detections(frame, objects, cfg)
        h, w = annotated.shape[:2]
        node.send_output(
            image_output,
            pa.array(annotated.ravel().tolist()),
            metadata={
                "encoding": "bgr8",
                "width": w,
                "height": h,
                "timestamp_ns": time.time_ns(),
            },
        )


if __name__ == "__main__":
    main()
|
||||||
19
dora_detector/pyproject.toml
Normal file
19
dora_detector/pyproject.toml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
[project]
name = "dora-detector"
version = "0.1.0"
# PEP 621: `file` must point to an actual license file in the distribution;
# "MIT" is a license expression, so declare it with `text` instead.
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for YOLO-based object detection with ZED point cloud"

requires-python = ">=3.8"

dependencies = [
    "dora-rs >= 0.3.9",
    "numpy < 2.0.0",
    "opencv-python >= 4.1.1",
    "pyarrow >= 12.0.0",
    # main.py falls back to `import tomli as tomllib` on Python < 3.11;
    # declare the backport so that fallback can actually resolve.
    "tomli >= 1.1.0; python_version < '3.11'",
    "ultralytics >= 8.0.0",
]

[project.scripts]
dora-detector = "dora_detector.main:main"
|
||||||
178
dora_iobridge/README.md
Normal file
178
dora_iobridge/README.md
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
# Dora IOBridge Node
|
||||||
|
|
||||||
|
A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates.
|
||||||
|
|
||||||
|
## Inputs/Outputs
|
||||||
|
|
||||||
|
| Input | Type | Description |
|
||||||
|
|----------------|--------|---------------------------------------|
|
||||||
|
| `voice_out` | JSON | Response from voice control node |
|
||||||
|
| `scene_update` | JSON | Scene objects from voice control |
|
||||||
|
|
||||||
|
| Output | Type | Description |
|
||||||
|
|----------------|--------|---------------------------------------|
|
||||||
|
| `voice_in` | string | Voice commands forwarded to Dora |
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VOICE_HOST=0.0.0.0 # Bind address
|
||||||
|
VOICE_PORT=8765 # Listen port
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd dora_iobridge
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Test with WebSocket (wscat)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install wscat
|
||||||
|
npm install -g wscat
|
||||||
|
|
||||||
|
# Connect to the server
|
||||||
|
wscat -c ws://localhost:8765
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test with websocat
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install websocat
|
||||||
|
# Ubuntu: sudo apt install websocat
|
||||||
|
# macOS: brew install websocat
|
||||||
|
|
||||||
|
# Send a ping
|
||||||
|
echo '{"type": "ping"}' | websocat ws://localhost:8765
|
||||||
|
# Response: {"type": "pong"}
|
||||||
|
|
||||||
|
# Send a voice command
|
||||||
|
echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765
|
||||||
|
|
||||||
|
# Request scene refresh
|
||||||
|
echo '{"type": "scene_refresh"}' | websocat ws://localhost:8765
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test with Python
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import websockets
|
||||||
|
import json
|
||||||
|
|
||||||
|
async def test_iobridge():
|
||||||
|
uri = "ws://localhost:8765"
|
||||||
|
async with websockets.connect(uri) as ws:
|
||||||
|
# Test ping
|
||||||
|
await ws.send(json.dumps({"type": "ping"}))
|
||||||
|
response = await ws.recv()
|
||||||
|
print(f"Ping response: {response}")
|
||||||
|
|
||||||
|
# Send command
|
||||||
|
await ws.send(json.dumps({
|
||||||
|
"type": "command",
|
||||||
|
"text": "agarra el cubo rojo"
|
||||||
|
}))
|
||||||
|
|
||||||
|
# Listen for responses
|
||||||
|
async for message in ws:
|
||||||
|
data = json.loads(message)
|
||||||
|
print(f"Received: {data}")
|
||||||
|
|
||||||
|
asyncio.run(test_iobridge())
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test with curl (HTTP upgrade not supported directly)
|
||||||
|
|
||||||
|
Since WebSocket requires an upgrade handshake, use this shell script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# test_iobridge.sh
|
||||||
|
|
||||||
|
# Using websocat for interactive testing
|
||||||
|
websocat ws://localhost:8765 <<EOF
|
||||||
|
{"type": "ping"}
|
||||||
|
{"type": "command", "text": "sube"}
|
||||||
|
{"type": "scene_refresh"}
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
## WebSocket Message Types
|
||||||
|
|
||||||
|
### Client -> Server
|
||||||
|
|
||||||
|
**Command (voice input)**
|
||||||
|
```json
|
||||||
|
{"type": "command", "text": "agarra el cubo rojo"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ping (health check)**
|
||||||
|
```json
|
||||||
|
{"type": "ping"}
|
||||||
|
```
|
||||||
|
Response: `{"type": "pong"}`
|
||||||
|
|
||||||
|
**Scene Refresh**
|
||||||
|
```json
|
||||||
|
{"type": "scene_refresh"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Server -> Client (Broadcasts)
|
||||||
|
|
||||||
|
**Command Response**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "response",
|
||||||
|
"text": "Ok, voy a tomar",
|
||||||
|
"status": "ok"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Scene Update**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "scene_updated",
|
||||||
|
"objects": [
|
||||||
|
{
|
||||||
|
"object_type": "cubo",
|
||||||
|
"color": "rojo",
|
||||||
|
"size": "big",
|
||||||
|
"position_mm": [150.0, 200.0, 280.0],
|
||||||
|
"source": "detection"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dora Dataflow Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nodes:
|
||||||
|
- id: iobridge
|
||||||
|
build: pip install -e ./dora_iobridge
|
||||||
|
path: dora_iobridge
|
||||||
|
inputs:
|
||||||
|
voice_out: voice_control/voice_out
|
||||||
|
scene_update: voice_control/scene_update
|
||||||
|
outputs:
|
||||||
|
- voice_in
|
||||||
|
env:
|
||||||
|
VOICE_HOST: "0.0.0.0"
|
||||||
|
VOICE_PORT: "8765"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dora up
|
||||||
|
dora start dataflow.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- dora-rs >= 0.3.9
|
||||||
|
- pyarrow >= 12.0.0
|
||||||
|
- websockets >= 12.0
|
||||||
1
dora_iobridge/dora_iobridge/__init__.py
Normal file
1
dora_iobridge/dora_iobridge/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Dora IO bridge node package."""
|
||||||
145
dora_iobridge/dora_iobridge/main.py
Normal file
145
dora_iobridge/dora_iobridge/main.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
"""Dora node bridging WebSocket IO to Dora topics."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, Optional, Set
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
from dora import Node
|
||||||
|
from websockets.server import serve, WebSocketServerProtocol
|
||||||
|
|
||||||
|
|
||||||
|
class IoBridgeServer:
|
||||||
|
    def __init__(self, host: str, port: int):
        """WebSocket fan-out server bridging web clients to the Dora node.

        Args:
            host: bind address for the WebSocket server.
            port: listen port.
        """
        self.host = host
        self.port = port
        # Currently connected clients; broadcast() targets all of them.
        self.clients: Set[WebSocketServerProtocol] = set()
        # Async callables installed by the owning node; awaited when a
        # "command" / "scene_refresh" message arrives. None until registered.
        self.command_handler = None
        self.scene_refresh_handler = None
|
||||||
|
|
||||||
|
async def handler(self, websocket: WebSocketServerProtocol):
|
||||||
|
self.clients.add(websocket)
|
||||||
|
try:
|
||||||
|
async for message in websocket:
|
||||||
|
try:
|
||||||
|
data = json.loads(message)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
await websocket.send(
|
||||||
|
json.dumps({"type": "error", "text": "Invalid JSON message"})
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
response = await self._route_message(data, websocket)
|
||||||
|
if response:
|
||||||
|
await websocket.send(json.dumps(response))
|
||||||
|
finally:
|
||||||
|
self.clients.discard(websocket)
|
||||||
|
|
||||||
|
async def _route_message(
|
||||||
|
self, data: Dict[str, Any], websocket: WebSocketServerProtocol
|
||||||
|
) -> Optional[Dict[str, Any]]:
|
||||||
|
msg_type = data.get("type")
|
||||||
|
if msg_type == "command":
|
||||||
|
text = data.get("text", "")
|
||||||
|
if self.command_handler:
|
||||||
|
await self.command_handler(text)
|
||||||
|
return None
|
||||||
|
return {"type": "error", "text": "No command handler registered"}
|
||||||
|
if msg_type == "ping":
|
||||||
|
return {"type": "pong"}
|
||||||
|
if msg_type == "scene_refresh":
|
||||||
|
if self.scene_refresh_handler:
|
||||||
|
objects = await self.scene_refresh_handler()
|
||||||
|
return {"type": "scene_updated", "objects": objects}
|
||||||
|
return {"type": "error", "text": "No scene handler registered"}
|
||||||
|
return {"type": "error", "text": f"Unknown message type: {msg_type}"}
|
||||||
|
|
||||||
|
async def broadcast(self, message: Dict[str, Any]):
|
||||||
|
if not self.clients:
|
||||||
|
return
|
||||||
|
payload = json.dumps(message)
|
||||||
|
await asyncio.gather(
|
||||||
|
*[client.send(payload) for client in self.clients], return_exceptions=True
|
||||||
|
)
|
||||||
|
|
||||||
|
async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol):
|
||||||
|
await websocket.send(json.dumps(message))
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
async with serve(self.handler, self.host, self.port):
|
||||||
|
await asyncio.Future()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the IO bridge: WebSocket server in a background thread, Dora loop here.

    Inbound WebSocket "command" messages are forwarded to Dora as ``voice_in``;
    Dora ``voice_out`` / ``scene_update`` inputs are broadcast to all clients.
    """
    host = os.getenv("VOICE_HOST", "0.0.0.0")
    port = int(os.getenv("VOICE_PORT", "8765"))
    input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in")
    response_input = os.getenv("VOICE_OUT_INPUT", "voice_out")
    scene_input = os.getenv("SCENE_INPUT", "scene_update")

    node = Node()
    server = IoBridgeServer(host, port)
    loop = asyncio.new_event_loop()

    def push_command(text: str) -> None:
        # NOTE(review): runs on the WebSocket server thread while the main
        # thread also drives `node`; assumes Node.send_output is thread-safe —
        # confirm against the dora-rs API.
        node.send_output(
            input_topic,
            pa.array([text]),
            metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()},
        )

    async def handle_scene_refresh():
        # Placeholder: scene data is pushed via the `scene_update` input instead.
        return []

    async def command_handler(text: str):
        # BUG FIX: must be a coroutine — IoBridgeServer._route_message does
        # `await self.command_handler(text)`; the previous sync version made
        # that `await None` raise TypeError on every command.
        push_command(text)
        return None

    server.command_handler = command_handler
    server.scene_refresh_handler = handle_scene_refresh

    def run_server():
        # Dedicated event loop for the WebSocket server thread.
        asyncio.set_event_loop(loop)
        loop.run_until_complete(server.start())

    threading.Thread(target=run_server, daemon=True).start()

    def first_string(event) -> str:
        # Dora inputs carry an arrow array; take its first element if present.
        return event["value"][0].as_py() if len(event["value"]) else ""

    for event in node:
        if event["type"] != "INPUT":
            continue

        if event["id"] == response_input:
            raw = first_string(event)
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                message = {
                    "type": "response",
                    "text": payload.get("text", ""),
                    "status": payload.get("status", "ok"),
                }
            except Exception:
                # Not JSON: forward the raw text verbatim.
                message = {"type": "response", "text": raw, "status": "ok"}
            # Hand the broadcast to the server thread's loop (thread-safe).
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
            continue

        if event["id"] == scene_input:
            raw = first_string(event)
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                objects = payload.get("objects", [])
                message = {"type": "scene_updated", "objects": objects}
            except Exception:
                message = {"type": "scene_updated", "objects": []}
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
17
dora_iobridge/pyproject.toml
Normal file
17
dora_iobridge/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[project]
|
||||||
|
name = "dora-iobridge"
|
||||||
|
version = "0.1.0"
|
||||||
|
license = { text = "MIT" }
|
||||||
|
authors = [{ name = "Dora" }]
|
||||||
|
description = "Dora node bridging WebSocket IO to Dora topics"
|
||||||
|
|
||||||
|
requires-python = ">=3.8"
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
"dora-rs >= 0.3.9",
|
||||||
|
"pyarrow >= 12.0.0",
|
||||||
|
"websockets >= 12.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
dora-iobridge = "dora_iobridge.main:main"
|
||||||
@@ -781,6 +781,12 @@ def _status_snapshot(helper: ULite6Helper) -> Dict[str, Any]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _log(msg: str) -> None:
|
||||||
|
"""Print a timestamped log message."""
|
||||||
|
timestamp = time.strftime("%H:%M:%S")
|
||||||
|
print(f"[ulite6 {timestamp}] {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
node = Node()
|
node = Node()
|
||||||
|
|
||||||
@@ -791,7 +797,42 @@ def main() -> None:
|
|||||||
api_port = int(os.getenv("API_PORT", "8080"))
|
api_port = int(os.getenv("API_PORT", "8080"))
|
||||||
vacuum_enabled = os.getenv("VACUUM_ENABLED", "false").lower() in ("true", "1", "yes")
|
vacuum_enabled = os.getenv("VACUUM_ENABLED", "false").lower() in ("true", "1", "yes")
|
||||||
|
|
||||||
|
# Initial position settings
|
||||||
|
init_mode = os.getenv("INIT_MODE", "none").lower() # "home", "pose", or "none"
|
||||||
|
init_x = float(os.getenv("INIT_X", "300.0"))
|
||||||
|
init_y = float(os.getenv("INIT_Y", "0.0"))
|
||||||
|
init_z = float(os.getenv("INIT_Z", "250.0"))
|
||||||
|
init_roll = float(os.getenv("INIT_ROLL", "180.0"))
|
||||||
|
init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
|
||||||
|
init_yaw = float(os.getenv("INIT_YAW", "0.0"))
|
||||||
|
init_speed = float(os.getenv("INIT_SPEED", "50.0"))
|
||||||
|
|
||||||
|
_log(f"Connecting to robot at {robot_ip}...")
|
||||||
helper = ULite6Helper(robot_ip)
|
helper = ULite6Helper(robot_ip)
|
||||||
|
_log("Robot connected")
|
||||||
|
|
||||||
|
# Move to initial position on startup
|
||||||
|
if init_mode == "home":
|
||||||
|
_log("Moving to home position...")
|
||||||
|
code = helper.go_home()
|
||||||
|
if code == 0:
|
||||||
|
_log("Home position reached")
|
||||||
|
else:
|
||||||
|
_log(f"Home failed with code {code}")
|
||||||
|
elif init_mode == "pose":
|
||||||
|
_log(f"Moving to initial pose: [{init_x}, {init_y}, {init_z}] roll={init_roll} pitch={init_pitch} yaw={init_yaw}")
|
||||||
|
code = helper.move_to_pose(
|
||||||
|
init_x, init_y, init_z,
|
||||||
|
init_roll, init_pitch, init_yaw,
|
||||||
|
speed=init_speed,
|
||||||
|
units="mm",
|
||||||
|
)
|
||||||
|
if code == 0:
|
||||||
|
_log("Initial pose reached")
|
||||||
|
else:
|
||||||
|
_log(f"Move to initial pose failed with code {code}")
|
||||||
|
else:
|
||||||
|
_log("Skipping initial position (INIT_MODE=none)")
|
||||||
|
|
||||||
# Create and start FastAPI server in background thread
|
# Create and start FastAPI server in background thread
|
||||||
app = create_api(helper, default_speed, default_units, vacuum_enabled)
|
app = create_api(helper, default_speed, default_units, vacuum_enabled)
|
||||||
@@ -895,6 +936,28 @@ def main() -> None:
|
|||||||
code=code,
|
code=code,
|
||||||
status=_status_snapshot(helper),
|
status=_status_snapshot(helper),
|
||||||
)
|
)
|
||||||
|
elif action in ("vacuum_on", "vacuum_off"):
|
||||||
|
if not vacuum_enabled:
|
||||||
|
_send_command_status(
|
||||||
|
node,
|
||||||
|
command_id=command_id,
|
||||||
|
action=action,
|
||||||
|
ok=False,
|
||||||
|
message="Vacuum gripper not enabled",
|
||||||
|
status=_status_snapshot(helper),
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
vacuum_on = action == "vacuum_on"
|
||||||
|
code = helper.set_vacuum_gripper(vacuum_on)
|
||||||
|
_send_command_status(
|
||||||
|
node,
|
||||||
|
command_id=command_id,
|
||||||
|
action=action,
|
||||||
|
ok=code == 0,
|
||||||
|
message="Vacuum command executed",
|
||||||
|
code=code,
|
||||||
|
status=_status_snapshot(helper),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
_send_command_status(
|
_send_command_status(
|
||||||
node,
|
node,
|
||||||
|
|||||||
211
dora_voice_control/README.md
Normal file
211
dora_voice_control/README.md
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
# Dora Voice Control Node
|
||||||
|
|
||||||
|
A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Spanish voice command parsing (rule-based or Gemini LLM)
|
||||||
|
- Real-time web debug interface
|
||||||
|
- Command queue management
|
||||||
|
- Workspace bounds validation
|
||||||
|
- Object detection integration
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
dora_voice_control/
|
||||||
|
├── __init__.py
|
||||||
|
├── main.py # Main Dora node entry point
|
||||||
|
├── api.py # FastAPI web server
|
||||||
|
├── config.py # Configuration management
|
||||||
|
├── models.py # Pydantic request/response models
|
||||||
|
├── parser.py # Voice command parsing logic
|
||||||
|
├── state.py # Shared state management
|
||||||
|
└── templates.py # HTML template for web interface
|
||||||
|
```
|
||||||
|
|
||||||
|
## Web Debug Interface
|
||||||
|
|
||||||
|
Access the debug interface at `http://localhost:8080` (default).
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Real-time status monitoring (pose, objects, queue)
|
||||||
|
- Send manual voice commands
|
||||||
|
- Quick command buttons
|
||||||
|
- View parse results
|
||||||
|
- Command history
|
||||||
|
- Clear queue
|
||||||
|
|
||||||
|
## Inputs/Outputs
|
||||||
|
|
||||||
|
| Input | Type | Description |
|
||||||
|
|---------------|--------|------------------------------------------|
|
||||||
|
| `voice_in` | string | Text transcription of voice command |
|
||||||
|
| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] |
|
||||||
|
| `objects` | JSON | Detected objects from vision system |
|
||||||
|
| `status` | JSON | Command execution status from robot |
|
||||||
|
|
||||||
|
| Output | Type | Description |
|
||||||
|
|---------------|--------|------------------------------------------|
|
||||||
|
| `robot_cmd` | JSON | Robot command with action and payload |
|
||||||
|
| `voice_out` | JSON | Response confirmation to user |
|
||||||
|
| `scene_update`| JSON | Updated scene with all visible objects |
|
||||||
|
|
||||||
|
## Supported Commands (Spanish)
|
||||||
|
|
||||||
|
| Command | Action | Example |
|
||||||
|
|---------------|----------------|--------------------------------|
|
||||||
|
| `subir` | Move up | "sube" |
|
||||||
|
| `bajar` | Move down | "baja" |
|
||||||
|
| `tomar` | Grab object | "agarra el cubo rojo" |
|
||||||
|
| `soltar` | Release object | "suelta en la caja azul" |
|
||||||
|
| `ir` | Go to object | "ve al cilindro" |
|
||||||
|
| `reiniciar` | Reset | "reinicia" |
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Web API Server
|
||||||
|
API_ENABLED=true # Enable/disable web interface
|
||||||
|
API_HOST=0.0.0.0 # Bind address
|
||||||
|
API_PORT=8080 # Listen port
|
||||||
|
|
||||||
|
# TCP Parameters
|
||||||
|
TCP_OFFSET_MM=63.0 # Z-offset to object surface
|
||||||
|
APPROACH_OFFSET_MM=50.0 # Safe approach distance above object
|
||||||
|
STEP_MM=20.0 # Distance for up/down increments
|
||||||
|
|
||||||
|
# LLM Configuration (optional)
|
||||||
|
LLM_PROVIDER=rules # "rules" or "gemini"
|
||||||
|
GOOGLE_API_KEY=your_key # Required if using gemini
|
||||||
|
GEMINI_MODEL=gemini-2.0-flash
|
||||||
|
|
||||||
|
# Workspace Safety (optional)
|
||||||
|
WORKSPACE_MIN_X=-300
|
||||||
|
WORKSPACE_MAX_X=300
|
||||||
|
WORKSPACE_MIN_Y=-300
|
||||||
|
WORKSPACE_MAX_Y=300
|
||||||
|
WORKSPACE_MIN_Z=0
|
||||||
|
WORKSPACE_MAX_Z=500
|
||||||
|
|
||||||
|
# Misc
|
||||||
|
DRY_RUN=false # Skip sending robot commands
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd dora_voice_control
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
# With LLM support
|
||||||
|
pip install -e ".[llm]"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Web Interface
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start the node (standalone for testing)
|
||||||
|
python -m dora_voice_control.main
|
||||||
|
|
||||||
|
# Open in browser
|
||||||
|
open http://localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
### API Endpoints
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get status
|
||||||
|
curl http://localhost:8080/api/status
|
||||||
|
|
||||||
|
# Get objects
|
||||||
|
curl http://localhost:8080/api/objects
|
||||||
|
|
||||||
|
# Get queue
|
||||||
|
curl http://localhost:8080/api/queue
|
||||||
|
|
||||||
|
# Send command
|
||||||
|
curl -X POST http://localhost:8080/api/command \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"text": "sube"}'
|
||||||
|
|
||||||
|
# Clear queue
|
||||||
|
curl -X POST http://localhost:8080/api/queue/clear
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python Test
|
||||||
|
|
||||||
|
```python
|
||||||
|
from dora_voice_control.parser import rule_parse, normalize
|
||||||
|
|
||||||
|
# Test command parsing
|
||||||
|
text = "agarra el cubo rojo grande"
|
||||||
|
result = rule_parse(normalize(text))
|
||||||
|
print(result)
|
||||||
|
# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dora Dataflow Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nodes:
|
||||||
|
- id: voice_control
|
||||||
|
build: pip install -e ./dora_voice_control
|
||||||
|
path: dora_voice_control
|
||||||
|
inputs:
|
||||||
|
voice_in: iobridge/voice_in
|
||||||
|
tcp_pose: robot/tcp_pose
|
||||||
|
objects: detector/objects
|
||||||
|
status: robot/status
|
||||||
|
outputs:
|
||||||
|
- robot_cmd
|
||||||
|
- voice_out
|
||||||
|
- scene_update
|
||||||
|
env:
|
||||||
|
API_ENABLED: "true"
|
||||||
|
API_PORT: "8080"
|
||||||
|
DRY_RUN: "false"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Message Examples
|
||||||
|
|
||||||
|
### Input: voice_in
|
||||||
|
```
|
||||||
|
"sube"
|
||||||
|
"agarra el cubo rojo"
|
||||||
|
"suelta en la caja azul"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Output: robot_cmd
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "550e8400-e29b-41d4-a716-446655440000",
|
||||||
|
"action": "move_to_pose",
|
||||||
|
"payload": {
|
||||||
|
"x": 150.0,
|
||||||
|
"y": 200.0,
|
||||||
|
"z": 280.0,
|
||||||
|
"roll": 180.0,
|
||||||
|
"pitch": 0.0,
|
||||||
|
"yaw": 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Output: voice_out
|
||||||
|
```json
|
||||||
|
{"text": "Ok, voy a subir", "status": "ok"}
|
||||||
|
{"text": "No entendi el comando", "status": "error"}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- dora-rs >= 0.3.9
|
||||||
|
- numpy < 2.0.0
|
||||||
|
- pyarrow >= 12.0.0
|
||||||
|
- fastapi >= 0.109.0
|
||||||
|
- uvicorn >= 0.27.0
|
||||||
|
- pydantic >= 2.0.0
|
||||||
|
- google-genai (optional, for Gemini mode)
|
||||||
1
dora_voice_control/dora_voice_control/__init__.py
Normal file
1
dora_voice_control/dora_voice_control/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Dora voice control node package."""
|
||||||
162
dora_voice_control/dora_voice_control/api.py
Normal file
162
dora_voice_control/dora_voice_control/api.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
"""FastAPI application for the voice control web interface."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from fastapi.responses import HTMLResponse, Response
|
||||||
|
|
||||||
|
# Handle both package and direct script execution
|
||||||
|
# __package__ is None when run as script, '' when imported from a script
|
||||||
|
if not __package__:
|
||||||
|
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
if _pkg_dir not in sys.path:
|
||||||
|
sys.path.insert(0, _pkg_dir)
|
||||||
|
from models import CommandRequest, CommandResponse
|
||||||
|
from state import SharedState
|
||||||
|
from templates import HTML_TEMPLATE
|
||||||
|
else:
|
||||||
|
from .models import CommandRequest, CommandResponse
|
||||||
|
from .state import SharedState
|
||||||
|
from .templates import HTML_TEMPLATE
|
||||||
|
|
||||||
|
|
||||||
|
def create_api(state: SharedState) -> FastAPI:
    """Create the FastAPI application exposing the voice-control debug endpoints.

    Every endpoint reads from / writes to the given SharedState. Internal
    failures are surfaced as HTTP 500 with the exception text as detail, and
    the original exception is chained (`from e`) so tracebacks stay complete.
    """
    app = FastAPI(
        title="Voice Control Debug API",
        description="Debug interface for the voice control node",
        version="0.1.0",
    )

    @app.get("/", response_class=HTMLResponse)
    def index() -> str:
        """Serve the web interface."""
        return HTML_TEMPLATE

    @app.get("/api/status")
    def get_status() -> dict:
        """Get current status."""
        try:
            return state.get_status()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/objects")
    def get_objects() -> dict:
        """Get detected and static objects."""
        try:
            return state.get_objects()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/queue")
    def get_queue() -> list:
        """Get the command queue."""
        try:
            return state.get_queue()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.post("/api/queue/clear")
    def clear_queue() -> dict:
        """Clear the command queue."""
        try:
            # NOTE(review): reaches into SharedState internals (_lock,
            # voice_state); a public SharedState.clear_queue() would be cleaner.
            with state._lock:
                state.voice_state.queue.clear()
            return {"ok": True}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/history")
    def get_history() -> list:
        """Get command history."""
        try:
            return state.get_history()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/errors")
    def get_errors() -> list:
        """Get error log."""
        try:
            return state.get_errors()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.post("/api/command", response_model=CommandResponse)
    def send_command(request: CommandRequest) -> CommandResponse:
        """Send a voice command through the registered command callback."""
        try:
            callback = state.get_command_callback()
            if callback is None:
                return CommandResponse(ok=False, text="No command handler available", status="error")

            result = callback(request.text)
            return CommandResponse(
                ok=result.get("status") == "ok",
                text=result.get("text", ""),
                status=result.get("status", "error"),
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/image")
    def get_image() -> Response:
        """Get the latest camera image as JPEG (204 when none is available)."""
        try:
            image_data = state.get_image()
            if image_data is None:
                # No frame yet: empty 204 so the UI can poll without errors.
                return Response(
                    content=b"",
                    media_type="image/jpeg",
                    status_code=204,
                )
            return Response(
                content=image_data,
                media_type="image/jpeg",
                headers={"Cache-Control": "no-cache, no-store, must-revalidate"},
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/api/image/info")
    def get_image_info() -> dict:
        """Get image metadata (availability and age in milliseconds)."""
        try:
            return {
                "has_image": state.get_image() is not None,
                "age_ms": state.get_image_age_ms(),
            }
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    return app
|
||||||
|
|
||||||
|
|
||||||
|
def run_uvicorn(app: FastAPI, host: str, port: int) -> None:
    """Run a uvicorn server for *app* (blocking; run it in a background thread)."""
    settings = uvicorn.Config(app, host=host, port=port, log_level="warning")
    uvicorn.Server(settings).run()
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_server(state: SharedState, config: Any) -> threading.Thread:
    """Start the API server in a daemon thread and return that thread."""
    import time as _time

    worker = threading.Thread(
        target=run_uvicorn,
        args=(create_api(state), config.host, config.port),
        daemon=True,
    )
    worker.start()
    stamp = _time.strftime("%H:%M:%S")
    print(f"[voice_control {stamp}] Web interface at http://{config.host}:{config.port}", flush=True)
    return worker
|
||||||
95
dora_voice_control/dora_voice_control/config.py
Normal file
95
dora_voice_control/dora_voice_control/config.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""Configuration for the voice control node."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VoiceConfig:
    """Configuration for voice control.

    Built from environment variables by load_voice_config(). Distances are
    millimetres and angles degrees (see the TCP_OFFSET_MM / STEP_MM env vars).
    """

    # host/port are placeholders here (load_voice_config fills "" and 0);
    # the web server address lives in ApiConfig instead.
    host: str
    port: int
    # Z-offset from the TCP to the object surface, in mm.
    tcp_offset_mm: float
    # Safe approach distance above an object, in mm.
    approach_offset_mm: float
    # Distance moved per incremental up/down command, in mm.
    step_mm: float
    # Default tool orientation (degrees) used when composing target poses.
    default_roll: float
    default_pitch: float
    default_yaw: float
    # When True, commands are parsed but not sent to the robot (DRY_RUN).
    dry_run: bool
    # Optional per-axis workspace bounds; None means unconstrained on that axis.
    workspace_min: Tuple[Optional[float], Optional[float], Optional[float]]
    workspace_max: Tuple[Optional[float], Optional[float], Optional[float]]
    # Maps detector class names to the object names used in voice commands.
    class_map: Dict[str, str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ApiConfig:
    """Configuration for the web API server.

    Loaded from API_HOST / API_PORT / API_ENABLED by load_api_config().
    """

    # Bind address for the debug web interface.
    host: str
    # Listen port for the debug web interface.
    port: int
    # Whether the web interface should be started (API_ENABLED).
    enabled: bool
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_float_env(name: str) -> Optional[float]:
|
||||||
|
"""Parse an optional float from environment variable."""
|
||||||
|
raw = os.getenv(name)
|
||||||
|
if raw is None or raw == "":
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return float(raw)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_class_map(raw: str) -> Dict[str, str]:
|
||||||
|
"""Parse JSON class mapping from string."""
|
||||||
|
import json
|
||||||
|
|
||||||
|
if not raw:
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
data = json.loads(raw)
|
||||||
|
if isinstance(data, dict):
|
||||||
|
return {str(k): str(v) for k, v in data.items()}
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def load_voice_config() -> VoiceConfig:
    """Build a VoiceConfig from environment variables, with sane defaults."""
    truthy = ("true", "1", "yes")
    workspace_min = (
        _parse_float_env("WORKSPACE_MIN_X"),
        _parse_float_env("WORKSPACE_MIN_Y"),
        _parse_float_env("WORKSPACE_MIN_Z"),
    )
    workspace_max = (
        _parse_float_env("WORKSPACE_MAX_X"),
        _parse_float_env("WORKSPACE_MAX_Y"),
        _parse_float_env("WORKSPACE_MAX_Z"),
    )
    return VoiceConfig(
        # host/port are unused placeholders; the web server bind lives in ApiConfig.
        host="",
        port=0,
        tcp_offset_mm=float(os.getenv("TCP_OFFSET_MM", "63.0")),
        approach_offset_mm=float(os.getenv("APPROACH_OFFSET_MM", "50.0")),
        step_mm=float(os.getenv("STEP_MM", "20.0")),
        default_roll=float(os.getenv("DEFAULT_ROLL", "180.0")),
        default_pitch=float(os.getenv("DEFAULT_PITCH", "0.0")),
        default_yaw=float(os.getenv("DEFAULT_YAW", "0.0")),
        dry_run=os.getenv("DRY_RUN", "false").lower() in truthy,
        workspace_min=workspace_min,
        workspace_max=workspace_max,
        class_map=_parse_class_map(os.getenv("CLASS_MAP", "")),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_api_config() -> ApiConfig:
    """Build an ApiConfig from environment variables, with sane defaults."""
    enabled_flag = os.getenv("API_ENABLED", "true").lower()
    return ApiConfig(
        host=os.getenv("API_HOST", "0.0.0.0"),
        port=int(os.getenv("API_PORT", "8080")),
        enabled=enabled_flag in ("true", "1", "yes"),
    )
|
||||||
501
dora_voice_control/dora_voice_control/main.py
Normal file
501
dora_voice_control/dora_voice_control/main.py
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
"""Dora node for voice control with safe robot commands."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from collections import deque
|
||||||
|
from typing import Any, Deque, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import pyarrow as pa
|
||||||
|
from dora import Node
|
||||||
|
|
||||||
|
try:
|
||||||
|
import tomllib
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
import tomli as tomllib
|
||||||
|
|
||||||
|
# Handle both package and direct script execution
|
||||||
|
# __package__ is None when run as script, '' when imported from a script
|
||||||
|
_RUNNING_AS_SCRIPT = not __package__
|
||||||
|
|
||||||
|
if _RUNNING_AS_SCRIPT:
|
||||||
|
# Running as script - use absolute imports
|
||||||
|
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
if _pkg_dir not in sys.path:
|
||||||
|
sys.path.insert(0, _pkg_dir)
|
||||||
|
from config import VoiceConfig, load_api_config, load_voice_config
|
||||||
|
from parser import normalize, parse_command
|
||||||
|
from state import RobotStep, SharedState
|
||||||
|
from api import start_api_server
|
||||||
|
else:
|
||||||
|
# Running as package - use relative imports
|
||||||
|
from .config import VoiceConfig, load_api_config, load_voice_config
|
||||||
|
from .parser import normalize, parse_command
|
||||||
|
from .state import RobotStep, SharedState
|
||||||
|
from .api import start_api_server
|
||||||
|
|
||||||
|
|
||||||
|
def _within_bounds(
|
||||||
|
point_mm: np.ndarray,
|
||||||
|
min_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
|
||||||
|
max_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
|
||||||
|
) -> bool:
|
||||||
|
"""Check if point is within workspace bounds."""
|
||||||
|
x, y, z = point_mm.tolist()
|
||||||
|
min_x, min_y, min_z = min_xyz
|
||||||
|
max_x, max_y, max_z = max_xyz
|
||||||
|
if min_x is not None and x < min_x:
|
||||||
|
return False
|
||||||
|
if max_x is not None and x > max_x:
|
||||||
|
return False
|
||||||
|
if min_y is not None and y < min_y:
|
||||||
|
return False
|
||||||
|
if max_y is not None and y > max_y:
|
||||||
|
return False
|
||||||
|
if min_z is not None and z < min_z:
|
||||||
|
return False
|
||||||
|
if max_z is not None and z > max_z:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _translate_target(token: str, mapping: Dict[str, str]) -> str:
|
||||||
|
"""Translate object name using class map."""
|
||||||
|
if token in mapping:
|
||||||
|
return mapping[token]
|
||||||
|
return token
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config_file(path: str) -> Dict[str, Any]:
|
||||||
|
"""Load TOML configuration file."""
|
||||||
|
if not path or not os.path.exists(path):
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as handle:
|
||||||
|
return tomllib.load(handle)
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]:
    """Build static "box" objects from bucket positions in the config file.

    Each entry uses the configured (x, y) and a z taken from
    object_parameters.normal_height (default 220.0 mm).
    """
    cfg = _load_config_file(config_path)
    positions = cfg.get("bucket_positions", {})
    params = cfg.get("object_parameters", {})
    drop_z = float(params.get("normal_height", 220.0))

    bucket_colors = (
        ("blue_bucket_pos", "blue"),
        ("red_bucket_pos", "red"),
        ("yellow_bucket_pos", "yellow"),
        ("white_bucket_pos", "white"),
    )
    objects: List[Dict[str, Any]] = []
    for key, color in bucket_colors:
        pos = positions.get(key)
        # Skip entries that are missing or malformed (need at least x and y).
        if not isinstance(pos, list) or len(pos) < 2:
            continue
        objects.append(
            {
                "object_type": "box",
                "color": color,
                "size": "big",
                "position_mm": [float(pos[0]), float(pos[1]), drop_z],
                "source": "config",
            }
        )
    return objects
|
||||||
|
|
||||||
|
|
||||||
|
def _send_dora_command(
    node: Node, output_name: str, action: str, payload: Dict[str, Any]
) -> str:
    """Publish a robot command on *output_name*; return its generated UUID."""
    command_id = str(uuid.uuid4())
    body = json.dumps({"id": command_id, "action": action, "payload": payload})
    node.send_output(
        output_name,
        pa.array([body]),
        metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
    )
    return command_id
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Parse status payload from robot."""
|
||||||
|
if len(value) == 0:
|
||||||
|
return None
|
||||||
|
raw = value[0].as_py()
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return json.loads(raw)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _log(msg: str) -> None:
|
||||||
|
"""Print a timestamped log message."""
|
||||||
|
timestamp = time.strftime("%H:%M:%S")
|
||||||
|
print(f"[voice_control {timestamp}] {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Main entry point for the voice control node.

    Event-loop responsibilities, in order:

    * load voice/API configuration and Dora topic names from the environment,
    * optionally queue a vacuum-off + move-to-home sequence at startup,
    * convert voice transcripts into intents via ``command_handler`` (shared
      by the Dora voice input and the web API callback),
    * consume pose / status / object-detection / camera events, and
    * translate intents into ``RobotStep``s, sending exactly one queued step
      at a time and waiting for the robot's acknowledgement before the next.
    """
    _log("Starting voice control node...")

    # Load configuration
    cfg = load_voice_config()
    api_cfg = load_api_config()

    # Environment variables for I/O topics (defaults must match the dataflow YAML)
    objects_input = os.getenv("OBJECTS_INPUT", "objects")
    voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in")
    voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out")
    scene_output = os.getenv("SCENE_OUTPUT", "scene_update")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    status_input = os.getenv("STATUS_INPUT", "status")
    command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd")
    image_input = os.getenv("IMAGE_INPUT", "image_annotated")
    llm_provider = os.getenv("LLM_PROVIDER", "rules").lower()
    config_file = os.getenv("CONFIG_FILE", "config.toml")

    # Image dimensions used to reshape the flat camera buffer.
    # NOTE(review): despite the original "detected from first frame" comment,
    # no detection happens below -- frames are always reshaped to these values.
    image_width = int(os.getenv("IMAGE_WIDTH", "1280"))
    image_height = int(os.getenv("IMAGE_HEIGHT", "720"))

    # Initial/home position for the startup move and the "reiniciar" command
    # (presumably millimetres / degrees -- confirm against the robot node).
    init_x = float(os.getenv("INIT_X", "300.0"))
    init_y = float(os.getenv("INIT_Y", "0.0"))
    init_z = float(os.getenv("INIT_Z", "250.0"))
    init_roll = float(os.getenv("INIT_ROLL", "180.0"))
    init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
    init_yaw = float(os.getenv("INIT_YAW", "0.0"))

    _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm")
    _log(f"Initial position: [{init_x}, {init_y}, {init_z}]")
    _log(f"LLM provider: {llm_provider}")
    _log(f"Dry run: {cfg.dry_run}")

    # Initialize shared state (also read concurrently by the web API thread)
    shared_state = SharedState()
    state = shared_state.voice_state
    state.static_objects = _load_bucket_objects(config_file)
    # Intents parsed from voice but not yet turned into robot steps.
    pending_intents: Deque[Dict[str, Any]] = deque()

    _log(f"Loaded {len(state.static_objects)} static objects from config")

    # Queue initial position movement on startup (same steps as "reiniciar")
    init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes")
    send_init_scene_reset = init_on_start  # Flag to send scene reset after node starts
    if init_on_start:
        _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
        # Clear detected objects
        state.latest_objects = []
        state.latest_objects_at = None
        # Queue vacuum off and move to home
        state.queue.append(RobotStep(action="vacuum_off", payload={}))
        state.queue.append(
            RobotStep(
                action="move_to_pose",
                payload={
                    "x": init_x,
                    "y": init_y,
                    "z": init_z,
                    "roll": init_roll,
                    "pitch": init_pitch,
                    "yaw": init_yaw,
                },
            )
        )

    def command_handler(transcript: str) -> Dict[str, str]:
        """Parse *transcript* into an intent and enqueue it.

        Returns a short user-facing reply dict with "text" and "status"
        keys; the actual robot motion happens later in the event loop.
        """
        _log(f"Voice input received: \"{transcript}\"")
        llm_result = parse_command(transcript, llm_provider)
        _log(f"Parse result: {llm_result}")

        # Update debug state
        shared_state.update_voice_input(transcript, llm_result, time.monotonic())

        if llm_result.get("resultado") != "ok":
            _log("Command not understood")
            return {"text": "No entendi el comando", "status": "error"}

        action = llm_result.get("accion", "error")
        obj = llm_result.get("objeto", "no especificado")
        color = llm_result.get("color", "no especificado")
        size = llm_result.get("tamano", "no especificado")

        _log(f"Intent: action={action}, object={obj}, color={color}, size={size}")

        pending_intents.append(
            {"action": action, "obj": obj, "color": color, "size": size}
        )

        # Add to history
        shared_state.add_to_history({
            "timestamp": time.time(),
            "input": transcript,
            "action": action,
            "object": obj,
            "color": color,
            "size": size,
        })

        return {"text": f"Ok, voy a {action}", "status": "ok"}

    # Set command callback for web interface
    shared_state.set_command_callback(command_handler)

    # Start web API server if enabled
    if api_cfg.enabled:
        start_api_server(shared_state, api_cfg)

    # Create Dora node
    node = Node()
    _log("Dora node created, waiting for events...")

    first_event = True
    for event in node:
        # Send scene reset on first event (startup)
        if first_event and send_init_scene_reset:
            first_event = False
            scene_payload = json.dumps(
                {"objects": list(state.static_objects), "reset": True}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            _log("Sent initial scene reset notification")
        if event["type"] != "INPUT":
            continue

        # Handle voice input (transcript arriving over Dora)
        if event["id"] == voice_in_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            response = command_handler(raw)
            node.send_output(
                voice_out_output,
                pa.array([json.dumps(response)]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            continue

        # Handle pose updates (at least 6 values; first three treated as
        # x/y/z below -- presumably [x, y, z, roll, pitch, yaw])
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                state.latest_pose = tcp_pose[:6].tolist()
                state.latest_pose_at = time.monotonic()
            continue

        # Handle object detection updates (JSON payload with an "objects" list)
        if event["id"] == objects_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if raw:
                try:
                    payload = json.loads(raw)
                    objects = payload.get("objects", [])
                except Exception:
                    objects = []
                state.latest_objects = objects
                state.latest_objects_at = time.monotonic()
            # NOTE(review): this `continue` makes the scene-update emit for
            # objects_input further below unreachable.
            continue

        # Handle camera image
        if event["id"] == image_input:
            try:
                # Get raw image data
                img_data = event["value"].to_numpy()
                # Reshape to image (assuming BGR format)
                img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8)
                # Encode to JPEG for the web interface
                _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
                shared_state.update_image(jpeg_data.tobytes(), time.monotonic())
            except Exception as e:
                # Swallow malformed frames so one bad image cannot kill the
                # node. NOTE(review): `e` is unused and nothing is logged.
                pass
            continue

        # Handle robot status updates (command acknowledgements)
        if event["id"] == status_input:
            payload = _parse_status_payload(event["value"])
            if payload and state.pending_command:
                # Only clear the in-flight command when the ack matches it.
                if payload.get("command_id") == state.pending_command.get("id"):
                    _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})")
                    state.pending_command = None
            continue

        # Process pending intents (reached only by events not handled above,
        # e.g. a timer tick if the dataflow provides one)
        if pending_intents:
            intent = pending_intents.popleft()
            action = intent["action"]
            obj = intent["obj"]
            color = intent["color"]
            size = intent["size"]

            _log(f"Processing intent: {action} {obj} {color} {size}")

            latest_pose = state.latest_pose
            # Candidate targets: live detections first, then config buckets.
            objects = list(state.latest_objects) + list(state.static_objects)
            _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static")

            if action in ("subir", "bajar") and latest_pose:
                # Relative Z jog from the current pose.
                delta = cfg.step_mm if action == "subir" else -cfg.step_mm
                target = np.array(latest_pose[:3], dtype=np.float64)
                target[2] += delta
                if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                    step = RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": float(target[0]),
                            "y": float(target[1]),
                            "z": float(target[2]),
                            "roll": cfg.default_roll,
                            "pitch": cfg.default_pitch,
                            "yaw": cfg.default_yaw,
                        },
                    )
                    state.queue.append(step)
                    _log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})")
                else:
                    _log(f"Target {target.tolist()} out of bounds, skipping")

            elif action in ("ir", "tomar", "soltar"):
                # Find the first object matching type (and color/size if given).
                target_obj = None
                if obj != "no especificado":
                    target_name = _translate_target(obj, cfg.class_map)
                    target_color = _translate_target(color, cfg.class_map)
                    _log(f"Looking for: type={target_name}, color={target_color}")
                    # Log available objects for debugging
                    for o in objects:
                        _log(f"  -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}")
                    for o in objects:
                        if o.get("object_type") == target_name:
                            if color == "no especificado" or o.get("color") == target_color:
                                if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map):
                                    target_obj = o
                                    break
                if target_obj:
                    _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}")
                    pos = np.array(target_obj["position_mm"], dtype=np.float64)
                    # Approach above the object, then descend to the pick pose.
                    approach = pos.copy()
                    approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm
                    target = pos.copy()
                    target[2] += cfg.tcp_offset_mm
                    if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(approach[0]),
                                    "y": float(approach[1]),
                                    "z": float(approach[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: approach pose at Z={approach[2]:.1f}mm")
                    if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(target[0]),
                                    "y": float(target[1]),
                                    "z": float(target[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: target pose at Z={target[2]:.1f}mm")
                    if action == "tomar":
                        state.queue.append(RobotStep(action="vacuum_on", payload={}))
                        _log("Queued: vacuum_on")
                    elif action == "soltar":
                        state.queue.append(RobotStep(action="vacuum_off", payload={}))
                        _log("Queued: vacuum_off")
                else:
                    _log(f"Target object not found: {obj} {color}")
                    continue

            elif action == "reiniciar":
                _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
                # Turn off vacuum first
                state.queue.append(RobotStep(action="vacuum_off", payload={}))
                # Clear current detected objects (will be refreshed by detector)
                state.latest_objects = []
                state.latest_objects_at = None
                _log("Cleared detected objects - waiting for fresh detection")
                # Move to initial position
                state.queue.append(
                    RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": init_x,
                            "y": init_y,
                            "z": init_z,
                            "roll": init_roll,
                            "pitch": init_pitch,
                            "yaw": init_yaw,
                        },
                    )
                )
                _log(f"Queued: vacuum_off + move to home")
                # Send scene update to notify clients that scene was reset
                scene_payload = json.dumps(
                    {"objects": list(state.static_objects), "reset": True}
                )
                node.send_output(
                    scene_output,
                    pa.array([scene_payload]),
                    metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
                )
                _log("Sent scene reset notification")

            _log(f"Queue size: {len(state.queue)}")

        # Emit scene updates when objects change
        # NOTE(review): unreachable -- the objects_input branch above always
        # ends with `continue`, so event["id"] can never equal objects_input
        # here; scene updates are only emitted at startup and on "reiniciar".
        if event["id"] == objects_input:
            scene_payload = json.dumps(
                {"objects": list(state.latest_objects) + list(state.static_objects)}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )

        # Send queued robot steps one at a time
        if state.pending_command is None and state.queue:
            step = state.queue.popleft()
            if cfg.dry_run:
                _log(f"[DRY RUN] Would send: {step.action} {step.payload}")
                state.pending_command = None
                continue
            cmd_id = _send_dora_command(node, command_output, step.action, step.payload)
            state.pending_command = {"id": cmd_id, "action": step.action}
            _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}")

            # Update debug state
            shared_state.update_robot_command(
                {"id": cmd_id, "action": step.action, "payload": step.payload},
                time.monotonic(),
            )
|
||||||
|
|
||||||
|
|
||||||
|
# Run the node when executed as a script (e.g. launched by the Dora runtime).
if __name__ == "__main__":
    main()
|
||||||
38
dora_voice_control/dora_voice_control/models.py
Normal file
38
dora_voice_control/dora_voice_control/models.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
"""Pydantic models for the voice control API."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class CommandRequest(BaseModel):
    """Request to send a voice command."""

    # Transcript text to be interpreted as a voice command.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
class CommandResponse(BaseModel):
    """Response from a voice command."""

    # Whether the command was accepted.
    ok: bool
    # Human-readable reply text.
    text: str
    # Machine-readable outcome, e.g. "ok" or "error".
    status: str
|
||||||
|
|
||||||
|
|
||||||
|
class MoveRequest(BaseModel):
    """Request to move to a position."""

    # Target position (presumably TCP coordinates in millimetres -- confirm
    # against the robot node).
    x: float
    y: float
    z: float
    # Tool orientation; defaults match the node's home pose defaults
    # (roll=180, pitch=0, yaw=0).
    roll: Optional[float] = 180.0
    pitch: Optional[float] = 0.0
    yaw: Optional[float] = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class VacuumRequest(BaseModel):
    """Request to control the vacuum."""

    # True to switch the vacuum on, False to switch it off.
    on: bool
|
||||||
118
dora_voice_control/dora_voice_control/parser.py
Normal file
118
dora_voice_control/dora_voice_control/parser.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
"""Voice command parsing logic."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import unicodedata
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text: str) -> str:
    """Normalize text: lowercase, strip whitespace, and remove accents.

    Decomposes the text with NFKD and drops combining marks, so e.g.
    "Súbelo" becomes "subelo" and "pequeño" becomes "pequeno".
    """
    text = text.lower().strip()
    text = unicodedata.normalize("NFKD", text)
    return "".join(c for c in text if not unicodedata.combining(c))


# Keyword tables for the rule-based parser. Matching is deliberately done
# with substring tests (not word tokens) so Spanish imperatives with clitic
# suffixes still match, e.g. "subelo" contains "sube".
# Order matters: the first action whose keyword appears in the text wins.
_ACTION_KEYWORDS = [
    ("reiniciar", ("reiniciar", "reinicia", "reset")),
    ("subir", ("sube", "subir", "arriba")),
    ("bajar", ("baja", "bajar", "abajo")),
    ("soltar", ("soltar", "deja", "dejar")),
    ("tomar", ("tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar")),
    ("ir", ("ir", "ve", "mover", "muevete", "acercar")),
]
_COLORS = ("rojo", "azul", "amarillo", "blanco")
_OBJECTS = ("estrella", "cilindro", "cubo", "caja")


def rule_parse(transcript: str) -> Dict[str, str]:
    """Parse a voice command with a rule-based keyword matcher.

    Returns ``{"resultado": "error"}`` when no known action keyword is
    found; otherwise a dict with ``resultado``/``accion``/``objeto``/
    ``color``/``tamano`` keys, using "no especificado" for missing slots.
    """
    text = normalize(transcript)

    action = "error"
    for name, keywords in _ACTION_KEYWORDS:
        if any(w in text for w in keywords):
            action = name
            break

    if action == "error":
        return {"resultado": "error"}

    # First matching color/object wins, preserving the original priority
    # order of the if/elif chains.
    color = next((c for c in _COLORS if c in text), "no especificado")
    obj = next((o for o in _OBJECTS if o in text), "no especificado")

    size = "no especificado"
    if "grande" in text:
        size = "grande"
    elif any(w in text for w in ("pequeno", "pequena", "chico", "chica")):
        # normalize() strips the tilde, so "pequeño"/"pequeña" arrive here as
        # "pequeno"/"pequena" (the original's literal "pequeño" test was
        # unreachable, and the feminine forms were never matched).
        size = "pequeno"

    return {
        "resultado": "ok",
        "accion": action,
        "objeto": obj,
        "color": color,
        "tamano": size,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def build_gemini_prompt(transcript: str) -> str:
    """Build the Spanish prompt asking Gemini to convert *transcript* into a
    JSON intent with the fields 'resultado', 'accion', 'objeto', 'color' and
    'tamano'.

    The literal text below is part of the model contract (it enumerates the
    allowed actions, colors, sizes and objects) -- do not edit it casually.
    """
    return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar
un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color
o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase,
responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones
posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo',
'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella,
cilindro, cubo y caja; cualquier otro objeto es error.
Comando: "{transcript}"
Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de
los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos
'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto.
"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]:
    """Parse a voice command using the selected provider.

    With ``llm_provider == "gemini"`` the Gemini API is asked for a JSON
    intent, falling back to the rule-based parser whenever the SDK, the
    API key, or the request itself is unavailable. A Gemini reply that is
    not valid JSON yields ``{"resultado": "error"}``. Any other provider
    value uses the rule-based parser directly.
    """
    if llm_provider != "gemini":
        return rule_parse(transcript)

    try:
        from google import genai
        from google.genai import types
    except Exception:
        # SDK not installed -- degrade gracefully to the rule parser.
        return rule_parse(transcript)

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        return rule_parse(transcript)

    try:
        client = genai.Client(api_key=api_key)
        reply = client.models.generate_content(
            model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"),
            contents=build_gemini_prompt(transcript),
            config=types.GenerateContentConfig(temperature=0.5),
        )
        # Strip markdown code fences the model sometimes adds.
        cleaned = str(reply.text).replace("```json", "").replace("```", "")
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Gemini answered, but not with parseable JSON.
        return {"resultado": "error"}
    except Exception:
        return rule_parse(transcript)
|
||||||
158
dora_voice_control/dora_voice_control/state.py
Normal file
158
dora_voice_control/dora_voice_control/state.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
"""Shared state management for voice control node."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import threading
|
||||||
|
from collections import deque
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Deque, Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RobotStep:
    """A single step in the robot command queue."""

    # Command name understood by the robot node, e.g. "move_to_pose",
    # "vacuum_on", "vacuum_off".
    action: str
    # JSON-serializable arguments for the action (may be empty).
    payload: Dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VoiceState:
    """Runtime state for voice control."""

    # Last TCP pose reported by the robot (presumably
    # [x, y, z, roll, pitch, yaw] -- confirm against the robot node),
    # or None before the first report.
    latest_pose: Optional[List[float]] = None
    # time.monotonic() timestamp of the last pose update.
    latest_pose_at: Optional[float] = None
    # Most recent objects reported by the detector.
    latest_objects: List[Dict[str, Any]] = field(default_factory=list)
    # time.monotonic() timestamp of the last detection update.
    latest_objects_at: Optional[float] = None
    # Fixed objects loaded from configuration (e.g. bucket positions).
    static_objects: List[Dict[str, Any]] = field(default_factory=list)
    # Command currently awaiting acknowledgement from the robot, if any.
    pending_command: Optional[Dict[str, Any]] = None
    # Robot steps queued for sequential execution.
    queue: Deque[RobotStep] = field(default_factory=deque)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DebugState:
    """Debug information for the web interface."""

    # Most recent raw voice transcript and its time.monotonic() timestamp.
    last_voice_input: Optional[str] = None
    last_voice_input_at: Optional[float] = None
    # Parser output for the last transcript.
    last_parse_result: Optional[Dict[str, Any]] = None
    # Last command sent to the robot and its time.monotonic() timestamp.
    last_robot_command: Optional[Dict[str, Any]] = None
    last_robot_command_at: Optional[float] = None
    # Rolling histories; SharedState trims both to 100 entries.
    command_history: List[Dict[str, Any]] = field(default_factory=list)
    error_log: List[Dict[str, Any]] = field(default_factory=list)
    # Latest encoded camera frame and its time.monotonic() timestamp.
    latest_image: Optional[bytes] = None
    latest_image_at: Optional[float] = None
|
||||||
|
|
||||||
|
|
||||||
|
class SharedState:
    """Thread-safe shared state container.

    All access to the voice and debug state from the Dora loop and the
    web API threads goes through methods that hold a single lock.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self.voice_state = VoiceState()
        self.debug_state = DebugState()
        self._command_callback: Optional[Any] = None

    def set_command_callback(self, callback: Any) -> None:
        """Register the callable the web interface uses to submit commands."""
        with self._lock:
            self._command_callback = callback

    def get_command_callback(self) -> Optional[Any]:
        """Return the registered command callback, or None."""
        with self._lock:
            return self._command_callback

    def get_status(self) -> Dict[str, Any]:
        """Snapshot of pose/object/queue/voice state for the web interface."""
        with self._lock:
            voice = self.voice_state
            debug = self.debug_state
            return {
                "has_pose": voice.latest_pose is not None,
                "pose": voice.latest_pose,
                "pose_age_ms": _age_ms(voice.latest_pose_at),
                "object_count": len(voice.latest_objects),
                "static_object_count": len(voice.static_objects),
                "queue_size": len(voice.queue),
                "has_pending_command": voice.pending_command is not None,
                "pending_command": voice.pending_command,
                "last_voice_input": debug.last_voice_input,
                "last_voice_input_age_ms": _age_ms(debug.last_voice_input_at),
                "last_parse_result": debug.last_parse_result,
            }

    def get_objects(self) -> Dict[str, Any]:
        """Copies of the detected and static object lists."""
        with self._lock:
            voice = self.voice_state
            return {
                "detected": list(voice.latest_objects),
                "static": list(voice.static_objects),
            }

    def get_queue(self) -> List[Dict[str, Any]]:
        """The pending robot steps rendered as plain dicts."""
        with self._lock:
            steps = self.voice_state.queue
            return [{"action": step.action, "payload": step.payload} for step in steps]

    def get_history(self) -> List[Dict[str, Any]]:
        """Up to the 50 most recent command-history entries."""
        with self._lock:
            return self.debug_state.command_history[-50:]

    def get_errors(self) -> List[Dict[str, Any]]:
        """Up to the 50 most recent error-log entries."""
        with self._lock:
            return self.debug_state.error_log[-50:]

    def add_to_history(self, entry: Dict[str, Any]) -> None:
        """Append *entry* to the command history, keeping at most 100 items."""
        with self._lock:
            history = self.debug_state.command_history
            history.append(entry)
            if len(history) > 100:
                self.debug_state.command_history = history[-100:]

    def add_error(self, error: Dict[str, Any]) -> None:
        """Append *error* to the error log, keeping at most 100 items."""
        with self._lock:
            log = self.debug_state.error_log
            log.append(error)
            if len(log) > 100:
                self.debug_state.error_log = log[-100:]

    def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
        """Record the latest transcript and its parse result."""
        with self._lock:
            debug = self.debug_state
            debug.last_voice_input = text
            debug.last_voice_input_at = timestamp
            debug.last_parse_result = parse_result

    def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
        """Record the last command dispatched to the robot."""
        with self._lock:
            debug = self.debug_state
            debug.last_robot_command = command
            debug.last_robot_command_at = timestamp

    def update_image(self, image_bytes: bytes, timestamp: float) -> None:
        """Store the newest encoded camera frame."""
        with self._lock:
            debug = self.debug_state
            debug.latest_image = image_bytes
            debug.latest_image_at = timestamp

    def get_image(self) -> Optional[bytes]:
        """Return the newest encoded camera frame, if any."""
        with self._lock:
            return self.debug_state.latest_image

    def get_image_age_ms(self) -> Optional[int]:
        """Age of the newest camera frame in milliseconds, or None."""
        with self._lock:
            return _age_ms(self.debug_state.latest_image_at)
|
||||||
|
|
||||||
|
|
||||||
|
def _age_ms(timestamp: Optional[float]) -> Optional[int]:
|
||||||
|
"""Calculate age in milliseconds from monotonic timestamp."""
|
||||||
|
import time
|
||||||
|
|
||||||
|
if timestamp is None:
|
||||||
|
return None
|
||||||
|
return int((time.monotonic() - timestamp) * 1000)
|
||||||
700
dora_voice_control/dora_voice_control/templates.py
Normal file
700
dora_voice_control/dora_voice_control/templates.py
Normal file
@@ -0,0 +1,700 @@
|
|||||||
|
"""HTML templates for the voice control web interface."""
|
||||||
|
|
||||||
|
HTML_TEMPLATE = """<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Voice Control Debug</title>
|
||||||
|
<style>
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
body {
|
||||||
|
font-family: 'Segoe UI', system-ui, sans-serif;
|
||||||
|
background: #1a1a2e;
|
||||||
|
color: #eee;
|
||||||
|
min-height: 100vh;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
.header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
padding-bottom: 15px;
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
}
|
||||||
|
.header h1 { color: #00d4ff; font-size: 1.5em; }
|
||||||
|
.header .status {
|
||||||
|
margin-top: 8px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.status-dot {
|
||||||
|
display: inline-block;
|
||||||
|
width: 10px;
|
||||||
|
height: 10px;
|
||||||
|
border-radius: 50%;
|
||||||
|
margin-right: 6px;
|
||||||
|
}
|
||||||
|
.status-dot.ok { background: #00ff88; }
|
||||||
|
.status-dot.warn { background: #ffaa00; }
|
||||||
|
.status-dot.error { background: #ff4444; }
|
||||||
|
|
||||||
|
.grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
|
||||||
|
gap: 15px;
|
||||||
|
max-width: 1400px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
.card {
|
||||||
|
background: #16213e;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 15px;
|
||||||
|
border: 1px solid #0f3460;
|
||||||
|
}
|
||||||
|
.card h2 {
|
||||||
|
color: #00d4ff;
|
||||||
|
font-size: 1em;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
padding-bottom: 8px;
|
||||||
|
border-bottom: 1px solid #0f3460;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Command Input */
|
||||||
|
.command-form {
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
.command-form input {
|
||||||
|
flex: 1;
|
||||||
|
padding: 10px 12px;
|
||||||
|
border: 1px solid #0f3460;
|
||||||
|
border-radius: 6px;
|
||||||
|
background: #1a1a2e;
|
||||||
|
color: #fff;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
.command-form input:focus {
|
||||||
|
outline: none;
|
||||||
|
border-color: #00d4ff;
|
||||||
|
}
|
||||||
|
.btn {
|
||||||
|
padding: 10px 20px;
|
||||||
|
border: none;
|
||||||
|
border-radius: 6px;
|
||||||
|
cursor: pointer;
|
||||||
|
font-weight: 500;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
.btn-primary {
|
||||||
|
background: #00d4ff;
|
||||||
|
color: #000;
|
||||||
|
}
|
||||||
|
.btn-primary:hover { background: #00b8e0; }
|
||||||
|
.btn-primary:disabled { background: #555; color: #888; cursor: not-allowed; }
|
||||||
|
.btn-secondary {
|
||||||
|
background: #333;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.btn-secondary:hover { background: #444; }
|
||||||
|
.btn-danger {
|
||||||
|
background: #ff4444;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.btn-danger:hover { background: #cc3333; }
|
||||||
|
.btn-success {
|
||||||
|
background: #00ff88;
|
||||||
|
color: #000;
|
||||||
|
}
|
||||||
|
.btn-success:hover { background: #00cc6a; }
|
||||||
|
|
||||||
|
/* Quick Commands */
|
||||||
|
.quick-commands {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
.quick-btn {
|
||||||
|
padding: 8px 14px;
|
||||||
|
background: #0f3460;
|
||||||
|
border: 1px solid #1a4a7a;
|
||||||
|
border-radius: 20px;
|
||||||
|
color: #00d4ff;
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: 13px;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
.quick-btn:hover {
|
||||||
|
background: #1a4a7a;
|
||||||
|
border-color: #00d4ff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Status Grid */
|
||||||
|
.status-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
.status-item {
|
||||||
|
background: #1a1a2e;
|
||||||
|
padding: 10px;
|
||||||
|
border-radius: 6px;
|
||||||
|
}
|
||||||
|
.status-item .label {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #888;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
.status-item .value {
|
||||||
|
font-size: 14px;
|
||||||
|
font-weight: 500;
|
||||||
|
}
|
||||||
|
.status-item .value.ok { color: #00ff88; }
|
||||||
|
.status-item .value.warn { color: #ffaa00; }
|
||||||
|
.status-item .value.error { color: #ff4444; }
|
||||||
|
|
||||||
|
/* Pose Display */
|
||||||
|
.pose-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(3, 1fr);
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
.pose-item {
|
||||||
|
background: #1a1a2e;
|
||||||
|
padding: 10px;
|
||||||
|
border-radius: 6px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
.pose-item .label {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #888;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
.pose-item .value {
|
||||||
|
font-size: 16px;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #00d4ff;
|
||||||
|
font-family: monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Objects List */
|
||||||
|
.objects-list {
|
||||||
|
max-height: 300px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.object-item {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
padding: 8px 10px;
|
||||||
|
background: #1a1a2e;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
.object-item .type { color: #00d4ff; font-weight: 500; }
|
||||||
|
.object-item .color-badge {
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
.color-badge.red { background: #ff4444; color: #fff; }
|
||||||
|
.color-badge.blue { background: #4488ff; color: #fff; }
|
||||||
|
.color-badge.yellow { background: #ffcc00; color: #000; }
|
||||||
|
.color-badge.white { background: #fff; color: #000; }
|
||||||
|
.object-item .pos {
|
||||||
|
font-family: monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
color: #888;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Queue Display */
|
||||||
|
.queue-list {
|
||||||
|
max-height: 150px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.queue-item {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 8px 10px;
|
||||||
|
background: #1a1a2e;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
.queue-item .index {
|
||||||
|
background: #0f3460;
|
||||||
|
color: #00d4ff;
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
.queue-item .action { color: #00ff88; font-weight: 500; }
|
||||||
|
.queue-item.pending { border-left: 3px solid #ffaa00; }
|
||||||
|
|
||||||
|
/* Log Display */
|
||||||
|
.log {
|
||||||
|
max-height: 250px;
|
||||||
|
overflow-y: auto;
|
||||||
|
font-family: monospace;
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
.log-entry {
|
||||||
|
padding: 6px 10px;
|
||||||
|
border-bottom: 1px solid #0f3460;
|
||||||
|
}
|
||||||
|
.log-entry:last-child { border-bottom: none; }
|
||||||
|
.log-entry .time {
|
||||||
|
color: #666;
|
||||||
|
margin-right: 10px;
|
||||||
|
}
|
||||||
|
.log-entry.error { color: #ff4444; }
|
||||||
|
.log-entry.success { color: #00ff88; }
|
||||||
|
.log-entry.info { color: #00d4ff; }
|
||||||
|
|
||||||
|
/* Parse Result */
|
||||||
|
.parse-result {
|
||||||
|
background: #1a1a2e;
|
||||||
|
padding: 12px;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-family: monospace;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
.parse-result .field {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
padding: 4px 0;
|
||||||
|
border-bottom: 1px solid #0f3460;
|
||||||
|
}
|
||||||
|
.parse-result .field:last-child { border-bottom: none; }
|
||||||
|
.parse-result .key { color: #888; }
|
||||||
|
.parse-result .val { color: #00d4ff; }
|
||||||
|
|
||||||
|
/* Empty State */
|
||||||
|
.empty {
|
||||||
|
text-align: center;
|
||||||
|
color: #666;
|
||||||
|
padding: 20px;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Camera View */
|
||||||
|
.camera-container {
|
||||||
|
position: relative;
|
||||||
|
background: #000;
|
||||||
|
border-radius: 6px;
|
||||||
|
overflow: hidden;
|
||||||
|
min-height: 240px;
|
||||||
|
}
|
||||||
|
.camera-container img {
|
||||||
|
width: 100%;
|
||||||
|
height: auto;
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
.camera-overlay {
|
||||||
|
position: absolute;
|
||||||
|
top: 10px;
|
||||||
|
right: 10px;
|
||||||
|
background: rgba(0,0,0,0.6);
|
||||||
|
padding: 4px 8px;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
.camera-overlay.ok { color: #00ff88; }
|
||||||
|
.camera-overlay.stale { color: #ffaa00; }
|
||||||
|
.camera-overlay.error { color: #ff4444; }
|
||||||
|
.no-image {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
height: 240px;
|
||||||
|
color: #666;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="header">
|
||||||
|
<h1>Voice Control Debug Interface</h1>
|
||||||
|
<div class="status">
|
||||||
|
<span class="status-dot" id="status-dot"></span>
|
||||||
|
<span id="status-text">Connecting...</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid">
|
||||||
|
<!-- Camera View -->
|
||||||
|
<div class="card" style="grid-column: span 2;">
|
||||||
|
<h2>Camera View <span id="camera-status" style="font-weight:normal;font-size:11px;color:#888;"></span></h2>
|
||||||
|
<div class="camera-container" id="camera-container">
|
||||||
|
<div class="no-image" id="no-image">No camera image available</div>
|
||||||
|
<img id="camera-img" style="display:none;" alt="Camera feed">
|
||||||
|
<div class="camera-overlay" id="camera-overlay"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Command Input -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>Send Command</h2>
|
||||||
|
<form class="command-form" id="command-form">
|
||||||
|
<input type="text" id="command-input" placeholder="Enter voice command (e.g., 'sube', 'agarra el cubo rojo')" autocomplete="off">
|
||||||
|
<button type="submit" class="btn btn-primary" id="btn-send">Send</button>
|
||||||
|
</form>
|
||||||
|
<div class="quick-commands">
|
||||||
|
<button class="quick-btn" onclick="sendQuick('sube')">Sube</button>
|
||||||
|
<button class="quick-btn" onclick="sendQuick('baja')">Baja</button>
|
||||||
|
<button class="quick-btn" onclick="sendQuick('agarra el cubo rojo')">Cubo Rojo</button>
|
||||||
|
<button class="quick-btn" onclick="sendQuick('agarra el cubo azul')">Cubo Azul</button>
|
||||||
|
<button class="quick-btn" onclick="sendQuick('suelta')">Soltar</button>
|
||||||
|
<button class="quick-btn" onclick="sendQuick('reinicia')">Reiniciar</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Status -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>System Status</h2>
|
||||||
|
<div class="status-grid">
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Pose Available</div>
|
||||||
|
<div class="value" id="st-pose">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Pose Age</div>
|
||||||
|
<div class="value" id="st-pose-age">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Objects Detected</div>
|
||||||
|
<div class="value" id="st-objects">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Static Objects</div>
|
||||||
|
<div class="value" id="st-static">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Queue Size</div>
|
||||||
|
<div class="value" id="st-queue">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="status-item">
|
||||||
|
<div class="label">Pending Command</div>
|
||||||
|
<div class="value" id="st-pending">--</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- TCP Pose -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>TCP Pose</h2>
|
||||||
|
<div class="pose-grid">
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">X (mm)</div>
|
||||||
|
<div class="value" id="pose-x">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">Y (mm)</div>
|
||||||
|
<div class="value" id="pose-y">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">Z (mm)</div>
|
||||||
|
<div class="value" id="pose-z">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">Roll</div>
|
||||||
|
<div class="value" id="pose-roll">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">Pitch</div>
|
||||||
|
<div class="value" id="pose-pitch">--</div>
|
||||||
|
</div>
|
||||||
|
<div class="pose-item">
|
||||||
|
<div class="label">Yaw</div>
|
||||||
|
<div class="value" id="pose-yaw">--</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Last Parse Result -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>Last Parse Result</h2>
|
||||||
|
<div class="parse-result" id="parse-result">
|
||||||
|
<div class="empty">No command parsed yet</div>
|
||||||
|
</div>
|
||||||
|
<div style="margin-top: 10px; font-size: 12px; color: #888;">
|
||||||
|
<span>Last input: </span><span id="last-input">--</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Detected Objects -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>Detected Objects</h2>
|
||||||
|
<div class="objects-list" id="objects-list">
|
||||||
|
<div class="empty">No objects detected</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Command Queue -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>Command Queue</h2>
|
||||||
|
<div class="queue-list" id="queue-list">
|
||||||
|
<div class="empty">Queue is empty</div>
|
||||||
|
</div>
|
||||||
|
<div style="margin-top: 10px;">
|
||||||
|
<button class="btn btn-danger btn-sm" onclick="clearQueue()">Clear Queue</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Log -->
|
||||||
|
<div class="card" style="grid-column: span 2;">
|
||||||
|
<h2>Activity Log</h2>
|
||||||
|
<div class="log" id="log"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const $ = id => document.getElementById(id);
|
||||||
|
|
||||||
|
async function fetchJson(url, opts = {}) {
|
||||||
|
try {
|
||||||
|
const res = await fetch(url, opts);
|
||||||
|
return await res.json();
|
||||||
|
} catch (e) {
|
||||||
|
return { error: e.message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function log(msg, type = 'info') {
|
||||||
|
const logEl = $('log');
|
||||||
|
const time = new Date().toLocaleTimeString();
|
||||||
|
const entry = document.createElement('div');
|
||||||
|
entry.className = 'log-entry ' + type;
|
||||||
|
entry.innerHTML = '<span class="time">' + time + '</span>' + msg;
|
||||||
|
logEl.insertBefore(entry, logEl.firstChild);
|
||||||
|
if (logEl.children.length > 100) logEl.removeChild(logEl.lastChild);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function updateStatus() {
|
||||||
|
const data = await fetchJson('/api/status');
|
||||||
|
if (data.error) {
|
||||||
|
$('status-dot').className = 'status-dot error';
|
||||||
|
$('status-text').textContent = 'Error: ' + data.error;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
$('status-dot').className = 'status-dot ok';
|
||||||
|
$('status-text').textContent = 'Connected';
|
||||||
|
|
||||||
|
$('st-pose').textContent = data.has_pose ? 'Yes' : 'No';
|
||||||
|
$('st-pose').className = 'value ' + (data.has_pose ? 'ok' : 'warn');
|
||||||
|
|
||||||
|
$('st-pose-age').textContent = data.pose_age_ms !== null ? data.pose_age_ms + 'ms' : '--';
|
||||||
|
$('st-pose-age').className = 'value ' + (data.pose_age_ms < 1000 ? 'ok' : 'warn');
|
||||||
|
|
||||||
|
$('st-objects').textContent = data.object_count;
|
||||||
|
$('st-static').textContent = data.static_object_count;
|
||||||
|
$('st-queue').textContent = data.queue_size;
|
||||||
|
$('st-queue').className = 'value ' + (data.queue_size > 0 ? 'warn' : 'ok');
|
||||||
|
|
||||||
|
$('st-pending').textContent = data.has_pending_command ? 'Yes' : 'No';
|
||||||
|
$('st-pending').className = 'value ' + (data.has_pending_command ? 'warn' : 'ok');
|
||||||
|
|
||||||
|
// Update pose
|
||||||
|
if (data.pose) {
|
||||||
|
$('pose-x').textContent = data.pose[0].toFixed(1);
|
||||||
|
$('pose-y').textContent = data.pose[1].toFixed(1);
|
||||||
|
$('pose-z').textContent = data.pose[2].toFixed(1);
|
||||||
|
$('pose-roll').textContent = data.pose[3].toFixed(1);
|
||||||
|
$('pose-pitch').textContent = data.pose[4].toFixed(1);
|
||||||
|
$('pose-yaw').textContent = data.pose[5].toFixed(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update last input
|
||||||
|
$('last-input').textContent = data.last_voice_input || '--';
|
||||||
|
|
||||||
|
// Update parse result
|
||||||
|
if (data.last_parse_result) {
|
||||||
|
let html = '';
|
||||||
|
for (const [k, v] of Object.entries(data.last_parse_result)) {
|
||||||
|
html += '<div class="field"><span class="key">' + k + '</span><span class="val">' + v + '</span></div>';
|
||||||
|
}
|
||||||
|
$('parse-result').innerHTML = html;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function updateObjects() {
|
||||||
|
const data = await fetchJson('/api/objects');
|
||||||
|
if (data.error) return;
|
||||||
|
|
||||||
|
const list = $('objects-list');
|
||||||
|
const detected = data.detected || [];
|
||||||
|
const staticObjs = data.static || [];
|
||||||
|
|
||||||
|
if (detected.length === 0 && staticObjs.length === 0) {
|
||||||
|
list.innerHTML = '<div class="empty">No objects detected</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let html = '';
|
||||||
|
|
||||||
|
// Detected objects
|
||||||
|
if (detected.length > 0) {
|
||||||
|
html += '<div style="font-size:11px;color:#00d4ff;margin-bottom:6px;">Detected (' + detected.length + ')</div>';
|
||||||
|
html += detected.map(obj => {
|
||||||
|
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
|
||||||
|
const colorClass = obj.color || 'white';
|
||||||
|
const conf = obj.confidence ? (obj.confidence * 100).toFixed(0) + '%' : '';
|
||||||
|
const size = obj.size || '';
|
||||||
|
return '<div class="object-item">' +
|
||||||
|
'<span class="type">' + (obj.object_type || '?') + '</span>' +
|
||||||
|
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
|
||||||
|
'<span style="color:#888;font-size:10px;">' + size + '</span>' +
|
||||||
|
'<span style="color:#00ff88;font-size:10px;">' + conf + '</span>' +
|
||||||
|
'<span class="pos">[' + pos + ']</span>' +
|
||||||
|
'</div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Static objects
|
||||||
|
if (staticObjs.length > 0) {
|
||||||
|
html += '<div style="font-size:11px;color:#888;margin:8px 0 6px 0;">Static (' + staticObjs.length + ')</div>';
|
||||||
|
html += staticObjs.map(obj => {
|
||||||
|
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
|
||||||
|
const colorClass = obj.color || 'white';
|
||||||
|
return '<div class="object-item" style="opacity:0.7;">' +
|
||||||
|
'<span class="type">' + (obj.object_type || '?') + '</span>' +
|
||||||
|
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
|
||||||
|
'<span class="pos">[' + pos + ']</span>' +
|
||||||
|
'</div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
list.innerHTML = html;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function updateQueue() {
|
||||||
|
const data = await fetchJson('/api/queue');
|
||||||
|
if (data.error) return;
|
||||||
|
|
||||||
|
const list = $('queue-list');
|
||||||
|
if (!data.length) {
|
||||||
|
list.innerHTML = '<div class="empty">Queue is empty</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
list.innerHTML = data.map((item, i) => {
|
||||||
|
const payload = JSON.stringify(item.payload || {});
|
||||||
|
return '<div class="queue-item">' +
|
||||||
|
'<span class="index">' + (i + 1) + '</span>' +
|
||||||
|
'<span class="action">' + item.action + '</span>' +
|
||||||
|
'<span style="color:#888;font-size:11px">' + payload + '</span>' +
|
||||||
|
'</div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
async function sendCommand(text) {
|
||||||
|
if (!text.trim()) return;
|
||||||
|
|
||||||
|
$('btn-send').disabled = true;
|
||||||
|
log('Sending: "' + text + '"', 'info');
|
||||||
|
|
||||||
|
const res = await fetchJson('/api/command', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ text: text })
|
||||||
|
});
|
||||||
|
|
||||||
|
$('btn-send').disabled = false;
|
||||||
|
|
||||||
|
if (res.ok) {
|
||||||
|
log('Response: ' + res.text, 'success');
|
||||||
|
} else {
|
||||||
|
log('Error: ' + (res.text || res.detail || 'Unknown error'), 'error');
|
||||||
|
}
|
||||||
|
|
||||||
|
$('command-input').value = '';
|
||||||
|
updateStatus();
|
||||||
|
updateQueue();
|
||||||
|
}
|
||||||
|
|
||||||
|
function sendQuick(text) {
|
||||||
|
$('command-input').value = text;
|
||||||
|
sendCommand(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function clearQueue() {
|
||||||
|
log('Clearing queue...', 'info');
|
||||||
|
const res = await fetchJson('/api/queue/clear', { method: 'POST' });
|
||||||
|
if (res.ok) {
|
||||||
|
log('Queue cleared', 'success');
|
||||||
|
} else {
|
||||||
|
log('Failed to clear queue', 'error');
|
||||||
|
}
|
||||||
|
updateQueue();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Form submit
|
||||||
|
$('command-form').addEventListener('submit', e => {
|
||||||
|
e.preventDefault();
|
||||||
|
sendCommand($('command-input').value);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Camera update
|
||||||
|
let cameraErrorCount = 0;
|
||||||
|
async function updateCamera() {
|
||||||
|
const info = await fetchJson('/api/image/info');
|
||||||
|
const overlay = $('camera-overlay');
|
||||||
|
const img = $('camera-img');
|
||||||
|
const noImage = $('no-image');
|
||||||
|
const status = $('camera-status');
|
||||||
|
|
||||||
|
if (info.error || !info.has_image) {
|
||||||
|
cameraErrorCount++;
|
||||||
|
if (cameraErrorCount > 3) {
|
||||||
|
img.style.display = 'none';
|
||||||
|
noImage.style.display = 'flex';
|
||||||
|
overlay.textContent = '';
|
||||||
|
status.textContent = '(no feed)';
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cameraErrorCount = 0;
|
||||||
|
noImage.style.display = 'none';
|
||||||
|
img.style.display = 'block';
|
||||||
|
|
||||||
|
// Update image with cache-busting
|
||||||
|
const newSrc = '/api/image?t=' + Date.now();
|
||||||
|
if (img.src !== newSrc) {
|
||||||
|
img.src = newSrc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update overlay
|
||||||
|
const ageMs = info.age_ms || 0;
|
||||||
|
if (ageMs < 500) {
|
||||||
|
overlay.textContent = 'LIVE';
|
||||||
|
overlay.className = 'camera-overlay ok';
|
||||||
|
} else if (ageMs < 2000) {
|
||||||
|
overlay.textContent = ageMs + 'ms';
|
||||||
|
overlay.className = 'camera-overlay stale';
|
||||||
|
} else {
|
||||||
|
overlay.textContent = 'STALE ' + (ageMs/1000).toFixed(1) + 's';
|
||||||
|
overlay.className = 'camera-overlay error';
|
||||||
|
}
|
||||||
|
status.textContent = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Auto-refresh
|
||||||
|
setInterval(updateStatus, 500);
|
||||||
|
setInterval(updateObjects, 1000);
|
||||||
|
setInterval(updateQueue, 500);
|
||||||
|
setInterval(updateCamera, 100);
|
||||||
|
|
||||||
|
// Initial load
|
||||||
|
updateStatus();
|
||||||
|
updateObjects();
|
||||||
|
updateQueue();
|
||||||
|
updateCamera();
|
||||||
|
log('Interface loaded', 'info');
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
25
dora_voice_control/pyproject.toml
Normal file
25
dora_voice_control/pyproject.toml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
[project]
|
||||||
|
name = "dora-voice-control"
|
||||||
|
version = "0.1.0"
|
||||||
|
license = { text = "MIT" }
|
||||||
|
authors = [{ name = "Dora" }]
|
||||||
|
description = "Dora node for voice command control via WebSocket"
|
||||||
|
|
||||||
|
requires-python = ">=3.8"
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
"dora-rs >= 0.3.9",
|
||||||
|
"numpy < 2.0.0",
|
||||||
|
"pyarrow >= 12.0.0",
|
||||||
|
"websockets >= 12.0",
|
||||||
|
"fastapi >= 0.109.0",
|
||||||
|
"uvicorn >= 0.27.0",
|
||||||
|
"pydantic >= 2.0.0",
|
||||||
|
"opencv-python >= 4.8.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
llm = ["google-genai"]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
dora-voice-control = "dora_voice_control.main:main"
|
||||||
BIN
trained_models/yolo8n.pt
Normal file
BIN
trained_models/yolo8n.pt
Normal file
Binary file not shown.
Reference in New Issue
Block a user