Add voice control, working but need more work

2026-01-31 11:41:50 -03:00
parent 380c466170
commit b9798a2f46
21 changed files with 3101 additions and 0 deletions
--- a/dora_voice_control/README.md
+++ b/dora_voice_control/README.md
@@ -0,0 +1,211 @@
+# Dora Voice Control Node
+
+A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface.
+
+## Features
+
+- Spanish voice command parsing (rule-based or Gemini LLM)
+- Real-time web debug interface
+- Command queue management
+- Workspace bounds validation
+- Object detection integration
+
+## File Structure
+
+```
+dora_voice_control/
+├── __init__.py
+├── main.py        # Main Dora node entry point
+├── api.py         # FastAPI web server
+├── config.py      # Configuration management
+├── models.py      # Pydantic request/response models
+├── parser.py      # Voice command parsing logic
+├── state.py       # Shared state management
+└── templates.py   # HTML template for web interface
+```
+
+## Web Debug Interface
+
+Access the debug interface at `http://localhost:8080` (default).
+
+Features:
+- Real-time status monitoring (pose, objects, queue)
+- Send manual voice commands
+- Quick command buttons
+- View parse results
+- Command history
+- Clear queue
+
+## Inputs/Outputs
+
+| Input         | Type   | Description                              |
+|---------------|--------|------------------------------------------|
+| `voice_in`    | string | Text transcription of voice command      |
+| `tcp_pose`    | array  | Current robot pose [x, y, z, roll, pitch, yaw] |
+| `objects`     | JSON   | Detected objects from vision system      |
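+| `status`      | JSON   | Command execution status from robot      |
+| `image_annotated` | array | Raw camera frame (IMAGE_HEIGHT x IMAGE_WIDTH x 3), served as JPEG at `/api/image` |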
+
+| Output        | Type   | Description                              |
+|---------------|--------|------------------------------------------|
+| `robot_cmd`   | JSON   | Robot command with action and payload    |
+| `voice_out`   | JSON   | Response confirmation to user            |
+| `scene_update`| JSON   | Updated scene with all visible objects   |
+
+## Supported Commands (Spanish)
+
+| Command       | Action         | Example                        |
+|---------------|----------------|--------------------------------|
+| `subir`       | Move up        | "sube"                         |
+| `bajar`       | Move down      | "baja"                         |
+| `tomar`       | Grab object    | "agarra el cubo rojo"          |
+| `soltar`      | Release object | "suelta en la caja azul"       |
+| `ir`          | Go to object   | "ve al cilindro"               |
+| `reiniciar`   | Reset          | "reinicia"                     |
+
+## Environment Variables
+
+```bash
+# Web API Server
+API_ENABLED=true        # Enable/disable web interface
+API_HOST=0.0.0.0        # Bind address
+API_PORT=8080           # Listen port
+
+# TCP Parameters
+TCP_OFFSET_MM=63.0          # Z-offset to object surface
+APPROACH_OFFSET_MM=50.0     # Safe approach distance above object
+STEP_MM=20.0                # Distance for up/down increments
+
+# LLM Configuration (optional)
+LLM_PROVIDER=rules          # "rules" or "gemini"
+GOOGLE_API_KEY=your_key     # Required if using gemini
+GEMINI_MODEL=gemini-2.0-flash
+
+# Workspace Safety (optional)
+WORKSPACE_MIN_X=-300
+WORKSPACE_MAX_X=300
+WORKSPACE_MIN_Y=-300
+WORKSPACE_MAX_Y=300
+WORKSPACE_MIN_Z=0
+WORKSPACE_MAX_Z=500
+
+# Misc
+DRY_RUN=false               # When true, log robot commands instead of sending them
+```
+
+## Installation
+
+```bash
+cd dora_voice_control
+pip install -e .
+
+# With LLM support
+pip install -e ".[llm]"
+```
+
+## Testing
+
+### Web Interface
+
+```bash
+# Start the node (standalone for testing)
+python -m dora_voice_control.main
+
+# Open in browser
+open http://localhost:8080
+```
+
+### API Endpoints
+
+```bash
+# Get status
+curl http://localhost:8080/api/status
+
+# Get objects
+curl http://localhost:8080/api/objects
+
+# Get queue
+curl http://localhost:8080/api/queue
+
+# Send command
+curl -X POST http://localhost:8080/api/command \
+  -H "Content-Type: application/json" \
+  -d '{"text": "sube"}'
+
+# Clear queue
+curl -X POST http://localhost:8080/api/queue/clear
+```
+
+### Python Test
+
+```python
+from dora_voice_control.parser import rule_parse, normalize
+
+# Test command parsing
+text = "agarra el cubo rojo grande"
+result = rule_parse(normalize(text))
+print(result)
+# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'}
+```
+
+## Dora Dataflow Configuration
+
+```yaml
+nodes:
+  - id: voice_control
+    build: pip install -e ./dora_voice_control
+    path: dora_voice_control
+    inputs:
+      voice_in: iobridge/voice_in
+      tcp_pose: robot/tcp_pose
+      objects: detector/objects
+      status: robot/status
+    outputs:
+      - robot_cmd
+      - voice_out
+      - scene_update
+    env:
+      API_ENABLED: "true"
+      API_PORT: "8080"
+      DRY_RUN: "false"
+```
+
+## Message Examples
+
+### Input: voice_in
+```
+"sube"
+"agarra el cubo rojo"
+"suelta en la caja azul"
+```
+
+### Output: robot_cmd
+```json
+{
+  "id": "550e8400-e29b-41d4-a716-446655440000",
+  "action": "move_to_pose",
+  "payload": {
+    "x": 150.0,
+    "y": 200.0,
+    "z": 280.0,
+    "roll": 180.0,
+    "pitch": 0.0,
+    "yaw": 0.0
+  }
+}
+```
+
+### Output: voice_out
+```json
+{"text": "Ok, voy a subir", "status": "ok"}
+{"text": "No entendi el comando", "status": "error"}
+```
+
+## Dependencies
+
+- dora-rs >= 0.3.9
+- numpy < 2.0.0
+- pyarrow >= 12.0.0
+- fastapi >= 0.109.0
+- uvicorn >= 0.27.0
+- pydantic >= 2.0.0
+- google-genai (optional, for Gemini mode)
--- a/dora_voice_control/dora_voice_control/__init__.py
+++ b/dora_voice_control/dora_voice_control/__init__.py
@@ -0,0 +1 @@
+"""Dora voice control node package."""
--- a/dora_voice_control/dora_voice_control/api.py
+++ b/dora_voice_control/dora_voice_control/api.py
@@ -0,0 +1,162 @@
+"""FastAPI application for the voice control web interface."""
+
+from __future__ import annotations
+
+import os
+import sys
+import threading
+from typing import Any
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse, Response
+
+# Handle both package and direct script execution
+# __package__ is None when run as script, '' when imported from a script
+if not __package__:
+    _pkg_dir = os.path.dirname(os.path.abspath(__file__))
+    if _pkg_dir not in sys.path:
+        sys.path.insert(0, _pkg_dir)
+    from models import CommandRequest, CommandResponse
+    from state import SharedState
+    from templates import HTML_TEMPLATE
+else:
+    from .models import CommandRequest, CommandResponse
+    from .state import SharedState
+    from .templates import HTML_TEMPLATE
+
+
+def create_api(state: SharedState) -> FastAPI:
+    """Build the FastAPI app that exposes the voice control debug endpoints.
+
+    Args:
+        state: Shared state object the endpoints read from and write to.
+
+    Returns:
+        A FastAPI application serving the HTML page at ``/`` and the JSON
+        and image endpoints under ``/api``.
+
+    Every endpoint turns an unexpected exception into an HTTP 500 whose
+    ``detail`` is the exception text. The original exception is chained as
+    the cause (``raise ... from e``) so server-side tracebacks keep the root
+    error instead of reporting it as a failure inside the handler.
+    """
+    app = FastAPI(
+        title="Voice Control Debug API",
+        description="Debug interface for the voice control node",
+        version="0.1.0",
+    )
+
+    @app.get("/", response_class=HTMLResponse)
+    def index() -> str:
+        """Serve the web interface."""
+        return HTML_TEMPLATE
+
+    @app.get("/api/status")
+    def get_status() -> dict:
+        """Get current status."""
+        try:
+            return state.get_status()
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/objects")
+    def get_objects() -> dict:
+        """Get detected and static objects."""
+        try:
+            return state.get_objects()
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/queue")
+    def get_queue() -> list:
+        """Get the command queue."""
+        try:
+            return state.get_queue()
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.post("/api/queue/clear")
+    def clear_queue() -> dict:
+        """Clear the command queue."""
+        try:
+            # NOTE(review): this reaches into SharedState's private lock and
+            # nested queue; a public SharedState method would be cleaner.
+            with state._lock:
+                state.voice_state.queue.clear()
+            return {"ok": True}
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/history")
+    def get_history() -> list:
+        """Get command history."""
+        try:
+            return state.get_history()
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/errors")
+    def get_errors() -> list:
+        """Get error log."""
+        try:
+            return state.get_errors()
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.post("/api/command", response_model=CommandResponse)
+    def send_command(request: CommandRequest) -> CommandResponse:
+        """Run the registered command callback on the submitted text."""
+        try:
+            callback = state.get_command_callback()
+            if callback is None:
+                return CommandResponse(ok=False, text="No command handler available", status="error")
+
+            result = callback(request.text)
+            # A result without a "status" key is reported as an error.
+            return CommandResponse(
+                ok=result.get("status") == "ok",
+                text=result.get("text", ""),
+                status=result.get("status", "error"),
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/image")
+    def get_image() -> Response:
+        """Get the latest camera image as JPEG (204 when none is stored)."""
+        try:
+            image_data = state.get_image()
+            if image_data is None:
+                # No frame stored yet: answer 204 No Content with an empty body
+                return Response(
+                    content=b"",
+                    media_type="image/jpeg",
+                    status_code=204,
+                )
+            return Response(
+                content=image_data,
+                media_type="image/jpeg",
+                # Ask clients not to cache, so polling gets the newest frame
+                headers={"Cache-Control": "no-cache, no-store, must-revalidate"},
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/api/image/info")
+    def get_image_info() -> dict:
+        """Get image metadata."""
+        try:
+            return {
+                "has_image": state.get_image() is not None,
+                "age_ms": state.get_image_age_ms(),
+            }
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    return app
+
+
+def run_uvicorn(app: FastAPI, host: str, port: int) -> None:
+    """Serve ``app`` with uvicorn on ``host:port`` (meant for a worker thread)."""
+    uvicorn.Server(
+        uvicorn.Config(app, host=host, port=port, log_level="warning")
+    ).run()
+
+
+def start_api_server(state: SharedState, config: Any) -> threading.Thread:
+    """Launch the web API on a daemon thread and return that thread.
+
+    ``config`` must provide ``host`` and ``port`` attributes. The address is
+    printed as soon as the thread has been started.
+    """
+    import time as _time
+
+    worker = threading.Thread(
+        target=run_uvicorn,
+        kwargs={"app": create_api(state), "host": config.host, "port": config.port},
+        daemon=True,
+    )
+    worker.start()
+    now = _time.strftime("%H:%M:%S")
+    print(f"[voice_control {now}] Web interface at http://{config.host}:{config.port}", flush=True)
+    return worker
--- a/dora_voice_control/dora_voice_control/config.py
+++ b/dora_voice_control/dora_voice_control/config.py
@@ -0,0 +1,95 @@
+"""Configuration for the voice control node."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+
+@dataclass
+class VoiceConfig:
+    """Motion and safety settings for the voice control node."""
+
+    # Placeholders: load_voice_config() always fills these with "" and 0.
+    host: str
+    port: int
+    # Added to an object's Z to obtain the target Z of a pick/place/go-to.
+    tcp_offset_mm: float
+    # Extra height added on top of tcp_offset_mm for the approach pose.
+    approach_offset_mm: float
+    # Z change applied by a single "subir"/"bajar" command.
+    step_mm: float
+    # Orientation written into the move_to_pose payloads built from intents.
+    default_roll: float
+    default_pitch: float
+    default_yaw: float
+    # When true, queued robot steps are logged instead of being sent.
+    dry_run: bool
+    # Per-axis (x, y, z) workspace limits; None leaves that side unbounded.
+    workspace_min: Tuple[Optional[float], Optional[float], Optional[float]]
+    workspace_max: Tuple[Optional[float], Optional[float], Optional[float]]
+    # Translation table applied to parsed object/color/size tokens.
+    class_map: Dict[str, str]
+
+
+@dataclass
+class ApiConfig:
+    """Configuration for the web API server."""
+
+    # Bind address and port handed to uvicorn.
+    host: str
+    port: int
+    # When false, the node runs without starting the web server.
+    enabled: bool
+
+
+def _parse_float_env(name: str) -> Optional[float]:
+    """Read an optional float from the environment variable ``name``.
+
+    Returns:
+        The parsed value, or ``None`` when the variable is unset, blank, or
+        not a valid float.
+
+    An unparseable value emits a ``RuntimeWarning`` instead of being dropped
+    silently: this helper reads the workspace limits, and a typo would
+    otherwise disable a limit without leaving any trace.
+    """
+    import warnings
+
+    raw = os.getenv(name)
+    if raw is None or not raw.strip():
+        return None
+    try:
+        return float(raw)
+    except ValueError:
+        warnings.warn(
+            f"Ignoring {name}={raw!r}: not a valid float, treating it as unset",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return None
+
+
+def _parse_class_map(raw: str) -> Dict[str, str]:
+    """Decode a JSON object string into a ``str -> str`` mapping.
+
+    Empty input, invalid JSON, and JSON that is not an object all yield an
+    empty mapping.
+    """
+    import json
+
+    if not raw:
+        return {}
+    try:
+        decoded = json.loads(raw)
+    except Exception:
+        return {}
+    if not isinstance(decoded, dict):
+        return {}
+    return {str(key): str(value) for key, value in decoded.items()}
+
+
+def load_voice_config() -> VoiceConfig:
+    """Build a VoiceConfig from environment variables, using defaults if unset."""
+
+    def env_float(name: str, fallback: str) -> float:
+        return float(os.getenv(name, fallback))
+
+    tcp_offset = env_float("TCP_OFFSET_MM", "63.0")
+    approach_offset = env_float("APPROACH_OFFSET_MM", "50.0")
+    step = env_float("STEP_MM", "20.0")
+    roll = env_float("DEFAULT_ROLL", "180.0")
+    pitch = env_float("DEFAULT_PITCH", "0.0")
+    yaw = env_float("DEFAULT_YAW", "0.0")
+    dry_run = os.getenv("DRY_RUN", "false").lower() in ("true", "1", "yes")
+    lower_bounds = (
+        _parse_float_env("WORKSPACE_MIN_X"),
+        _parse_float_env("WORKSPACE_MIN_Y"),
+        _parse_float_env("WORKSPACE_MIN_Z"),
+    )
+    upper_bounds = (
+        _parse_float_env("WORKSPACE_MAX_X"),
+        _parse_float_env("WORKSPACE_MAX_Y"),
+        _parse_float_env("WORKSPACE_MAX_Z"),
+    )
+    class_map = _parse_class_map(os.getenv("CLASS_MAP", ""))
+
+    return VoiceConfig(
+        host="",
+        port=0,
+        tcp_offset_mm=tcp_offset,
+        approach_offset_mm=approach_offset,
+        step_mm=step,
+        default_roll=roll,
+        default_pitch=pitch,
+        default_yaw=yaw,
+        dry_run=dry_run,
+        workspace_min=lower_bounds,
+        workspace_max=upper_bounds,
+        class_map=class_map,
+    )
+
+
+def load_api_config() -> ApiConfig:
+    """Build an ApiConfig from API_HOST, API_PORT and API_ENABLED."""
+    host = os.getenv("API_HOST", "0.0.0.0")
+    port = int(os.getenv("API_PORT", "8080"))
+    enabled_flag = os.getenv("API_ENABLED", "true").lower()
+    return ApiConfig(host=host, port=port, enabled=enabled_flag in ("true", "1", "yes"))
--- a/dora_voice_control/dora_voice_control/main.py
+++ b/dora_voice_control/dora_voice_control/main.py
@@ -0,0 +1,501 @@
+"""Dora node for voice control with safe robot commands."""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import time
+import uuid
+from collections import deque
+from typing import Any, Deque, Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+try:
+    import tomllib
+except ModuleNotFoundError:
+    import tomli as tomllib
+
+# Handle both package and direct script execution
+# __package__ is None when run as script, '' when imported from a script
+_RUNNING_AS_SCRIPT = not __package__
+
+if _RUNNING_AS_SCRIPT:
+    # Running as script - use absolute imports
+    _pkg_dir = os.path.dirname(os.path.abspath(__file__))
+    if _pkg_dir not in sys.path:
+        sys.path.insert(0, _pkg_dir)
+    from config import VoiceConfig, load_api_config, load_voice_config
+    from parser import normalize, parse_command
+    from state import RobotStep, SharedState
+    from api import start_api_server
+else:
+    # Running as package - use relative imports
+    from .config import VoiceConfig, load_api_config, load_voice_config
+    from .parser import normalize, parse_command
+    from .state import RobotStep, SharedState
+    from .api import start_api_server
+
+
+def _within_bounds(
+    point_mm: np.ndarray,
+    min_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
+    max_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
+) -> bool:
+    """Return True when the 3-element point lies inside the optional limits.
+
+    A limit of ``None`` leaves that side of the axis unbounded; the limits
+    themselves count as inside.
+    """
+    x, y, z = point_mm.tolist()
+    low_x, low_y, low_z = min_xyz
+    high_x, high_y, high_z = max_xyz
+    per_axis = ((x, low_x, high_x), (y, low_y, high_y), (z, low_z, high_z))
+    for value, low, high in per_axis:
+        if low is not None and value < low:
+            return False
+        if high is not None and value > high:
+            return False
+    return True
+
+
+def _translate_target(token: str, mapping: Dict[str, str]) -> str:
+    """Return the class-map entry for ``token``, or ``token`` itself if unmapped."""
+    return mapping.get(token, token)
+
+
+def _load_config_file(path: str) -> Dict[str, Any]:
+    """Load a TOML configuration file.
+
+    Args:
+        path: Location of the TOML file; may be empty.
+
+    Returns:
+        The parsed document, or ``{}`` when ``path`` is empty, does not
+        exist, cannot be read, or is not valid TOML.
+
+    A file that exists but cannot be loaded is reported on stdout rather
+    than ignored silently, because callers cannot tell an empty result
+    caused by a broken file from one caused by a missing file.
+    """
+    if not path or not os.path.exists(path):
+        return {}
+    try:
+        with open(path, "rb") as handle:
+            return tomllib.load(handle)
+    except (OSError, ValueError) as exc:
+        # TOML syntax and text decoding errors are ValueError subclasses.
+        stamp = time.strftime("%H:%M:%S")
+        print(f"[voice_control {stamp}] Could not load config file {path!r}: {exc}", flush=True)
+        return {}
+
+
+def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]:
+    """Build the static bucket objects described in the config file.
+
+    Each bucket entry that is a list of at least two numbers becomes a big
+    "box" object of the matching color, placed at the configured x/y and at
+    the ``normal_height`` Z (220.0 when not configured).
+    """
+    file_cfg = _load_config_file(config_path)
+    bucket_cfg = file_cfg.get("bucket_positions", {})
+    height = float(file_cfg.get("object_parameters", {}).get("normal_height", 220.0))
+
+    key_by_color = {
+        "blue": "blue_bucket_pos",
+        "red": "red_bucket_pos",
+        "yellow": "yellow_bucket_pos",
+        "white": "white_bucket_pos",
+    }
+    found: List[Dict[str, Any]] = []
+    for bucket_color, cfg_key in key_by_color.items():
+        xy = bucket_cfg.get(cfg_key)
+        if isinstance(xy, list) and len(xy) >= 2:
+            found.append(
+                {
+                    "object_type": "box",
+                    "color": bucket_color,
+                    "size": "big",
+                    "position_mm": [float(xy[0]), float(xy[1]), height],
+                    "source": "config",
+                }
+            )
+    return found
+
+
+def _send_dora_command(
+    node: Node, output_name: str, action: str, payload: Dict[str, Any]
+) -> str:
+    """Publish one robot command as JSON on ``output_name``.
+
+    Returns the freshly generated UUID string stored in the message's
+    ``id`` field.
+    """
+    cmd_id = str(uuid.uuid4())
+    envelope = json.dumps({"id": cmd_id, "action": action, "payload": payload})
+    node.send_output(
+        output_name,
+        pa.array([envelope]),
+        metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
+    )
+    return cmd_id
+
+
+def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
+    """Decode the first element of a status message as JSON.
+
+    Returns ``None`` when the array is empty, its first element is empty,
+    or the text is not valid JSON.
+    """
+    text = value[0].as_py() if len(value) else None
+    if not text:
+        return None
+    try:
+        decoded = json.loads(text)
+    except Exception:
+        decoded = None
+    return decoded
+
+
+def _log(msg: str) -> None:
+    """Write ``msg`` to stdout with a ``[voice_control HH:MM:SS]`` prefix."""
+    print(f"[voice_control {time.strftime('%H:%M:%S')}] {msg}", flush=True)
+
+
+def main() -> None:
+    """Run the voice control node until the Dora event stream ends.
+
+    Loads configuration from environment variables, optionally starts the
+    web debug API on a background thread, then consumes Dora events:
+
+    * voice input is parsed into an intent and answered on the voice output;
+    * pose, object, image and status inputs refresh the shared state;
+    * after every input event, at most one pending intent is expanded into
+      queued robot steps, and the next queued step is sent once no command
+      is awaiting acknowledgement on the status input.
+
+    The input handlers deliberately fall through to the intent/dispatch
+    stage instead of skipping it, so the node makes progress on the inputs
+    it actually receives and does not depend on an extra, unnamed input.
+    """
+    _log("Starting voice control node...")
+
+    # Load configuration
+    cfg = load_voice_config()
+    api_cfg = load_api_config()
+
+    # Names of the Dora inputs/outputs, overridable through the environment
+    objects_input = os.getenv("OBJECTS_INPUT", "objects")
+    voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in")
+    voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out")
+    scene_output = os.getenv("SCENE_OUTPUT", "scene_update")
+    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
+    status_input = os.getenv("STATUS_INPUT", "status")
+    command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd")
+    image_input = os.getenv("IMAGE_INPUT", "image_annotated")
+    llm_provider = os.getenv("LLM_PROVIDER", "rules").lower()
+    config_file = os.getenv("CONFIG_FILE", "config.toml")
+
+    # Expected size of the raw frames on the image input. Frames are
+    # reshaped to (height, width, 3); a frame of any other size is dropped.
+    image_width = int(os.getenv("IMAGE_WIDTH", "1280"))
+    image_height = int(os.getenv("IMAGE_HEIGHT", "720"))
+
+    # Initial/home position for reset command
+    init_x = float(os.getenv("INIT_X", "300.0"))
+    init_y = float(os.getenv("INIT_Y", "0.0"))
+    init_z = float(os.getenv("INIT_Z", "250.0"))
+    init_roll = float(os.getenv("INIT_ROLL", "180.0"))
+    init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
+    init_yaw = float(os.getenv("INIT_YAW", "0.0"))
+    home_payload = {
+        "x": init_x,
+        "y": init_y,
+        "z": init_z,
+        "roll": init_roll,
+        "pitch": init_pitch,
+        "yaw": init_yaw,
+    }
+
+    _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm")
+    _log(f"Initial position: [{init_x}, {init_y}, {init_z}]")
+    _log(f"LLM provider: {llm_provider}")
+    _log(f"Dry run: {cfg.dry_run}")
+
+    # Initialize shared state
+    shared_state = SharedState()
+    state = shared_state.voice_state
+    state.static_objects = _load_bucket_objects(config_file)
+    # Filled by command_handler and drained by the event loop below.
+    # NOTE(review): command_handler is also registered as the web API
+    # callback, so appends presumably can come from the API thread - confirm.
+    pending_intents: Deque[Dict[str, Any]] = deque()
+
+    _log(f"Loaded {len(state.static_objects)} static objects from config")
+
+    def queue_home_steps() -> None:
+        """Queue vacuum-off followed by a move to the home pose."""
+        state.queue.append(RobotStep(action="vacuum_off", payload={}))
+        state.queue.append(RobotStep(action="move_to_pose", payload=dict(home_payload)))
+
+    def pose_step(xyz: np.ndarray) -> RobotStep:
+        """Build a move_to_pose step at ``xyz`` with the default orientation."""
+        return RobotStep(
+            action="move_to_pose",
+            payload={
+                "x": float(xyz[0]),
+                "y": float(xyz[1]),
+                "z": float(xyz[2]),
+                "roll": cfg.default_roll,
+                "pitch": cfg.default_pitch,
+                "yaw": cfg.default_yaw,
+            },
+        )
+
+    # Queue initial position movement on startup (same as reiniciar)
+    init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes")
+    send_init_scene_reset = init_on_start  # Flag to send scene reset after node starts
+    if init_on_start:
+        _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
+        # Clear detected objects
+        state.latest_objects = []
+        state.latest_objects_at = None
+        # Queue vacuum off and move to home
+        queue_home_steps()
+
+    def command_handler(transcript: str) -> Dict[str, str]:
+        """Parse a transcript, enqueue the intent and return the spoken reply."""
+        _log(f"Voice input received: \"{transcript}\"")
+        llm_result = parse_command(transcript, llm_provider)
+        _log(f"Parse result: {llm_result}")
+
+        # Update debug state
+        shared_state.update_voice_input(transcript, llm_result, time.monotonic())
+
+        if llm_result.get("resultado") != "ok":
+            _log("Command not understood")
+            return {"text": "No entendi el comando", "status": "error"}
+
+        action = llm_result.get("accion", "error")
+        obj = llm_result.get("objeto", "no especificado")
+        color = llm_result.get("color", "no especificado")
+        size = llm_result.get("tamano", "no especificado")
+
+        _log(f"Intent: action={action}, object={obj}, color={color}, size={size}")
+
+        pending_intents.append(
+            {"action": action, "obj": obj, "color": color, "size": size}
+        )
+
+        # Add to history
+        shared_state.add_to_history({
+            "timestamp": time.time(),
+            "input": transcript,
+            "action": action,
+            "object": obj,
+            "color": color,
+            "size": size,
+        })
+
+        return {"text": f"Ok, voy a {action}", "status": "ok"}
+
+    # Set command callback for web interface
+    shared_state.set_command_callback(command_handler)
+
+    # Start web API server if enabled
+    if api_cfg.enabled:
+        start_api_server(shared_state, api_cfg)
+
+    # Create Dora node
+    node = Node()
+    _log("Dora node created, waiting for events...")
+
+    def send_scene(objects: List[Dict[str, Any]], reset: bool = False) -> None:
+        """Publish the given objects on the scene output; flag resets."""
+        body: Dict[str, Any] = {"objects": list(objects)}
+        if reset:
+            body["reset"] = True
+        node.send_output(
+            scene_output,
+            pa.array([json.dumps(body)]),
+            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
+        )
+
+    def find_target(
+        objects: List[Dict[str, Any]], obj: str, color: str, size: str
+    ) -> Optional[Dict[str, Any]]:
+        """Return the first object matching type and, if given, color/size."""
+        if obj == "no especificado":
+            return None
+        target_name = _translate_target(obj, cfg.class_map)
+        target_color = _translate_target(color, cfg.class_map)
+        target_size = _translate_target(size, cfg.class_map)
+        _log(f"Looking for: type={target_name}, color={target_color}")
+        # Log available objects for debugging
+        for o in objects:
+            _log(f"  -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}")
+        for candidate in objects:
+            if candidate.get("object_type") != target_name:
+                continue
+            if color != "no especificado" and candidate.get("color") != target_color:
+                continue
+            if size != "no especificado" and candidate.get("size") != target_size:
+                continue
+            return candidate
+        return None
+
+    first_event = True
+    image_error_logged = False
+    for event in node:
+        # Send scene reset on first event (startup)
+        if first_event:
+            first_event = False
+            if send_init_scene_reset:
+                send_scene(state.static_objects, reset=True)
+                _log("Sent initial scene reset notification")
+        if event["type"] != "INPUT":
+            continue
+
+        event_id = event["id"]
+
+        if event_id == voice_in_input:
+            # Handle voice input
+            raw = event["value"][0].as_py() if len(event["value"]) else ""
+            if raw:
+                response = command_handler(raw)
+                node.send_output(
+                    voice_out_output,
+                    pa.array([json.dumps(response)]),
+                    metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
+                )
+
+        elif event_id == pose_input:
+            # Handle pose updates; messages with fewer than 6 values are ignored
+            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
+            if tcp_pose.size >= 6:
+                state.latest_pose = tcp_pose[:6].tolist()
+                state.latest_pose_at = time.monotonic()
+
+        elif event_id == objects_input:
+            # Handle object detection updates
+            raw = event["value"][0].as_py() if len(event["value"]) else ""
+            if raw:
+                try:
+                    objects = json.loads(raw).get("objects", [])
+                except Exception:
+                    objects = []
+                # Anything but a list cannot be merged with the static objects
+                if not isinstance(objects, list):
+                    objects = []
+                state.latest_objects = objects
+                state.latest_objects_at = time.monotonic()
+
+        elif event_id == image_input:
+            # Handle camera image
+            try:
+                # Reshape the raw data to an image (assuming BGR format)
+                img = (
+                    event["value"]
+                    .to_numpy()
+                    .reshape((image_height, image_width, 3))
+                    .astype(np.uint8)
+                )
+                encoded_ok, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
+                if not encoded_ok:
+                    raise ValueError("JPEG encoding failed")
+                shared_state.update_image(jpeg_data.tobytes(), time.monotonic())
+            except Exception as exc:
+                # Best effort: a bad frame must not stop the node. Log only
+                # the first failure so a mismatched stream cannot flood the log.
+                if not image_error_logged:
+                    image_error_logged = True
+                    _log(
+                        f"Dropping camera frame ({exc}); expected "
+                        f"{image_width}x{image_height}x3, further frame errors are not logged"
+                    )
+
+        elif event_id == status_input:
+            # Handle robot status updates: release the in-flight command once
+            # the robot reports the matching command id
+            payload = _parse_status_payload(event["value"])
+            if payload and state.pending_command:
+                if payload.get("command_id") == state.pending_command.get("id"):
+                    _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})")
+                    state.pending_command = None
+
+        # Process pending intents (one per event)
+        if pending_intents:
+            intent = pending_intents.popleft()
+            action = intent["action"]
+            obj = intent["obj"]
+            color = intent["color"]
+            size = intent["size"]
+
+            _log(f"Processing intent: {action} {obj} {color} {size}")
+
+            latest_pose = state.latest_pose
+            objects = list(state.latest_objects) + list(state.static_objects)
+            _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static")
+
+            if action in ("subir", "bajar"):
+                if not latest_pose:
+                    _log(f"Cannot {action}: no TCP pose received yet")
+                else:
+                    delta = cfg.step_mm if action == "subir" else -cfg.step_mm
+                    target = np.array(latest_pose[:3], dtype=np.float64)
+                    target[2] += delta
+                    if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
+                        state.queue.append(pose_step(target))
+                        _log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})")
+                    else:
+                        _log(f"Target {target.tolist()} out of bounds, skipping")
+
+            elif action in ("ir", "tomar", "soltar"):
+                target_obj = find_target(objects, obj, color, size)
+                position = target_obj.get("position_mm") if target_obj else None
+                if target_obj is None:
+                    _log(f"Target object not found: {obj} {color}")
+                elif not isinstance(position, (list, tuple)) or len(position) < 3:
+                    _log(f"Target object has no usable position_mm: {position}")
+                else:
+                    _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {position}")
+                    pos = np.array(position[:3], dtype=np.float64)
+                    approach = pos.copy()
+                    approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm
+                    target = pos.copy()
+                    target[2] += cfg.tcp_offset_mm
+                    approach_ok = _within_bounds(approach, cfg.workspace_min, cfg.workspace_max)
+                    target_ok = _within_bounds(target, cfg.workspace_min, cfg.workspace_max)
+                    if approach_ok and target_ok:
+                        state.queue.append(pose_step(approach))
+                        _log(f"Queued: approach pose at Z={approach[2]:.1f}mm")
+                        state.queue.append(pose_step(target))
+                        _log(f"Queued: target pose at Z={target[2]:.1f}mm")
+                        if action == "tomar":
+                            state.queue.append(RobotStep(action="vacuum_on", payload={}))
+                            _log("Queued: vacuum_on")
+                        elif action == "soltar":
+                            state.queue.append(RobotStep(action="vacuum_off", payload={}))
+                            _log("Queued: vacuum_off")
+                    else:
+                        # Drop the whole intent: switching the vacuum without
+                        # reaching the object would grab or release at the
+                        # robot's current pose instead.
+                        _log(
+                            f"Approach {approach.tolist()} or target {target.tolist()} "
+                            f"out of bounds, skipping {action}"
+                        )
+
+            elif action == "reiniciar":
+                _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
+                # Clear current detected objects (will be refreshed by detector)
+                state.latest_objects = []
+                state.latest_objects_at = None
+                _log("Cleared detected objects - waiting for fresh detection")
+                # Turn off vacuum, then move to initial position
+                queue_home_steps()
+                _log("Queued: vacuum_off + move to home")
+                # Send scene update to notify clients that scene was reset
+                send_scene(state.static_objects, reset=True)
+                _log("Sent scene reset notification")
+
+            else:
+                _log(f"Unsupported action: {action}")
+
+            _log(f"Queue size: {len(state.queue)}")
+
+        # Emit scene updates when objects change
+        if event_id == objects_input:
+            send_scene(list(state.latest_objects) + list(state.static_objects))
+
+        # Send queued robot steps one at a time
+        if state.pending_command is None and state.queue:
+            step = state.queue.popleft()
+            if cfg.dry_run:
+                # Nothing is sent, so there is no acknowledgement to wait for
+                _log(f"[DRY RUN] Would send: {step.action} {step.payload}")
+            else:
+                cmd_id = _send_dora_command(node, command_output, step.action, step.payload)
+                state.pending_command = {"id": cmd_id, "action": step.action}
+                _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}")
+
+                # Update debug state
+                shared_state.update_robot_command(
+                    {"id": cmd_id, "action": step.action, "payload": step.payload},
+                    time.monotonic(),
+                )
+
+
+# Run the node when this file is executed directly rather than imported.
+if __name__ == "__main__":
+    main()
--- a/dora_voice_control/dora_voice_control/models.py
+++ b/dora_voice_control/dora_voice_control/models.py
@@ -0,0 +1,38 @@
+"""Pydantic models for the voice control API."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class CommandRequest(BaseModel):
+    """Request to send a voice command."""
+
+    # The command as text, e.g. a phrase typed into the debug page.
+    text: str
+
+
+class CommandResponse(BaseModel):
+    """Response from a voice command."""
+
+    # True when the command was handled; the debug page logs a success entry
+    # when this is truthy and an error entry otherwise.
+    ok: bool
+    # Text shown to the user in the activity log.
+    text: str
+    # Status label; the set of values is defined by the API handler, which is
+    # not part of this module — TODO document once confirmed.
+    status: str
+
+
+class MoveRequest(BaseModel):
+    """Request to move to a position.
+
+    The position is required; the orientation falls back to roll=180,
+    pitch=0, yaw=0 when those fields are omitted.
+    """
+
+    # Target position. Presumably millimetres, matching the "X (mm)" labels on
+    # the debug page — TODO confirm against the handler that consumes this.
+    x: float
+    y: float
+    z: float
+    # Target orientation (presumably degrees — confirm). NOTE(review): because
+    # these are Optional, an explicit JSON null is accepted and arrives as
+    # None instead of the default; verify the consumer copes with None.
+    roll: Optional[float] = 180.0
+    pitch: Optional[float] = 0.0
+    yaw: Optional[float] = 0.0
+
+
+class VacuumRequest(BaseModel):
+    """Request to control the vacuum."""
+
+    # Desired vacuum state: True to switch it on, False to switch it off.
+    on: bool
--- a/dora_voice_control/dora_voice_control/parser.py
+++ b/dora_voice_control/dora_voice_control/parser.py
@@ -0,0 +1,118 @@
+"""Voice command parsing logic."""
+
+from __future__ import annotations
+
+import json
+import os
+import unicodedata
+from typing import Dict
+
+
+def normalize(text: str) -> str:
+    """Return *text* lowercased, trimmed and with accents stripped.
+
+    The string is decomposed with NFKD so every accented letter becomes a
+    base letter followed by combining marks, and the marks are then dropped
+    (for example "Súbe " becomes "sube" and "pequeño" becomes "pequeno").
+    """
+    decomposed = unicodedata.normalize("NFKD", text.lower().strip())
+    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+
+
+def rule_parse(transcript: str) -> Dict[str, str]:
+    """Parse a voice command with keyword rules.
+
+    The transcript is normalized (lowercase, no accents) and scanned for
+    keywords. Matching is by substring, so inflected forms such as "subelo"
+    or "sueltalo" are recognized; the flip side is that very short keywords
+    ("ir", "ve") can also match inside unrelated words.
+
+    Args:
+        transcript: Raw text of the spoken command.
+
+    Returns:
+        ``{"resultado": "error"}`` when no action keyword is present.
+        Otherwise a dict with "resultado" set to "ok" plus "accion",
+        "objeto", "color" and "tamano"; attributes that were not mentioned
+        carry the value "no especificado".
+    """
+    text = normalize(transcript)
+
+    # Groups are tried top to bottom and the first one with a keyword in the
+    # text wins, so the order encodes priority (e.g. "subir" is tested before
+    # the two-letter "ir" it contains).
+    action_keywords = (
+        ("reiniciar", ("reiniciar", "reinicia", "reset")),
+        ("subir", ("sube", "subir", "arriba")),
+        ("bajar", ("baja", "bajar", "abajo")),
+        # "suelta"/"suelte" are the imperative forms of "soltar"; the debug
+        # page's "Soltar" quick button sends exactly "suelta".
+        ("soltar", ("soltar", "suelta", "suelte", "deja", "dejar")),
+        (
+            "tomar",
+            ("tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar"),
+        ),
+        ("ir", ("ir", "ve", "mover", "muevete", "acercar")),
+    )
+    action = next(
+        (
+            name
+            for name, keywords in action_keywords
+            if any(keyword in text for keyword in keywords)
+        ),
+        None,
+    )
+    if action is None:
+        return {"resultado": "error"}
+
+    unspecified = "no especificado"
+    color = next(
+        (c for c in ("rojo", "azul", "amarillo", "blanco") if c in text),
+        unspecified,
+    )
+    obj = next(
+        (o for o in ("estrella", "cilindro", "cubo", "caja") if o in text),
+        unspecified,
+    )
+
+    # The text is already accent-free, so "pequeño" always appears here as
+    # "pequeno" and needs no separate check.
+    if "grande" in text:
+        size = "grande"
+    elif "pequeno" in text or "chico" in text:
+        size = "pequeno"
+    else:
+        size = unspecified
+
+    return {
+        "resultado": "ok",
+        "accion": action,
+        "objeto": obj,
+        "color": color,
+        "tamano": size,
+    }
+
+
+def build_gemini_prompt(transcript: str) -> str:
+    """Build the Spanish instruction prompt sent to Gemini for one transcript.
+
+    The prompt asks for a JSON object with the fields 'resultado', 'accion',
+    'objeto', 'color' and 'tamano', and lists the allowed actions, colours,
+    sizes and objects.
+
+    Args:
+        transcript: Transcribed command. It is interpolated verbatim, with no
+            escaping, between double quotes on the "Comando:" line.
+
+    Returns:
+        The prompt text. Continuation lines keep their source indentation
+        because the literal is not dedented.
+    """
+    return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar
+    un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color
+    o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase,
+    responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones
+    posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo',
+    'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella,
+    cilindro, cubo y caja; cualquier otro objeto es error.
+    Comando: "{transcript}"
+    Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de
+    los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos
+    'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto.
+    """
+
+
+def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]:
+    """Parse a voice command with the selected provider.
+
+    Args:
+        transcript: Raw text of the spoken command.
+        llm_provider: "gemini" to ask the Gemini API; any other value uses
+            the keyword rules in ``rule_parse``.
+
+    Returns:
+        The parsed command dict. With "gemini" the function falls back to
+        ``rule_parse`` when the SDK cannot be imported, when GOOGLE_API_KEY
+        is unset, or when the API call raises. A reply that is not valid
+        JSON, or is valid JSON but not an object, yields
+        ``{"resultado": "error"}``.
+    """
+    if llm_provider != "gemini":
+        return rule_parse(transcript)
+
+    try:
+        from google import genai
+        from google.genai import types
+    except Exception:
+        # Kept broad on purpose: the SDK is optional, and any failure while
+        # importing it should degrade to the rule-based parser.
+        return rule_parse(transcript)
+
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        return rule_parse(transcript)
+
+    try:
+        client = genai.Client(api_key=api_key)
+        reply = client.models.generate_content(
+            model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"),
+            contents=build_gemini_prompt(transcript),
+            config=types.GenerateContentConfig(temperature=0.5),
+        )
+        # The model may wrap its answer in a Markdown code fence; strip the
+        # fence markers before decoding.
+        raw = str(reply.text).replace("```json", "").replace("```", "")
+        parsed = json.loads(raw)
+    except json.JSONDecodeError:
+        return {"resultado": "error"}
+    except Exception:
+        return rule_parse(transcript)
+
+    # json.loads also accepts lists, strings and numbers; callers expect a
+    # mapping, so anything else is reported as an unparseable command.
+    if not isinstance(parsed, dict):
+        return {"resultado": "error"}
+    return parsed
--- a/dora_voice_control/dora_voice_control/state.py
+++ b/dora_voice_control/dora_voice_control/state.py
@@ -0,0 +1,158 @@
+"""Shared state management for voice control node."""
+
+from __future__ import annotations
+
+import threading
+from collections import deque
+from dataclasses import dataclass, field
+from typing import Any, Deque, Dict, List, Optional
+
+
+@dataclass
+class RobotStep:
+    """A single step in the robot command queue."""
+
+    # Name of the robot action; sent as the action of the outgoing command.
+    action: str
+    # Arguments for the action, passed along with it when the step is sent.
+    payload: Dict[str, Any]
+
+
+@dataclass
+class VoiceState:
+    """Runtime state for voice control."""
+
+    # Most recent robot pose, or None until one has been received. The debug
+    # page reads six components from it (x, y, z, roll, pitch, yaw).
+    latest_pose: Optional[List[float]] = None
+    # time.monotonic() reading taken when latest_pose was stored.
+    latest_pose_at: Optional[float] = None
+    # Objects from the latest detection message.
+    latest_objects: List[Dict[str, Any]] = field(default_factory=list)
+    # When latest_objects was stored (presumably monotonic, like
+    # latest_pose_at — confirm at the write site).
+    latest_objects_at: Optional[float] = None
+    # Fixed scene objects; emitted together with latest_objects in scene updates.
+    static_objects: List[Dict[str, Any]] = field(default_factory=list)
+    # The command currently in flight as {"id", "action"}, or None when idle.
+    # The next queued step is only sent while this is None.
+    pending_command: Optional[Dict[str, Any]] = None
+    # Steps waiting to be sent; consumed from the left, one at a time.
+    queue: Deque[RobotStep] = field(default_factory=deque)
+
+
+@dataclass
+class DebugState:
+    """Debug information for the web interface."""
+
+    # Text of the most recent voice input and the monotonic time it was stored.
+    last_voice_input: Optional[str] = None
+    last_voice_input_at: Optional[float] = None
+    # Parser output for last_voice_input.
+    last_parse_result: Optional[Dict[str, Any]] = None
+    # Most recent command sent to the robot ({"id", "action", "payload"}) and
+    # the monotonic time it was sent.
+    last_robot_command: Optional[Dict[str, Any]] = None
+    last_robot_command_at: Optional[float] = None
+    # Rolling logs; SharedState trims each to its newest 100 entries on append.
+    command_history: List[Dict[str, Any]] = field(default_factory=list)
+    error_log: List[Dict[str, Any]] = field(default_factory=list)
+    # Latest camera frame as raw bytes (encoding is decided by the producer,
+    # not visible here) and the monotonic time it was stored.
+    latest_image: Optional[bytes] = None
+    latest_image_at: Optional[float] = None
+
+
+class SharedState:
+    """Lock-guarded holder for the node's voice state and debug state.
+
+    Every method acquires one internal ``threading.Lock`` before reading or
+    writing. ``voice_state`` and ``debug_state`` are plain public attributes,
+    so access that goes around these methods is not covered by the lock.
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self.voice_state = VoiceState()
+        self.debug_state = DebugState()
+        self._command_callback: Optional[Any] = None
+
+    def set_command_callback(self, callback: Any) -> None:
+        """Store the callback used to send commands from the web interface."""
+        with self._lock:
+            self._command_callback = callback
+
+    def get_command_callback(self) -> Optional[Any]:
+        """Return the stored command callback, or ``None`` if none was set."""
+        with self._lock:
+            callback = self._command_callback
+        return callback
+
+    def get_status(self) -> Dict[str, Any]:
+        """Return a status snapshot (pose, counts, queue, last input) for the UI."""
+        with self._lock:
+            voice = self.voice_state
+            debug = self.debug_state
+            status: Dict[str, Any] = {
+                "has_pose": voice.latest_pose is not None,
+                "pose": voice.latest_pose,
+                "pose_age_ms": _age_ms(voice.latest_pose_at),
+                "object_count": len(voice.latest_objects),
+                "static_object_count": len(voice.static_objects),
+                "queue_size": len(voice.queue),
+                "has_pending_command": voice.pending_command is not None,
+                "pending_command": voice.pending_command,
+            }
+            status.update(
+                {
+                    "last_voice_input": debug.last_voice_input,
+                    "last_voice_input_age_ms": _age_ms(debug.last_voice_input_at),
+                    "last_parse_result": debug.last_parse_result,
+                }
+            )
+            return status
+
+    def get_objects(self) -> Dict[str, Any]:
+        """Return copies of the detected and static object lists."""
+        with self._lock:
+            voice = self.voice_state
+            detected = list(voice.latest_objects)
+            static = list(voice.static_objects)
+        return {"detected": detected, "static": static}
+
+    def get_queue(self) -> List[Dict[str, Any]]:
+        """Return the queued steps, in order, as action/payload dicts."""
+        with self._lock:
+            pending_steps: List[Dict[str, Any]] = []
+            for step in self.voice_state.queue:
+                pending_steps.append({"action": step.action, "payload": step.payload})
+            return pending_steps
+
+    def get_history(self) -> List[Dict[str, Any]]:
+        """Return the newest 50 command-history entries."""
+        with self._lock:
+            history = self.debug_state.command_history
+            return list(history[-50:])
+
+    def get_errors(self) -> List[Dict[str, Any]]:
+        """Return the newest 50 error-log entries."""
+        with self._lock:
+            errors = self.debug_state.error_log
+            return list(errors[-50:])
+
+    def add_to_history(self, entry: Dict[str, Any]) -> None:
+        """Append *entry* to the command history, keeping the newest 100."""
+        with self._lock:
+            debug = self.debug_state
+            debug.command_history.append(entry)
+            if len(debug.command_history) > 100:
+                debug.command_history = debug.command_history[-100:]
+
+    def add_error(self, error: Dict[str, Any]) -> None:
+        """Append *error* to the error log, keeping the newest 100."""
+        with self._lock:
+            debug = self.debug_state
+            debug.error_log.append(error)
+            if len(debug.error_log) > 100:
+                debug.error_log = debug.error_log[-100:]
+
+    def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
+        """Record the latest voice input, its parse result and its timestamp."""
+        with self._lock:
+            debug = self.debug_state
+            debug.last_voice_input = text
+            debug.last_voice_input_at = timestamp
+            debug.last_parse_result = parse_result
+
+    def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
+        """Record the latest command sent to the robot and its timestamp."""
+        with self._lock:
+            debug = self.debug_state
+            debug.last_robot_command = command
+            debug.last_robot_command_at = timestamp
+
+    def update_image(self, image_bytes: bytes, timestamp: float) -> None:
+        """Record the latest camera image and its timestamp."""
+        with self._lock:
+            debug = self.debug_state
+            debug.latest_image = image_bytes
+            debug.latest_image_at = timestamp
+
+    def get_image(self) -> Optional[bytes]:
+        """Return the latest camera image, or ``None`` if none was stored."""
+        with self._lock:
+            image = self.debug_state.latest_image
+        return image
+
+    def get_image_age_ms(self) -> Optional[int]:
+        """Return the age of the latest image in milliseconds, or ``None``."""
+        with self._lock:
+            stored_at = self.debug_state.latest_image_at
+            return _age_ms(stored_at)
+
+
+def _age_ms(timestamp: Optional[float]) -> Optional[int]:
+    """Return whole milliseconds elapsed since a ``time.monotonic()`` reading.
+
+    ``None`` in gives ``None`` out, so callers can pass a timestamp that has
+    not been set yet.
+    """
+    if timestamp is None:
+        return None
+
+    from time import monotonic
+
+    elapsed_seconds = monotonic() - timestamp
+    return int(elapsed_seconds * 1000)
--- a/dora_voice_control/dora_voice_control/templates.py
+++ b/dora_voice_control/dora_voice_control/templates.py
@@ -0,0 +1,700 @@
+"""HTML templates for the voice control web interface."""
+
+HTML_TEMPLATE = """<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Voice Control Debug</title>
+    <style>
+        * { box-sizing: border-box; margin: 0; padding: 0; }
+        body {
+            font-family: 'Segoe UI', system-ui, sans-serif;
+            background: #1a1a2e;
+            color: #eee;
+            min-height: 100vh;
+            padding: 20px;
+        }
+        .header {
+            text-align: center;
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 1px solid #333;
+        }
+        .header h1 { color: #00d4ff; font-size: 1.5em; }
+        .header .status {
+            margin-top: 8px;
+            font-size: 0.9em;
+        }
+        .status-dot {
+            display: inline-block;
+            width: 10px;
+            height: 10px;
+            border-radius: 50%;
+            margin-right: 6px;
+        }
+        .status-dot.ok { background: #00ff88; }
+        .status-dot.warn { background: #ffaa00; }
+        .status-dot.error { background: #ff4444; }
+
+        .grid {
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
+            gap: 15px;
+            max-width: 1400px;
+            margin: 0 auto;
+        }
+        .card {
+            background: #16213e;
+            border-radius: 8px;
+            padding: 15px;
+            border: 1px solid #0f3460;
+        }
+        .card h2 {
+            color: #00d4ff;
+            font-size: 1em;
+            margin-bottom: 12px;
+            padding-bottom: 8px;
+            border-bottom: 1px solid #0f3460;
+        }
+
+        /* Command Input */
+        .command-form {
+            display: flex;
+            gap: 10px;
+            margin-bottom: 15px;
+        }
+        .command-form input {
+            flex: 1;
+            padding: 10px 12px;
+            border: 1px solid #0f3460;
+            border-radius: 6px;
+            background: #1a1a2e;
+            color: #fff;
+            font-size: 14px;
+        }
+        .command-form input:focus {
+            outline: none;
+            border-color: #00d4ff;
+        }
+        .btn {
+            padding: 10px 20px;
+            border: none;
+            border-radius: 6px;
+            cursor: pointer;
+            font-weight: 500;
+            transition: all 0.2s;
+        }
+        .btn-primary {
+            background: #00d4ff;
+            color: #000;
+        }
+        .btn-primary:hover { background: #00b8e0; }
+        .btn-primary:disabled { background: #555; color: #888; cursor: not-allowed; }
+        .btn-secondary {
+            background: #333;
+            color: #fff;
+        }
+        .btn-secondary:hover { background: #444; }
+        .btn-danger {
+            background: #ff4444;
+            color: #fff;
+        }
+        .btn-danger:hover { background: #cc3333; }
+        .btn-success {
+            background: #00ff88;
+            color: #000;
+        }
+        .btn-success:hover { background: #00cc6a; }
+
+        /* Quick Commands */
+        .quick-commands {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+        }
+        .quick-btn {
+            padding: 8px 14px;
+            background: #0f3460;
+            border: 1px solid #1a4a7a;
+            border-radius: 20px;
+            color: #00d4ff;
+            cursor: pointer;
+            font-size: 13px;
+            transition: all 0.2s;
+        }
+        .quick-btn:hover {
+            background: #1a4a7a;
+            border-color: #00d4ff;
+        }
+
+        /* Status Grid */
+        .status-grid {
+            display: grid;
+            grid-template-columns: 1fr 1fr;
+            gap: 10px;
+        }
+        .status-item {
+            background: #1a1a2e;
+            padding: 10px;
+            border-radius: 6px;
+        }
+        .status-item .label {
+            font-size: 11px;
+            color: #888;
+            text-transform: uppercase;
+            margin-bottom: 4px;
+        }
+        .status-item .value {
+            font-size: 14px;
+            font-weight: 500;
+        }
+        .status-item .value.ok { color: #00ff88; }
+        .status-item .value.warn { color: #ffaa00; }
+        .status-item .value.error { color: #ff4444; }
+
+        /* Pose Display */
+        .pose-grid {
+            display: grid;
+            grid-template-columns: repeat(3, 1fr);
+            gap: 8px;
+        }
+        .pose-item {
+            background: #1a1a2e;
+            padding: 10px;
+            border-radius: 6px;
+            text-align: center;
+        }
+        .pose-item .label {
+            font-size: 11px;
+            color: #888;
+            margin-bottom: 4px;
+        }
+        .pose-item .value {
+            font-size: 16px;
+            font-weight: 600;
+            color: #00d4ff;
+            font-family: monospace;
+        }
+
+        /* Objects List */
+        .objects-list {
+            max-height: 300px;
+            overflow-y: auto;
+        }
+        .object-item {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            padding: 8px 10px;
+            background: #1a1a2e;
+            border-radius: 6px;
+            margin-bottom: 6px;
+            font-size: 13px;
+        }
+        .object-item .type { color: #00d4ff; font-weight: 500; }
+        .object-item .color-badge {
+            padding: 2px 8px;
+            border-radius: 10px;
+            font-size: 11px;
+        }
+        .color-badge.red { background: #ff4444; color: #fff; }
+        .color-badge.blue { background: #4488ff; color: #fff; }
+        .color-badge.yellow { background: #ffcc00; color: #000; }
+        .color-badge.white { background: #fff; color: #000; }
+        .object-item .pos {
+            font-family: monospace;
+            font-size: 11px;
+            color: #888;
+        }
+
+        /* Queue Display */
+        .queue-list {
+            max-height: 150px;
+            overflow-y: auto;
+        }
+        .queue-item {
+            display: flex;
+            align-items: center;
+            gap: 10px;
+            padding: 8px 10px;
+            background: #1a1a2e;
+            border-radius: 6px;
+            margin-bottom: 6px;
+            font-size: 13px;
+        }
+        .queue-item .index {
+            background: #0f3460;
+            color: #00d4ff;
+            padding: 2px 8px;
+            border-radius: 4px;
+            font-size: 11px;
+        }
+        .queue-item .action { color: #00ff88; font-weight: 500; }
+        .queue-item.pending { border-left: 3px solid #ffaa00; }
+
+        /* Log Display */
+        .log {
+            max-height: 250px;
+            overflow-y: auto;
+            font-family: monospace;
+            font-size: 12px;
+        }
+        .log-entry {
+            padding: 6px 10px;
+            border-bottom: 1px solid #0f3460;
+        }
+        .log-entry:last-child { border-bottom: none; }
+        .log-entry .time {
+            color: #666;
+            margin-right: 10px;
+        }
+        .log-entry.error { color: #ff4444; }
+        .log-entry.success { color: #00ff88; }
+        .log-entry.info { color: #00d4ff; }
+
+        /* Parse Result */
+        .parse-result {
+            background: #1a1a2e;
+            padding: 12px;
+            border-radius: 6px;
+            font-family: monospace;
+            font-size: 13px;
+        }
+        .parse-result .field {
+            display: flex;
+            justify-content: space-between;
+            padding: 4px 0;
+            border-bottom: 1px solid #0f3460;
+        }
+        .parse-result .field:last-child { border-bottom: none; }
+        .parse-result .key { color: #888; }
+        .parse-result .val { color: #00d4ff; }
+
+        /* Empty State */
+        .empty {
+            text-align: center;
+            color: #666;
+            padding: 20px;
+            font-style: italic;
+        }
+
+        /* Camera View */
+        .camera-container {
+            position: relative;
+            background: #000;
+            border-radius: 6px;
+            overflow: hidden;
+            min-height: 240px;
+        }
+        .camera-container img {
+            width: 100%;
+            height: auto;
+            display: block;
+        }
+        .camera-overlay {
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 11px;
+        }
+        .camera-overlay.ok { color: #00ff88; }
+        .camera-overlay.stale { color: #ffaa00; }
+        .camera-overlay.error { color: #ff4444; }
+        .no-image {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            height: 240px;
+            color: #666;
+            font-style: italic;
+        }
+    </style>
+</head>
+<body>
+    <div class="header">
+        <h1>Voice Control Debug Interface</h1>
+        <div class="status">
+            <span class="status-dot" id="status-dot"></span>
+            <span id="status-text">Connecting...</span>
+        </div>
+    </div>
+
+    <div class="grid">
+        <!-- Camera View -->
+        <div class="card" style="grid-column: span 2;">
+            <h2>Camera View <span id="camera-status" style="font-weight:normal;font-size:11px;color:#888;"></span></h2>
+            <div class="camera-container" id="camera-container">
+                <div class="no-image" id="no-image">No camera image available</div>
+                <img id="camera-img" style="display:none;" alt="Camera feed">
+                <div class="camera-overlay" id="camera-overlay"></div>
+            </div>
+        </div>
+
+        <!-- Command Input -->
+        <div class="card">
+            <h2>Send Command</h2>
+            <form class="command-form" id="command-form">
+                <input type="text" id="command-input" placeholder="Enter voice command (e.g., 'sube', 'agarra el cubo rojo')" autocomplete="off">
+                <button type="submit" class="btn btn-primary" id="btn-send">Send</button>
+            </form>
+            <div class="quick-commands">
+                <button class="quick-btn" onclick="sendQuick('sube')">Sube</button>
+                <button class="quick-btn" onclick="sendQuick('baja')">Baja</button>
+                <button class="quick-btn" onclick="sendQuick('agarra el cubo rojo')">Cubo Rojo</button>
+                <button class="quick-btn" onclick="sendQuick('agarra el cubo azul')">Cubo Azul</button>
+                <button class="quick-btn" onclick="sendQuick('suelta')">Soltar</button>
+                <button class="quick-btn" onclick="sendQuick('reinicia')">Reiniciar</button>
+            </div>
+        </div>
+
+        <!-- Status -->
+        <div class="card">
+            <h2>System Status</h2>
+            <div class="status-grid">
+                <div class="status-item">
+                    <div class="label">Pose Available</div>
+                    <div class="value" id="st-pose">--</div>
+                </div>
+                <div class="status-item">
+                    <div class="label">Pose Age</div>
+                    <div class="value" id="st-pose-age">--</div>
+                </div>
+                <div class="status-item">
+                    <div class="label">Objects Detected</div>
+                    <div class="value" id="st-objects">--</div>
+                </div>
+                <div class="status-item">
+                    <div class="label">Static Objects</div>
+                    <div class="value" id="st-static">--</div>
+                </div>
+                <div class="status-item">
+                    <div class="label">Queue Size</div>
+                    <div class="value" id="st-queue">--</div>
+                </div>
+                <div class="status-item">
+                    <div class="label">Pending Command</div>
+                    <div class="value" id="st-pending">--</div>
+                </div>
+            </div>
+        </div>
+
+        <!-- TCP Pose -->
+        <div class="card">
+            <h2>TCP Pose</h2>
+            <div class="pose-grid">
+                <div class="pose-item">
+                    <div class="label">X (mm)</div>
+                    <div class="value" id="pose-x">--</div>
+                </div>
+                <div class="pose-item">
+                    <div class="label">Y (mm)</div>
+                    <div class="value" id="pose-y">--</div>
+                </div>
+                <div class="pose-item">
+                    <div class="label">Z (mm)</div>
+                    <div class="value" id="pose-z">--</div>
+                </div>
+                <div class="pose-item">
+                    <div class="label">Roll</div>
+                    <div class="value" id="pose-roll">--</div>
+                </div>
+                <div class="pose-item">
+                    <div class="label">Pitch</div>
+                    <div class="value" id="pose-pitch">--</div>
+                </div>
+                <div class="pose-item">
+                    <div class="label">Yaw</div>
+                    <div class="value" id="pose-yaw">--</div>
+                </div>
+            </div>
+        </div>
+
+        <!-- Last Parse Result -->
+        <div class="card">
+            <h2>Last Parse Result</h2>
+            <div class="parse-result" id="parse-result">
+                <div class="empty">No command parsed yet</div>
+            </div>
+            <div style="margin-top: 10px; font-size: 12px; color: #888;">
+                <span>Last input: </span><span id="last-input">--</span>
+            </div>
+        </div>
+
+        <!-- Detected Objects -->
+        <div class="card">
+            <h2>Detected Objects</h2>
+            <div class="objects-list" id="objects-list">
+                <div class="empty">No objects detected</div>
+            </div>
+        </div>
+
+        <!-- Command Queue -->
+        <div class="card">
+            <h2>Command Queue</h2>
+            <div class="queue-list" id="queue-list">
+                <div class="empty">Queue is empty</div>
+            </div>
+            <div style="margin-top: 10px;">
+                <button class="btn btn-danger btn-sm" onclick="clearQueue()">Clear Queue</button>
+            </div>
+        </div>
+
+        <!-- Log -->
+        <div class="card" style="grid-column: span 2;">
+            <h2>Activity Log</h2>
+            <div class="log" id="log"></div>
+        </div>
+    </div>
+
+    <script>
+        const $ = id => document.getElementById(id);
+
+        // Fetch `url` and decode the response body as JSON. Never rejects: a
+        // network or decoding failure is returned as { error: <message> }, so
+        // callers branch on `.error` instead of using try/catch. The HTTP status
+        // is not inspected; an error response with a JSON body is returned as-is.
+        async function fetchJson(url, opts = {}) {
+            try {
+                const response = await fetch(url, opts);
+                const body = await response.json();
+                return body;
+            } catch (err) {
+                return { error: err.message };
+            }
+        }
+
+        // Prepend a timestamped entry to the activity log, keeping at most 100.
+        // `msg` is inserted as plain text, not HTML, because callers pass
+        // user-typed commands and server responses through it.
+        // `type` becomes a CSS class on the entry: 'info', 'success' or 'error'.
+        function log(msg, type = 'info') {
+            const logEl = $('log');
+            const entry = document.createElement('div');
+            entry.className = 'log-entry ' + type;
+
+            const stamp = document.createElement('span');
+            stamp.className = 'time';
+            stamp.textContent = new Date().toLocaleTimeString();
+            entry.appendChild(stamp);
+            entry.appendChild(document.createTextNode(String(msg)));
+
+            logEl.insertBefore(entry, logEl.firstChild);
+            if (logEl.children.length > 100) logEl.removeChild(logEl.lastChild);
+        }
+
+        // Poll /api/status and refresh the header indicator, the status tiles,
+        // the TCP pose read-out, the last input and the last parse result.
+        async function updateStatus() {
+            const data = await fetchJson('/api/status');
+            if (data.error) {
+                $('status-dot').className = 'status-dot error';
+                $('status-text').textContent = 'Error: ' + data.error;
+                return;
+            }
+
+            $('status-dot').className = 'status-dot ok';
+            $('status-text').textContent = 'Connected';
+
+            $('st-pose').textContent = data.has_pose ? 'Yes' : 'No';
+            $('st-pose').className = 'value ' + (data.has_pose ? 'ok' : 'warn');
+
+            // The age is null until a pose arrives. `null < 1000` is true in
+            // JavaScript, so a missing age must be ruled out before comparing or
+            // the tile would be coloured as fresh.
+            const hasAge = data.pose_age_ms != null;
+            $('st-pose-age').textContent = hasAge ? data.pose_age_ms + 'ms' : '--';
+            $('st-pose-age').className = 'value ' + (hasAge && data.pose_age_ms < 1000 ? 'ok' : 'warn');
+
+            $('st-objects').textContent = data.object_count;
+            $('st-static').textContent = data.static_object_count;
+            $('st-queue').textContent = data.queue_size;
+            $('st-queue').className = 'value ' + (data.queue_size > 0 ? 'warn' : 'ok');
+
+            $('st-pending').textContent = data.has_pending_command ? 'Yes' : 'No';
+            $('st-pending').className = 'value ' + (data.has_pending_command ? 'warn' : 'ok');
+
+            // Update pose. Six components are read (x, y, z, roll, pitch, yaw);
+            // a shorter array is skipped instead of throwing part-way through.
+            if (Array.isArray(data.pose) && data.pose.length >= 6) {
+                const poseIds = ['pose-x', 'pose-y', 'pose-z', 'pose-roll', 'pose-pitch', 'pose-yaw'];
+                poseIds.forEach((id, i) => {
+                    $(id).textContent = Number(data.pose[i]).toFixed(1);
+                });
+            }
+
+            // Update last input
+            $('last-input').textContent = data.last_voice_input || '--';
+
+            // Update parse result. Keys and values originate from the parsed
+            // voice command, so they are written as text nodes, never as HTML.
+            if (data.last_parse_result) {
+                const box = $('parse-result');
+                box.textContent = '';
+                for (const [k, v] of Object.entries(data.last_parse_result)) {
+                    const row = document.createElement('div');
+                    row.className = 'field';
+
+                    const keyEl = document.createElement('span');
+                    keyEl.className = 'key';
+                    keyEl.textContent = String(k);
+
+                    const valEl = document.createElement('span');
+                    valEl.className = 'val';
+                    valEl.textContent = String(v);
+
+                    row.appendChild(keyEl);
+                    row.appendChild(valEl);
+                    box.appendChild(row);
+                }
+            }
+        }
+
+        // Poll /api/objects and render the detected and static object lists.
+        // Every field taken from the response is HTML-escaped (or converted to a
+        // number) before it is placed in markup.
+        async function updateObjects() {
+            const data = await fetchJson('/api/objects');
+            if (data.error) return;
+
+            const list = $('objects-list');
+            const detected = data.detected || [];
+            const staticObjs = data.static || [];
+
+            if (detected.length === 0 && staticObjs.length === 0) {
+                list.innerHTML = '<div class="empty">No objects detected</div>';
+                return;
+            }
+
+            // Replace the five HTML-significant characters with numeric entities.
+            const esc = s => String(s).replace(/[&<>"']/g, c => '&#' + c.charCodeAt(0) + ';');
+            // Position as rounded numbers, or '--' when the object has none.
+            const fmtPos = obj => Array.isArray(obj.position_mm)
+                ? obj.position_mm.map(v => Number(v).toFixed(0)).join(', ')
+                : '--';
+
+            let html = '';
+
+            // Detected objects
+            if (detected.length > 0) {
+                html += '<div style="font-size:11px;color:#00d4ff;margin-bottom:6px;">Detected (' + detected.length + ')</div>';
+                html += detected.map(obj => {
+                    const colorClass = esc(obj.color || 'white');
+                    const conf = obj.confidence ? (obj.confidence * 100).toFixed(0) + '%' : '';
+                    const size = esc(obj.size || '');
+                    return '<div class="object-item">' +
+                        '<span class="type">' + esc(obj.object_type || '?') + '</span>' +
+                        '<span class="color-badge ' + colorClass + '">' + esc(obj.color || '?') + '</span>' +
+                        '<span style="color:#888;font-size:10px;">' + size + '</span>' +
+                        '<span style="color:#00ff88;font-size:10px;">' + conf + '</span>' +
+                        '<span class="pos">[' + fmtPos(obj) + ']</span>' +
+                        '</div>';
+                }).join('');
+            }
+
+            // Static objects
+            if (staticObjs.length > 0) {
+                html += '<div style="font-size:11px;color:#888;margin:8px 0 6px 0;">Static (' + staticObjs.length + ')</div>';
+                html += staticObjs.map(obj => {
+                    const colorClass = esc(obj.color || 'white');
+                    return '<div class="object-item" style="opacity:0.7;">' +
+                        '<span class="type">' + esc(obj.object_type || '?') + '</span>' +
+                        '<span class="color-badge ' + colorClass + '">' + esc(obj.color || '?') + '</span>' +
+                        '<span class="pos">[' + fmtPos(obj) + ']</span>' +
+                        '</div>';
+                }).join('');
+            }
+
+            list.innerHTML = html;
+        }
+
+        async function updateQueue() {
+            const data = await fetchJson('/api/queue');
+            if (data.error) return;
+
+            const list = $('queue-list');
+            if (!data.length) {
+                list.innerHTML = '<div class="empty">Queue is empty</div>';
+                return;
+            }
+
+            list.innerHTML = data.map((item, i) => {
+                const payload = JSON.stringify(item.payload || {});
+                return '<div class="queue-item">' +
+                    '<span class="index">' + (i + 1) + '</span>' +
+                    '<span class="action">' + item.action + '</span>' +
+                    '<span style="color:#888;font-size:11px">' + payload + '</span>' +
+                    '</div>';
+            }).join('');
+        }
+
+        async function sendCommand(text) {
+            if (!text.trim()) return;
+
+            $('btn-send').disabled = true;
+            log('Sending: "' + text + '"', 'info');
+
+            const res = await fetchJson('/api/command', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ text: text })
+            });
+
+            $('btn-send').disabled = false;
+
+            if (res.ok) {
+                log('Response: ' + res.text, 'success');
+            } else {
+                log('Error: ' + (res.text || res.detail || 'Unknown error'), 'error');
+            }
+
+            $('command-input').value = '';
+            updateStatus();
+            updateQueue();
+        }
+
+        function sendQuick(text) {
+            $('command-input').value = text;  // show the quick command in the input element
+            sendCommand(text);                // then submit it through the normal send path
+        }
+
+        async function clearQueue() {
+            log('Clearing queue...', 'info');
+            const res = await fetchJson('/api/queue/clear', { method: 'POST' });
+            if (res.ok) {
+                log('Queue cleared', 'success');
+            } else {
+                log('Failed to clear queue', 'error');
+            }
+            updateQueue();
+        }
+
+        // Form submit
+        $('command-form').addEventListener('submit', e => {
+            e.preventDefault();
+            sendCommand($('command-input').value);
+        });
+
+        // Camera update
+        let cameraErrorCount = 0;
+        async function updateCamera() {
+            const info = await fetchJson('/api/image/info');
+            const overlay = $('camera-overlay');
+            const img = $('camera-img');
+            const noImage = $('no-image');
+            const status = $('camera-status');
+
+            if (info.error || !info.has_image) {
+                cameraErrorCount++;
+                if (cameraErrorCount > 3) {
+                    img.style.display = 'none';
+                    noImage.style.display = 'flex';
+                    overlay.textContent = '';
+                    status.textContent = '(no feed)';
+                }
+                return;
+            }
+
+            cameraErrorCount = 0;
+            noImage.style.display = 'none';
+            img.style.display = 'block';
+
+            // Update image with cache-busting
+            const newSrc = '/api/image?t=' + Date.now();
+            if (img.src !== newSrc) {
+                img.src = newSrc;
+            }
+
+            // Update overlay
+            const ageMs = info.age_ms || 0;
+            if (ageMs < 500) {
+                overlay.textContent = 'LIVE';
+                overlay.className = 'camera-overlay ok';
+            } else if (ageMs < 2000) {
+                overlay.textContent = ageMs + 'ms';
+                overlay.className = 'camera-overlay stale';
+            } else {
+                overlay.textContent = 'STALE ' + (ageMs/1000).toFixed(1) + 's';
+                overlay.className = 'camera-overlay error';
+            }
+            status.textContent = '';
+        }
+
+        // Auto-refresh
+        setInterval(updateStatus, 500);
+        setInterval(updateObjects, 1000);
+        setInterval(updateQueue, 500);
+        setInterval(updateCamera, 100);
+
+        // Initial load
+        updateStatus();
+        updateObjects();
+        updateQueue();
+        updateCamera();
+        log('Interface loaded', 'info');
+    </script>
+</body>
+</html>
+"""
--- a/dora_voice_control/pyproject.toml
+++ b/dora_voice_control/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "dora-voice-control"
+version = "0.1.0"
+license = { text = "MIT" }  # license name given inline; `file` must be a path to a license file
+authors = [{ name = "Dora" }]
+description = "Dora node for voice command control via WebSocket"
+
+requires-python = ">=3.8"
+
+dependencies = [
+  "dora-rs >= 0.3.9",
+  "numpy < 2.0.0",
+  "pyarrow >= 12.0.0",
+  "websockets >= 12.0",
+  "fastapi >= 0.109.0",
+  "uvicorn >= 0.27.0",
+  "pydantic >= 2.0.0",
+  "opencv-python >= 4.8.0",
+]
+
+[project.optional-dependencies]
+llm = ["google-genai"]  # optional extra; presumably for the Gemini LLM parser - confirm
+
+[project.scripts]
+dora-voice-control = "dora_voice_control.main:main"