diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..120c1c7 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,6 @@ +{ + "MD013": { + "line_length": 120 + }, + "MD041": false +} diff --git a/AGENTS.md b/AGENTS.md index 797474a..7f07b49 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,6 +38,13 @@ Dora - Prefer small nodes with clear inputs/outputs. - Validate node I/O types and message schemas. +Building and Running +- Build dataflows with: `dora build --uv` +- The `--uv` flag uses uv for fast Python dependency management. +- Run dataflows with: `dora start ` +- Stop with: `dora stop` +- Check logs with: `dora logs` + Testing and Verification - Add or update tests when feasible. - For robot-facing changes, provide a staged validation plan: diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..229a10a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,44 @@ +# CLAUDE.md + +## Purpose + +Project guidelines for Claude Code working in this repository. + +## Project Overview + +Robotics project using Dora to control a ULite6 robot arm with Spanish voice commands for children. + +## Building and Running + +- Build: `dora build --uv` +- Start: `dora run --uv` + +## Key Packages + +- `dora_voice_control`: Voice processing and robot behavior +- `dora_yolo_object_detector`: YOLO-based object detection +- `dora_ulite6`: Robot arm control +- `dora_zed_cpp`: ZED camera (C++) +- `dora_iobridge`: WebSocket bridge for external clients + +## Robot Adapter + +Voice control uses a robot adapter pattern to support different robots: + +- `ROBOT_TYPE=vacuum`: Vacuum gripper (grab→vacuum_on, release→vacuum_off) +- `ROBOT_TYPE=gripper`: Parallel gripper (grab→gripper_close, release→gripper_open) + +To add a new robot, create a new adapter in `robot/adapter.py`. + +## Guidelines + +- Keep the event loop responsive. Run slow operations (LLM) in threads. +- Robot commands are serialized: one command at a time, wait for status before next. 
+- State is shared via `SharedState` (thread-safe). +- Handlers process events, tick callbacks run after each event. + +## Safety + +- This controls a real robot. Test changes carefully. +- Workspace bounds are enforced in behaviors. +- Use `DRY_RUN=true` to test without robot motion. diff --git a/README.md b/README.md index 477dfc9..fa050d9 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,27 @@ -## Getting started +## Getting Started -- Install it with: +Build and run: ```bash -uv venv -p 3.12 --seed -dora build [dataflow.yml] --uv -``` - -- Run it with: - -```bash -dora run [dataflow.yml] --uv +dora build --uv +dora run --uv ``` ## Dataflows | Dataflow | Description | |----------|-------------| -| `dataflow_ulite6.yml` | Ufactory ULite6 robot control with web UI| +| `dataflow_voice_control_ulite6_zed.yml` | Voice-controlled robot with object detection | +| `dataflow_ulite6.yml` | ULite6 robot control with web UI | | `dataflow_zed_cpp.yml` | ZED camera capture with image viewer | ## Nodes | Node | Description | |------|-------------| -| `ulite6` | UFactory Lite6 robot controller with REST API and web UI | -| `zed_camera_cpp` | ZED stereo camera capture (C++) | -| `image_viewer` | Display images from Dora stream | +| `dora_voice_control` | Voice command processing and robot behavior | +| `dora_yolo_object_detector` | YOLO-based object detection with ZED point cloud | +| `dora_ulite6` | UFactory Lite6 robot controller with REST API | +| `dora_zed_cpp` | ZED stereo camera capture (C++) | +| `dora_iobridge` | I/O bridge for voice websocket | +| `dora_image_viewer` | Display images from Dora stream | diff --git a/dataflow_voice_control_ulite6_zed.yml b/dataflow_voice_control_ulite6_zed.yml index a5e89f6..e79867f 100644 --- a/dataflow_voice_control_ulite6_zed.yml +++ b/dataflow_voice_control_ulite6_zed.yml @@ -43,22 +43,19 @@ nodes: path: dora_iobridge/dora_iobridge/main.py env: VIRTUAL_ENV: ./.venv - VOICE_HOST: "0.0.0.0" - VOICE_PORT: "8765" - VOICE_IN_OUTPUT: 
"voice_in" - VOICE_OUT_INPUT: "voice_out" - SCENE_INPUT: "scene_update" + WS_HOST: "0.0.0.0" + WS_PORT: "9001" inputs: - voice_out: voice/voice_out - scene_update: voice/scene_update + text_in: voice/voice_out + data_in: voice/scene_update tick: dora/timer/millis/100 outputs: - - voice_in + - text_out - id: detector build: | uv venv -p 3.12 --seed --allow-existing - uv pip install -e dora_detector - path: dora_detector/dora_detector/main.py + uv pip install -e dora_yolo_object_detector + path: dora_yolo_object_detector/dora_yolo_object_detector/main.py env: VIRTUAL_ENV: ./.venv IMAGE_INPUT: "image_bgr" @@ -90,6 +87,7 @@ nodes: path: dora_voice_control/dora_voice_control/main.py env: VIRTUAL_ENV: ./.venv + ROBOT_TYPE: "vacuum" # "vacuum" or "gripper" OBJECTS_INPUT: "objects" POSE_INPUT: "tcp_pose" STATUS_INPUT: "status" @@ -119,12 +117,12 @@ nodes: IMAGE_WIDTH: "1280" IMAGE_HEIGHT: "720" API_ENABLED: "true" - API_PORT: "8080" + API_PORT: "9002" inputs: objects: detector/objects tcp_pose: ulite6/tcp_pose status: ulite6/status - voice_in: iobridge/voice_in + voice_in: iobridge/text_out image_annotated: detector/image_annotated tick: dora/timer/millis/100 outputs: diff --git a/dora_iobridge/README.md b/dora_iobridge/README.md index 54d2c0e..ce73d44 100644 --- a/dora_iobridge/README.md +++ b/dora_iobridge/README.md @@ -1,178 +1,106 @@ # Dora IOBridge Node -A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates. +Generic WebSocket bridge between web clients and Dora dataflow. 
-## Inputs/Outputs +## Dora Outputs (WebSocket → Dora) -| Input | Type | Description | -|----------------|--------|---------------------------------------| -| `voice_out` | JSON | Response from voice control node | -| `scene_update` | JSON | Scene objects from voice control | +| Output | Type | Description | +|-------------|--------|--------------------------| +| `text_out` | string | Text from clients | +| `audio_out` | bytes | WAV audio from clients | +| `data_out` | JSON | Generic data from clients| -| Output | Type | Description | -|----------------|--------|---------------------------------------| -| `voice_in` | string | Voice commands forwarded to Dora | +## Dora Inputs (Dora → WebSocket) + +| Input | Type | Description | +|-------------|--------|--------------------------| +| `text_in` | string | Text to broadcast | +| `audio_in` | bytes | WAV audio to broadcast | +| `data_in` | JSON | Generic data to broadcast| ## Environment Variables -```bash -VOICE_HOST=0.0.0.0 # Bind address -VOICE_PORT=8765 # Listen port +| Variable | Default | Description | +|-----------|-----------|--------------| +| `WS_HOST` | `0.0.0.0` | Bind address | +| `WS_PORT` | `8765` | Listen port | + +## WebSocket Endpoint + +```text +ws://{WS_HOST}:{WS_PORT} ``` -## Installation +Default: `ws://0.0.0.0:8765` -```bash -cd dora_iobridge -pip install -e . 
+## WebSocket Protocol + +### Client → Server + +| Type | Field | Description | +|---------|-----------|-----------------------| +| `text` | `content` | Text string | +| `audio` | `content` | Base64-encoded WAV | +| `data` | `payload` | Any JSON object | +| `ping` | - | Health check | + +Examples: + +```json +{"type": "text", "content": "hello world"} +{"type": "audio", "content": "UklGRi4AAABXQVZFZm10..."} +{"type": "data", "payload": {"key": "value"}} +{"type": "ping"} +``` + +### Server → Client + +| Type | Field | Description | +|---------|-----------|-----------------------| +| `text` | `content` | Text string | +| `audio` | `content` | Base64-encoded WAV | +| `data` | `payload` | Any JSON object | +| `pong` | - | Response to ping | +| `error` | `message` | Error description | + +Examples: + +```json +{"type": "text", "content": "response text"} +{"type": "audio", "content": "UklGRi4A...", "format": "wav"} +{"type": "data", "payload": {"objects": [...]}} +{"type": "pong"} +{"type": "error", "message": "Invalid JSON"} +``` + +## Dataflow Example + +```yaml +- id: iobridge + build: uv pip install -e dora_iobridge + path: dora_iobridge/dora_iobridge/main.py + env: + WS_HOST: "0.0.0.0" + WS_PORT: "8765" + inputs: + text_in: voice/voice_out + data_in: voice/scene_update + outputs: + - text_out ``` ## Testing -### Test with WebSocket (wscat) - -```bash -# Install wscat -npm install -g wscat - -# Connect to the server -wscat -c ws://localhost:8765 -``` - -### Test with curl (websocat) - ```bash # Install websocat -# Ubuntu: sudo apt install websocat -# macOS: brew install websocat +sudo apt install websocat -# Send a ping -echo '{"type": "ping"}' | websocat ws://localhost:8765 -# Response: {"type": "pong"} +# Connect +websocat ws://localhost:8765 -# Send a voice command -echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765 +# Send text +{"type": "text", "content": "hello"} -# Request scene refresh -echo '{"type": "scene_refresh"}' | websocat 
ws://localhost:8765 -``` - -### Test with Python - -```python -import asyncio -import websockets -import json - -async def test_iobridge(): - uri = "ws://localhost:8765" - async with websockets.connect(uri) as ws: - # Test ping - await ws.send(json.dumps({"type": "ping"})) - response = await ws.recv() - print(f"Ping response: {response}") - - # Send command - await ws.send(json.dumps({ - "type": "command", - "text": "agarra el cubo rojo" - })) - - # Listen for responses - async for message in ws: - data = json.loads(message) - print(f"Received: {data}") - -asyncio.run(test_iobridge()) -``` - -### Test with curl (HTTP upgrade not supported directly) - -Since WebSocket requires an upgrade handshake, use this shell script: - -```bash -#!/bin/bash -# test_iobridge.sh - -# Using websocat for interactive testing -websocat ws://localhost:8765 < Server - -**Command (voice input)** -```json -{"type": "command", "text": "agarra el cubo rojo"} -``` - -**Ping (health check)** -```json +# Ping {"type": "ping"} ``` -Response: `{"type": "pong"}` - -**Scene Refresh** -```json -{"type": "scene_refresh"} -``` - -### Server -> Client (Broadcasts) - -**Command Response** -```json -{ - "type": "response", - "text": "Ok, voy a tomar", - "status": "ok" -} -``` - -**Scene Update** -```json -{ - "type": "scene_updated", - "objects": [ - { - "object_type": "cubo", - "color": "rojo", - "size": "big", - "position_mm": [150.0, 200.0, 280.0], - "source": "detection" - } - ] -} -``` - -## Dora Dataflow Configuration - -```yaml -nodes: - - id: iobridge - build: pip install -e ./dora_iobridge - path: dora_iobridge - inputs: - voice_out: voice_control/voice_out - scene_update: voice_control/scene_update - outputs: - - voice_in - env: - VOICE_HOST: "0.0.0.0" - VOICE_PORT: "8765" -``` - -```bash -dora up -dora start dataflow.yml -``` - -## Dependencies - -- dora-rs >= 0.3.9 -- pyarrow >= 12.0.0 -- websockets >= 12.0 diff --git a/dora_iobridge/dora_iobridge/main.py 
b/dora_iobridge/dora_iobridge/main.py index a074e0c..dbb67f3 100644 --- a/dora_iobridge/dora_iobridge/main.py +++ b/dora_iobridge/dora_iobridge/main.py @@ -1,8 +1,14 @@ -"""Dora node bridging WebSocket IO to Dora topics.""" +"""Dora node bridging WebSocket IO to Dora topics. + +Generic I/O bridge with fixed endpoints: +- Outputs (WS → Dora): text_out, audio_out, data_out +- Inputs (Dora → WS): text_in, audio_in, data_in +""" from __future__ import annotations import asyncio +import base64 import json import os import threading @@ -15,129 +21,164 @@ from websockets.server import serve, WebSocketServerProtocol class IoBridgeServer: + """WebSocket server that bridges clients to Dora dataflow.""" + def __init__(self, host: str, port: int): self.host = host self.port = port self.clients: Set[WebSocketServerProtocol] = set() - self.command_handler = None - self.scene_refresh_handler = None + # Callbacks for sending to Dora + self.on_text: Optional[callable] = None + self.on_audio: Optional[callable] = None + self.on_data: Optional[callable] = None async def handler(self, websocket: WebSocketServerProtocol): + """Handle WebSocket connection.""" self.clients.add(websocket) try: async for message in websocket: try: data = json.loads(message) except json.JSONDecodeError: - await websocket.send( - json.dumps({"type": "error", "text": "Invalid JSON message"}) - ) + await websocket.send(json.dumps({ + "type": "error", + "message": "Invalid JSON" + })) continue - response = await self._route_message(data, websocket) + response = await self._route_message(data) if response: await websocket.send(json.dumps(response)) finally: self.clients.discard(websocket) - async def _route_message( - self, data: Dict[str, Any], websocket: WebSocketServerProtocol - ) -> Optional[Dict[str, Any]]: + async def _route_message(self, data: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Route incoming message to appropriate handler.""" msg_type = data.get("type") - if msg_type == "command": - text = 
data.get("text", "") - if self.command_handler: - await self.command_handler(text) - return None - return {"type": "error", "text": "No command handler registered"} + if msg_type == "ping": return {"type": "pong"} - if msg_type == "scene_refresh": - if self.scene_refresh_handler: - objects = await self.scene_refresh_handler() - return {"type": "scene_updated", "objects": objects} - return {"type": "error", "text": "No scene handler registered"} - return {"type": "error", "text": f"Unknown message type: {msg_type}"} + + if msg_type == "text": + content = data.get("content", "") + if self.on_text and content: + self.on_text(content) + return None + + if msg_type == "audio": + content = data.get("content", "") # base64 WAV + if self.on_audio and content: + self.on_audio(content) + return None + + if msg_type == "data": + payload = data.get("payload", {}) + if self.on_data: + self.on_data(payload) + return None + + return {"type": "error", "message": f"Unknown type: {msg_type}"} async def broadcast(self, message: Dict[str, Any]): + """Broadcast message to all connected clients.""" if not self.clients: return payload = json.dumps(message) await asyncio.gather( - *[client.send(payload) for client in self.clients], return_exceptions=True + *[client.send(payload) for client in self.clients], + return_exceptions=True ) - async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol): - await websocket.send(json.dumps(message)) - async def start(self): + """Start the WebSocket server.""" async with serve(self.handler, self.host, self.port): await asyncio.Future() def main() -> None: - host = os.getenv("VOICE_HOST", "0.0.0.0") - port = int(os.getenv("VOICE_PORT", "8765")) - input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in") - response_input = os.getenv("VOICE_OUT_INPUT", "voice_out") - scene_input = os.getenv("SCENE_INPUT", "scene_update") + """Main entry point.""" + host = os.getenv("WS_HOST", "0.0.0.0") + port = int(os.getenv("WS_PORT", "9000")) node 
= Node() server = IoBridgeServer(host, port) loop = asyncio.new_event_loop() - def push_command(text: str) -> None: + # Callbacks: WebSocket → Dora + def send_text(content: str) -> None: node.send_output( - input_topic, - pa.array([text]), - metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()}, + "text_out", + pa.array([content]), + {"encoding": "utf-8", "timestamp_ns": time.time_ns()}, ) - async def handle_scene_refresh(): - return [] + def send_audio(content_b64: str) -> None: + audio_bytes = base64.b64decode(content_b64) + node.send_output( + "audio_out", + pa.array([audio_bytes]), + {"encoding": "wav", "timestamp_ns": time.time_ns()}, + ) - def command_handler(text: str): - push_command(text) - return None + def send_data(payload: Dict[str, Any]) -> None: + node.send_output( + "data_out", + pa.array([json.dumps(payload)]), + {"encoding": "json", "timestamp_ns": time.time_ns()}, + ) - server.command_handler = command_handler - server.scene_refresh_handler = handle_scene_refresh + server.on_text = send_text + server.on_audio = send_audio + server.on_data = send_data + # Start WebSocket server in background thread def run_server(): asyncio.set_event_loop(loop) loop.run_until_complete(server.start()) threading.Thread(target=run_server, daemon=True).start() + timestamp = time.strftime("%H:%M:%S") + print(f"[iobridge {timestamp}] WebSocket server at ws://{host}:{port}", flush=True) + + # Dora event loop: Dora → WebSocket for event in node: if event["type"] != "INPUT": continue - if event["id"] == response_input: - raw = event["value"][0].as_py() if len(event["value"]) else "" - if not raw: - continue - try: - payload = json.loads(raw) - message = { - "type": "response", - "text": payload.get("text", ""), - "status": payload.get("status", "ok"), - } - except Exception: - message = {"type": "response", "text": raw, "status": "ok"} - asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) + event_id = event["id"] + raw = event["value"][0].as_py() if 
len(event["value"]) else None + if raw is None: continue - if event["id"] == scene_input: - raw = event["value"][0].as_py() if len(event["value"]) else "" - if not raw: - continue + if event_id == "text_in": + # Text from Dora to broadcast + if isinstance(raw, str): + try: + # Try to parse as JSON for structured response + payload = json.loads(raw) + message = {"type": "text", "content": payload.get("text", raw)} + except json.JSONDecodeError: + message = {"type": "text", "content": raw} + else: + message = {"type": "text", "content": str(raw)} + asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) + + elif event_id == "audio_in": + # Audio from Dora to broadcast (binary → base64) + if isinstance(raw, bytes): + content_b64 = base64.b64encode(raw).decode("utf-8") + else: + content_b64 = raw + message = {"type": "audio", "content": content_b64, "format": "wav"} + asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) + + elif event_id == "data_in": + # Generic JSON data from Dora to broadcast try: - payload = json.loads(raw) - objects = payload.get("objects", []) - message = {"type": "scene_updated", "objects": objects} - except Exception: - message = {"type": "scene_updated", "objects": []} + payload = json.loads(raw) if isinstance(raw, str) else raw + except json.JSONDecodeError: + payload = {"raw": raw} + message = {"type": "data", "payload": payload} asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) diff --git a/dora_ulite6/dora_ulite6/main.py b/dora_ulite6/dora_ulite6/main.py index 8c8c99a..79551dc 100644 --- a/dora_ulite6/dora_ulite6/main.py +++ b/dora_ulite6/dora_ulite6/main.py @@ -840,6 +840,7 @@ def main() -> None: target=run_uvicorn, args=(app, api_host, api_port), daemon=True ) api_thread.start() + _log(f"Web interface at http://{api_host}:{api_port}") # Dora event loop - only handles tick for state publishing for event in node: diff --git a/dora_voice_control/PLAN.md b/dora_voice_control/PLAN.md new file mode 100644 
index 0000000..13d4677 --- /dev/null +++ b/dora_voice_control/PLAN.md @@ -0,0 +1,236 @@ +# Voice Control Simplification Plan + +## Current State + +**30 files** across 7 directories (~3300 lines): + +``` +dora_voice_control/ +├── behaviors/ # 2 files +│ ├── __init__.py +│ └── base.py +├── node/ # 5 files +│ ├── __init__.py +│ ├── adapter.py +│ ├── context.py +│ ├── dispatcher.py +│ └── logging.py +├── robot/ # 6 files +│ ├── __init__.py +│ ├── adapter.py # VacuumGripperAdapter, ParallelGripperAdapter +│ ├── command_queue.py +│ ├── image_handler.py +│ ├── pose_handler.py +│ └── status_handler.py +├── scene/ # 4 files +│ ├── __init__.py +│ ├── notifier.py +│ ├── objects_handler.py +│ └── state.py +├── utils/ # 3 files +│ ├── __init__.py +│ ├── config.py +│ └── state.py +├── voice/ # 4 files +│ ├── __init__.py +│ ├── handler.py +│ ├── intent.py +│ └── parser.py +├── web/ # 4 files +│ ├── __init__.py +│ ├── api.py +│ ├── models.py +│ └── templates.py +├── __init__.py +└── main.py +``` + +## Problems + +1. **Too many small files** - Many files are <50 lines +2. **Robot-specific code scattered** - Adapter in robot/, behavior in behaviors/ +3. **Hard to add new robots** - Need to touch multiple directories +4. 
**Deep import chains** - `from ..node import ...` + +--- + +## Proposed Structure + +**12 files** across 3 directories: + +``` +dora_voice_control/ +├── __init__.py +├── main.py +├── node.py # Dora infrastructure (adapter, context, dispatcher, logger) +├── handlers.py # All event handlers (voice, robot, scene) +├── scene.py # SceneState, SceneObject +├── state.py # SharedState, RobotStep, config loading +├── web.py # API server (api + models + templates) +└── behaviors/ + ├── __init__.py # get_behavior() factory + ├── base.py # ActionContext, ActionInfo, RobotBehavior base + └── littlehand/ + ├── __init__.py + └── robot.py # LittlehandBehavior + VacuumGripperAdapter +``` + +--- + +## File Consolidation Map + +### `node.py` (merge 5 files) + +| From | Content | +|------|---------| +| `node/logging.py` | VoiceControlLogger | +| `node/adapter.py` | NodePort, DoraNodeAdapter, MockNodeAdapter | +| `node/context.py` | OutputPort, NodeContext | +| `node/dispatcher.py` | EventHandler, EventDispatcher | + +### `handlers.py` (merge 7 files) + +| From | Content | +|------|---------| +| `voice/handler.py` | VoiceInputHandler | +| `voice/intent.py` | Intent, IntentQueue | +| `robot/pose_handler.py` | PoseHandler | +| `robot/status_handler.py` | StatusHandler | +| `robot/image_handler.py` | ImageHandler | +| `robot/command_queue.py` | CommandQueueService | +| `scene/objects_handler.py` | ObjectsHandler | +| `scene/notifier.py` | SceneNotifier | + +### `scene.py` (merge 2 files) + +| From | Content | +|------|---------| +| `scene/state.py` | SceneState, SceneObject | + +Already consolidated - just move to root. 
+ +### `state.py` (merge 2 files) + +| From | Content | +|------|---------| +| `utils/state.py` | SharedState, RobotStep, VoiceState, DebugState | +| `utils/config.py` | VoiceConfig, ApiConfig, LLMConfig, loaders | + +### `web.py` (merge 3 files) + +| From | Content | +|------|---------| +| `web/api.py` | create_api(), start_api_server() | +| `web/models.py` | CommandRequest, CommandResponse | +| `web/templates.py` | HTML_TEMPLATE | + +### `behaviors/littlehand/robot.py` (new) + +| From | Content | +|------|---------| +| `robot/adapter.py` | VacuumGripperAdapter (robot-specific) | +| `behaviors/base.py` | Action implementations (robot-specific) | + +Robot adapter moves INTO the behavior folder. + +--- + +## Per-Robot Folder Structure + +Each robot gets its own folder with everything it needs: + +``` +behaviors/ +├── __init__.py # get_behavior(robot_type) factory +├── base.py # Shared base classes +│ - ActionContext +│ - ActionInfo +│ - RobotBehavior (abstract) +│ +└── littlehand/ # Vacuum gripper robot + ├── __init__.py + └── robot.py + - VacuumGripperAdapter + - LittlehandBehavior (actions, LLM signature) + - ACTIONS dict + - CommandSignature (DSPy) +``` + +### Adding a New Robot (e.g., "gripper_arm") + +1. Create `behaviors/gripper_arm/` +2. Create `robot.py` with: + - `ParallelGripperAdapter` + - `GripperArmBehavior` + - Custom actions +3. Register in `behaviors/__init__.py` + +No need to touch other files. + +--- + +## Implementation Steps + +### Phase 1: Consolidate node/ → node.py + +1. Create `node.py` merging adapter, context, dispatcher, logging +2. Update imports in all files +3. Delete `node/` directory + +### Phase 2: Consolidate handlers + +1. Create `handlers.py` merging all handlers +2. Update imports +3. Delete `voice/`, `robot/` directories (except adapter) + +### Phase 3: Consolidate utils/ → state.py + +1. Create `state.py` merging config and state +2. Update imports +3. Delete `utils/` directory + +### Phase 4: Move scene/ → scene.py + +1. 
Move `scene/state.py` to `scene.py` +2. Update imports +3. Delete `scene/` directory + +### Phase 5: Consolidate web/ → web.py + +1. Create `web.py` merging api, models, templates +2. Update imports +3. Delete `web/` directory + +### Phase 6: Restructure behaviors per robot + +1. Create `behaviors/littlehand/robot.py` +2. Move VacuumGripperAdapter from robot/adapter.py +3. Move action implementations from behaviors/base.py +4. Update `behaviors/__init__.py` factory +5. Delete old robot/adapter.py + +### Phase 7: Update main.py + +1. Update all imports to new structure +2. Simplify initialization + +--- + +## Final Structure Comparison + +| Before | After | +|--------|-------| +| 30 files | 12 files | +| 7 directories | 3 directories | +| Robot code scattered | Robot code in one folder | +| Deep imports | Flat imports | + +--- + +## Benefits + +1. **Easier navigation** - Fewer files to search +2. **Self-contained robots** - One folder per robot type +3. **Simpler imports** - `from .node import ...` instead of `from ..node import ...` +4. **Easier testing** - Mock one module instead of many +5. **Clear ownership** - Robot-specific code in robot folder diff --git a/dora_voice_control/README.md b/dora_voice_control/README.md index 15af54f..a4dfdcd 100644 --- a/dora_voice_control/README.md +++ b/dora_voice_control/README.md @@ -1,131 +1,129 @@ # Dora Voice Control Node -A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface. +Dora node that processes Spanish voice commands and translates them into robot actions. Supports multiple robot types via robot subfolders. 
## Features -- Spanish voice command parsing (rule-based or Gemini LLM) +- Spanish voice command parsing (rule-based or LLM) +- Robot adapter pattern for different gripper types - Real-time web debug interface - Command queue management - Workspace bounds validation -- Object detection integration ## File Structure -``` +```text dora_voice_control/ -├── __init__.py -├── main.py # Main Dora node entry point -├── api.py # FastAPI web server -├── config.py # Configuration management -├── models.py # Pydantic request/response models -├── parser.py # Voice command parsing logic -├── state.py # Shared state management -└── templates.py # HTML template for web interface +├── main.py # Thin orchestrator +│ +├── core/ # Shared logic +│ ├── behavior.py # RobotBehavior with actions +│ ├── config.py # Configuration classes +│ ├── node.py # Dora adapter + dispatcher + context +│ ├── robot.py # RobotAdapter base +│ ├── robot_io.py # Pose/status/image handlers + command queue +│ ├── scene.py # Scene state + notifier + objects handler +│ ├── state.py # Thread-safe shared state +│ └── voice.py # Voice input + parsing + intents +│ +├── robots/ # Robot-specific implementations +│ └── littlehand/ # Vacuum gripper robot +│ ├── adapter.py # Vacuum adapter +│ ├── actions.py # Action vocabulary +│ └── behavior.py # Behavior binding +│ +└── web/ # Web interface + ├── api.py # FastAPI server + ├── models.py # Pydantic models + └── templates.py # HTML template ``` -## Web Debug Interface +## Robot Adapters -Access the debug interface at `http://localhost:8080` (default). 
+Set `ROBOT_TYPE` to select the robot package: -Features: -- Real-time status monitoring (pose, objects, queue) -- Send manual voice commands -- Quick command buttons -- View parse results -- Command history -- Clear queue +| Type | Grab Command | Release Command | +|------|--------------|-----------------| +| `littlehand` (alias: `vacuum`) | `vacuum_on` | `vacuum_off` | + +To add a new robot, create a new subfolder under `robots/` with its adapter and behavior, then register it in `robots/__init__.py`. ## Inputs/Outputs -| Input | Type | Description | -|---------------|--------|------------------------------------------| -| `voice_in` | string | Text transcription of voice command | -| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] | -| `objects` | JSON | Detected objects from vision system | -| `status` | JSON | Command execution status from robot | +| Input | Type | Description | +|-------|------|-------------| +| `voice_in` | string | Voice command text | +| `tcp_pose` | array | Robot pose [x, y, z, roll, pitch, yaw] | +| `objects` | JSON | Detected objects | +| `status` | JSON | Command execution status | +| `image_annotated` | array | Camera image | -| Output | Type | Description | -|---------------|--------|------------------------------------------| -| `robot_cmd` | JSON | Robot command with action and payload | -| `voice_out` | JSON | Response confirmation to user | -| `scene_update`| JSON | Updated scene with all visible objects | +| Output | Type | Description | +|--------|------|-------------| +| `robot_cmd` | JSON | Robot command | +| `voice_out` | JSON | Response to user | +| `scene_update` | JSON | Scene state | ## Supported Commands (Spanish) -| Command | Action | Example | -|---------------|----------------|--------------------------------| -| `subir` | Move up | "sube" | -| `bajar` | Move down | "baja" | -| `tomar` | Grab object | "agarra el cubo rojo" | -| `soltar` | Release object | "suelta en la caja azul" | -| `ir` | Go to 
object | "ve al cilindro" | -| `reiniciar` | Reset | "reinicia" | +| Command | Action | Example | +|---------|--------|---------| +| `subir` | Move up | "sube" | +| `bajar` | Move down | "baja" | +| `tomar` | Grab object | "agarra el cubo rojo" | +| `soltar` | Release object | "suelta en la caja azul" | +| `ir` | Go to object | "ve al cilindro" | +| `reiniciar` | Reset | "reinicia" | ## Environment Variables ```bash -# Web API Server -API_ENABLED=true # Enable/disable web interface -API_HOST=0.0.0.0 # Bind address -API_PORT=8080 # Listen port +# Robot Configuration +ROBOT_TYPE=littlehand # "littlehand" (alias: "vacuum") + +# Web API +API_ENABLED=true +API_PORT=9001 # TCP Parameters -TCP_OFFSET_MM=63.0 # Z-offset to object surface -APPROACH_OFFSET_MM=50.0 # Safe approach distance above object -STEP_MM=20.0 # Distance for up/down increments +TCP_OFFSET_MM=63.0 +APPROACH_OFFSET_MM=50.0 +STEP_MM=20.0 -# LLM Configuration (optional) -LLM_PROVIDER=rules # "rules" or "gemini" -GOOGLE_API_KEY=your_key # Required if using gemini -GEMINI_MODEL=gemini-2.0-flash +# LLM (optional) +LLM_PROVIDER=rules # "rules", "gemini", "ollama" +GOOGLE_API_KEY=your_key -# Workspace Safety (optional) -WORKSPACE_MIN_X=-300 -WORKSPACE_MAX_X=300 -WORKSPACE_MIN_Y=-300 -WORKSPACE_MAX_Y=300 +# Initial Position +INIT_ON_START=true +INIT_X=300.0 +INIT_Y=0.0 +INIT_Z=350.0 + +# Safety +DRY_RUN=false WORKSPACE_MIN_Z=0 WORKSPACE_MAX_Z=500 - -# Misc -DRY_RUN=false # Skip sending robot commands ``` -## Installation +## Web Debug Interface + +Access at `http://localhost:8080`: + +- Camera view with detections +- Real-time status (pose, objects, queue) +- Send manual commands +- View parse results + +## API Endpoints ```bash -cd dora_voice_control -pip install -e . 
- -# With LLM support -pip install -e ".[llm]" -``` - -## Testing - -### Web Interface - -```bash -# Start the node (standalone for testing) -python -m dora_voice_control.main - -# Open in browser -open http://localhost:8080 -``` - -### API Endpoints - -```bash -# Get status +# Status curl http://localhost:8080/api/status -# Get objects +# Objects curl http://localhost:8080/api/objects -# Get queue -curl http://localhost:8080/api/queue - # Send command curl -X POST http://localhost:8080/api/command \ -H "Content-Type: application/json" \ @@ -135,77 +133,30 @@ curl -X POST http://localhost:8080/api/command \ curl -X POST http://localhost:8080/api/queue/clear ``` -### Python Test - -```python -from dora_voice_control.parser import rule_parse, normalize - -# Test command parsing -text = "agarra el cubo rojo grande" -result = rule_parse(normalize(text)) -print(result) -# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'} -``` - -## Dora Dataflow Configuration +## Dataflow Example ```yaml -nodes: - - id: voice_control - build: pip install -e ./dora_voice_control - path: dora_voice_control - inputs: - voice_in: iobridge/voice_in - tcp_pose: robot/tcp_pose - objects: detector/objects - status: robot/status - outputs: - - robot_cmd - - voice_out - - scene_update - env: - API_ENABLED: "true" - API_PORT: "8080" - DRY_RUN: "false" +- id: voice + build: uv pip install -e dora_voice_control + path: dora_voice_control/dora_voice_control/main.py + env: + ROBOT_TYPE: "vacuum" + API_ENABLED: "true" + inputs: + voice_in: iobridge/text_out + tcp_pose: robot/tcp_pose + objects: detector/objects + status: robot/status + outputs: + - robot_cmd + - voice_out + - scene_update ``` -## Message Examples +## Adding a New Robot -### Input: voice_in -``` -"sube" -"agarra el cubo rojo" -"suelta en la caja azul" -``` - -### Output: robot_cmd -```json -{ - "id": "550e8400-e29b-41d4-a716-446655440000", - "action": "move_to_pose", - "payload": { - "x": 
150.0, - "y": 200.0, - "z": 280.0, - "roll": 180.0, - "pitch": 0.0, - "yaw": 0.0 - } -} -``` - -### Output: voice_out -```json -{"text": "Ok, voy a subir", "status": "ok"} -{"text": "No entendi el comando", "status": "error"} -``` - -## Dependencies - -- dora-rs >= 0.3.9 -- numpy < 2.0.0 -- pyarrow >= 12.0.0 -- fastapi >= 0.109.0 -- uvicorn >= 0.27.0 -- pydantic >= 2.0.0 -- google-genai (optional, for Gemini mode) +1) Create `dora_voice_control/dora_voice_control/robots//` with: +- `adapter.py` implementing a `RobotAdapter` +- `actions.py` defining action aliases (can reuse defaults) +- `behavior.py` binding the behavior class +2) Register it in `dora_voice_control/dora_voice_control/robots/__init__.py` diff --git a/dora_voice_control/dora_voice_control/core/__init__.py b/dora_voice_control/dora_voice_control/core/__init__.py new file mode 100644 index 0000000..8b0277d --- /dev/null +++ b/dora_voice_control/dora_voice_control/core/__init__.py @@ -0,0 +1,65 @@ +"""Core modules for voice control node.""" + +from .config import ( + VoiceConfig, + ApiConfig, + LLMConfig, + JPEG_QUALITY, + load_voice_config, + load_api_config, + load_llm_config, +) +from .state import SharedState, RobotStep, VoiceState, DebugState +from .node import ( + VoiceControlLogger, + NodePort, + DoraNodeAdapter, + MockNodeAdapter, + EventDispatcher, + OutputPort, + NodeContext, +) +from .scene import SceneState, SceneObject, SceneNotifier, ObjectsHandler +from .voice import Intent, IntentQueue, VoiceInputHandler, normalize, rule_parse +from .behavior import ActionContext, ActionInfo, RobotBehavior, DEFAULT_ACTIONS +from .robot import RobotAdapter +from .robot_io import PoseHandler, StatusHandler, ImageHandler, CommandQueueService + +__all__ = [ + "VoiceConfig", + "ApiConfig", + "LLMConfig", + "JPEG_QUALITY", + "load_voice_config", + "load_api_config", + "load_llm_config", + "SharedState", + "RobotStep", + "VoiceState", + "DebugState", + "VoiceControlLogger", + "NodePort", + "DoraNodeAdapter", + 
"""Base classes for robot behaviors."""

from __future__ import annotations

import os
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, TYPE_CHECKING

from .config import VoiceConfig, LLMConfig
from .robot import RobotAdapter
from .state import RobotStep, SharedState
from .voice import rule_parse

if TYPE_CHECKING:
    from .scene import SceneState, SceneObject

# Try to import dspy (optional dependency). When it is missing, LLM-backed
# parsing is disabled and the behavior falls back to rule-based parsing.
try:
    import dspy
    DSPY_AVAILABLE = True
except ImportError:
    dspy = None  # type: ignore
    DSPY_AVAILABLE = False


def _log(msg: str) -> None:
    """Print a timestamped log message."""
    timestamp = time.strftime("%H:%M:%S")
    print(f"[behaviors {timestamp}] {msg}", flush=True)


@dataclass
class ActionContext:
    """Context for action execution.

    Bundles everything an action handler needs: the current robot pose,
    the resolved target object, the whole scene, configuration, the home
    pose, and the shared (thread-safe) state used for queueing steps.
    """

    pose: Optional[List[float]]  # [x, y, z, roll, pitch, yaw]
    target: Optional["SceneObject"]  # Target object (None when not required)
    scene: "SceneState"  # Scene state with all objects
    config: VoiceConfig
    home_pose: Dict[str, float]
    shared_state: SharedState


@dataclass
class ActionInfo:
    """Metadata for an action (canonical name, spoken aliases, preconditions)."""

    name: str
    aliases: List[str] = field(default_factory=list)
    requires_object: bool = False  # Action needs a resolved target object
    requires_pose: bool = False  # Action needs a known TCP pose
    description: str = ""
False + description: str = "" + + +# Default actions for pick-and-place robots +DEFAULT_ACTIONS: Dict[str, ActionInfo] = { + "subir": ActionInfo( + name="subir", + aliases=["sube", "arriba"], + requires_pose=True, + description="Subir el robot", + ), + "bajar": ActionInfo( + name="bajar", + aliases=["baja", "abajo"], + requires_pose=True, + description="Bajar el robot", + ), + "ir": ActionInfo( + name="ir", + aliases=["ve", "mover", "muevete", "acercar"], + requires_object=True, + description="Ir hacia un objeto", + ), + "tomar": ActionInfo( + name="tomar", + aliases=["toma", "agarra", "agarrar", "coger", "chupar", "succionar"], + requires_object=True, + description="Tomar un objeto", + ), + "soltar": ActionInfo( + name="soltar", + aliases=["deja", "dejar"], + requires_object=True, + description="Soltar el objeto", + ), + "reiniciar": ActionInfo( + name="reiniciar", + aliases=["reinicia", "reset"], + requires_pose=False, + requires_object=False, + description="Reiniciar a posicion inicial", + ), +} + + +class RobotBehavior(ABC): + """ + Robot behavior with configurable robot adapter. + + Uses a RobotAdapter to translate generic commands (grab, release, move) + to robot-specific commands (vacuum_on, gripper_close, etc.). 
    # Class-level action definitions (canonical name -> ActionInfo).
    ACTIONS: ClassVar[Dict[str, ActionInfo]] = DEFAULT_ACTIONS

    # Supported language for voice commands (ISO 639-1 code).
    LANGUAGE: ClassVar[str] = "es"

    # DSPy signature class (override in subclasses to enable LLM parsing).
    CommandSignature: ClassVar[type] = None  # type: ignore

    def __init__(
        self,
        config: VoiceConfig,
        robot_adapter: RobotAdapter,
        llm_config: Optional[LLMConfig] = None,
    ) -> None:
        """Create a behavior bound to *robot_adapter*.

        The DSPy predictor is only initialized when *llm_config* is given
        and its provider is not "rules"; otherwise rule parsing is used.
        """
        self.config = config
        self.robot_adapter = robot_adapter
        self._alias_map: Dict[str, str] = {}
        self._predictor: Optional[Any] = None
        self._build_alias_map()
        if llm_config and llm_config.provider != "rules":
            self._init_dspy(llm_config)

    def _init_dspy(self, llm_config: LLMConfig) -> None:
        """Initialize DSPy predictor for this behavior.

        Any failure (missing package, no signature, LM setup error) is
        logged and leaves the predictor unset, so parsing falls back to
        rules instead of crashing the node.
        """
        if not DSPY_AVAILABLE:
            _log("DSPy not available, falling back to rules")
            return
        if self.CommandSignature is None:
            _log("No CommandSignature defined, falling back to rules")
            return
        try:
            lm = self._create_lm(llm_config)
            if lm:
                # NOTE(review): dspy.configure sets a process-global LM, so
                # multiple behaviors would share the last-configured model.
                dspy.configure(lm=lm)
                self._predictor = dspy.Predict(self.CommandSignature)
                _log(f"DSPy initialized with {llm_config.provider}/{llm_config.model}")
        except Exception as e:
            _log(f"Failed to initialize DSPy: {e}")

    def _create_lm(self, config: LLMConfig) -> Optional[Any]:
        """Create a DSPy language model for the configured provider.

        Returns None for unknown providers (caller keeps rule parsing).
        """
        if not DSPY_AVAILABLE:
            return None
        if config.provider == "gemini":
            # GEMINI_API_KEY takes precedence over GOOGLE_API_KEY.
            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            return dspy.LM(
                f"gemini/{config.model}",
                api_key=api_key,
                temperature=config.temperature,
            )
        if config.provider == "ollama":
            return dspy.LM(
                f"ollama_chat/{config.model}",
                api_base=config.api_base,
                api_key="",
            )
        if config.provider == "openai":
            # Relies on OPENAI_API_KEY being set in the environment.
            return dspy.LM(
                f"openai/{config.model}",
                temperature=config.temperature,
            )
        return None
self._predictor(comando=transcript) + return { + "resultado": "ok" if result.accion != "error" else "error", + "accion": result.accion, + "objeto": result.objeto, + "color": result.color, + "tamano": result.tamano, + } + except Exception as e: + _log(f"DSPy parsing failed: {e}, falling back to rules") + return self.rule_parse(transcript) + + def rule_parse(self, transcript: str) -> Dict[str, str]: + """Rule-based parsing fallback using behavior's vocabulary.""" + return rule_parse(transcript, self.get_all_keywords()) + + def _build_alias_map(self) -> None: + """Build reverse mapping from aliases to action names.""" + self._alias_map = {} + for action_name, info in self.ACTIONS.items(): + self._alias_map[action_name] = action_name + for alias in info.aliases: + self._alias_map[alias] = action_name + + def resolve_action(self, name: str) -> Optional[str]: + """Resolve action name or alias to canonical name.""" + return self._alias_map.get(name) + + def get_action_info(self, name: str) -> Optional[ActionInfo]: + """Get action info by name or alias.""" + canonical = self.resolve_action(name) + return self.ACTIONS.get(canonical) if canonical else None + + def get_all_keywords(self) -> Dict[str, List[str]]: + """Get all action names and aliases for parser.""" + return { + name: [name] + info.aliases + for name, info in self.ACTIONS.items() + } + + def execute(self, action_name: str, context: ActionContext) -> bool: + """Execute action by name using explicit handler mapping.""" + canonical = self.resolve_action(action_name) + if not canonical: + return False + + info = self.ACTIONS.get(canonical) + if not info: + return False + + # Check preconditions + if info.requires_pose and not context.pose: + return False + if info.requires_object and not context.target: + return False + + handler = self.action_handlers().get(canonical) + if not handler: + return False + + return handler(context) + + @abstractmethod + def action_handlers(self) -> Dict[str, Callable[[ActionContext], 
bool]]: + """Return mapping of action names to handler functions.""" + pass + + def _within_bounds( + self, + point_mm: List[float], + workspace_min: Tuple[Optional[float], Optional[float], Optional[float]], + workspace_max: Tuple[Optional[float], Optional[float], Optional[float]], + ) -> bool: + """Check if position is within workspace bounds.""" + x, y, z = point_mm[:3] + min_x, min_y, min_z = workspace_min + max_x, max_y, max_z = workspace_max + if min_x is not None and x < min_x: + return False + if max_x is not None and x > max_x: + return False + if min_y is not None and y < min_y: + return False + if max_y is not None and y > max_y: + return False + if min_z is not None and z < min_z: + return False + if max_z is not None and z > max_z: + return False + return True + + def _queue_steps(self, ctx: ActionContext, steps: List[RobotStep]) -> None: + """Queue multiple robot steps.""" + for step in steps: + ctx.shared_state.append_queue(step) + + def _make_pose_payload(self, x: float, y: float, z: float) -> Dict[str, float]: + """Create pose payload with default orientation.""" + return { + "x": x, + "y": y, + "z": z, + "roll": self.config.default_roll, + "pitch": self.config.default_pitch, + "yaw": self.config.default_yaw, + } + + def _queue_move(self, ctx: ActionContext, x: float, y: float, z: float) -> bool: + """Queue a move command if within bounds.""" + if not self._within_bounds([x, y, z], self.config.workspace_min, self.config.workspace_max): + return False + payload = self._make_pose_payload(x, y, z) + self._queue_steps(ctx, self.robot_adapter.move(payload)) + return True + + def _queue_approach_sequence(self, ctx: ActionContext, pos: List[float]) -> None: + """Queue approach pose then target pose.""" + approach_z = pos[2] + self.config.tcp_offset_mm + self.config.approach_offset_mm + target_z = pos[2] + self.config.tcp_offset_mm + + # Approach + self._queue_move(ctx, pos[0], pos[1], approach_z) + # Target + self._queue_move(ctx, pos[0], pos[1], target_z) 
    # =========================================================
    # Action implementations using robot adapter
    # =========================================================

    def action_subir(self, ctx: ActionContext) -> bool:
        """Move up by step_mm from the current TCP pose."""
        target_z = ctx.pose[2] + self.config.step_mm
        return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z)

    def action_bajar(self, ctx: ActionContext) -> bool:
        """Move down by step_mm from the current TCP pose."""
        target_z = ctx.pose[2] - self.config.step_mm
        return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z)

    def action_ir(self, ctx: ActionContext) -> bool:
        """Move to object position (approach pose, then target pose)."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        # Always reports success; out-of-bounds moves are silently dropped
        # by _queue_move inside the approach sequence.
        return True

    def action_tomar(self, ctx: ActionContext) -> bool:
        """Pick the object: approach, descend, then grab via robot adapter."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        self._queue_steps(ctx, self.robot_adapter.grab())
        return True

    def action_soltar(self, ctx: ActionContext) -> bool:
        """Release the held object at the target location via robot adapter."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        self._queue_steps(ctx, self.robot_adapter.release())
        return True

    def action_reiniciar(self, ctx: ActionContext) -> bool:
        """Reset: release the tool, move home, clear detected objects."""
        self._queue_steps(ctx, self.robot_adapter.reset_tool())
        self._queue_steps(ctx, self.robot_adapter.move(ctx.home_pose))
        ctx.scene.clear_detected()
        return True
# Image encoding constants
JPEG_QUALITY = 80


@dataclass
class LLMConfig:
    """Configuration for LLM provider."""

    provider: str  # "rules", "gemini", "ollama", "openai"
    model: str  # Model name/path
    api_base: Optional[str] = None  # For Ollama
    temperature: float = 0.3


def load_llm_config() -> LLMConfig:
    """Load LLM configuration from environment variables.

    Defaults: provider "rules" (no LLM), model "gemma2", Ollama base URL
    http://localhost:11434, temperature 0.3.
    """
    provider = os.getenv("LLM_PROVIDER", "rules").lower()
    model = os.getenv("LLM_MODEL", "gemma2")
    api_base = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
    temperature = float(os.getenv("LLM_TEMPERATURE", "0.3"))
    return LLMConfig(
        provider=provider,
        model=model,
        api_base=api_base,
        temperature=temperature,
    )
class VoiceControlLogger:
    """Timestamped stdout logger for the voice control node."""

    def __init__(self, prefix: str = "voice_control") -> None:
        self._prefix = prefix

    def log(self, msg: str) -> None:
        """Print a timestamped log message (flushed immediately)."""
        stamp = time.strftime("%H:%M:%S")
        print(f"[{self._prefix} {stamp}] {msg}", flush=True)

    def warn(self, msg: str) -> None:
        """Log *msg* prefixed with 'Warning:'."""
        self.log(f"Warning: {msg}")

    def error(self, msg: str) -> None:
        """Log *msg* prefixed with 'Error:'."""
        self.log(f"Error: {msg}")


class NodePort(ABC):
    """Abstract interface for Dora node operations."""

    @abstractmethod
    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Send output to a Dora topic."""
        ...

    @abstractmethod
    def events(self) -> Iterator[Dict[str, Any]]:
        """Iterate over incoming events."""
        ...
class DoraNodeAdapter(NodePort):
    """Real Dora node wrapper.

    Thin adapter over the dora `Node` object so handlers and services can
    depend on the NodePort interface instead of the concrete node.
    """

    def __init__(self, node: Any) -> None:
        self._node = node

    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Send output to a Dora topic."""
        self._node.send_output(output_name, data, metadata)

    def events(self) -> Iterator[Dict[str, Any]]:
        """Iterate over incoming events."""
        for event in self._node:
            yield event

    def send_command(self, output_name: str, action: str, payload: Dict[str, Any]) -> str:
        """Send a robot command and return the generated command ID.

        The message is JSON-encoded into a one-element Arrow string array,
        tagged with `encoding: json` so the receiver can decode it.
        """
        command_id = str(uuid.uuid4())
        message = {"id": command_id, "action": action, "payload": payload}
        self.send_output(
            output_name,
            pa.array([json.dumps(message)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        return command_id

    def send_json(self, output_name: str, data: Dict[str, Any]) -> None:
        """Send JSON data to a topic (same wire format as send_command)."""
        self.send_output(
            output_name,
            pa.array([json.dumps(data)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )


class MockNodeAdapter(NodePort):
    """Mock node adapter for testing: records outputs, replays queued events.

    NOTE(review): send_command/send_json duplicate DoraNodeAdapter's
    implementations; they could live as default methods on NodePort to
    avoid the two copies drifting apart.
    """

    def __init__(self) -> None:
        # Recorded (output_name, data, metadata) tuples.
        self._outputs: list[tuple[str, pa.Array, Dict[str, Any]]] = []
        # Events to replay through events().
        self._events: list[Dict[str, Any]] = []

    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Record output for testing."""
        self._outputs.append((output_name, data, metadata))

    def events(self) -> Iterator[Dict[str, Any]]:
        """Return queued test events."""
        for event in self._events:
            yield event

    def queue_event(self, event: Dict[str, Any]) -> None:
        """Queue an event for testing."""
        self._events.append(event)

    def get_outputs(self) -> list[tuple[str, pa.Array, Dict[str, Any]]]:
        """Get recorded outputs."""
        return self._outputs

    def clear(self) -> None:
        """Clear recorded outputs and events."""
        self._outputs.clear()
        self._events.clear()

    def send_command(self, output_name: str, action: str, payload: Dict[str, Any]) -> str:
        """Send a robot command and return the command ID (recorded, not sent)."""
        command_id = str(uuid.uuid4())
        message = {"id": command_id, "action": action, "payload": payload}
        self.send_output(
            output_name,
            pa.array([json.dumps(message)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        return command_id

    def send_json(self, output_name: str, data: Dict[str, Any]) -> None:
        """Send JSON data to a topic (recorded, not sent)."""
        self.send_output(
            output_name,
            pa.array([json.dumps(data)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )


class EventHandler(Protocol):
    """Protocol for event handlers (structural typing; no inheritance needed)."""

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle an incoming event."""
        ...
class EventDispatcher:
    """Routes Dora INPUT events to registered handlers.

    The very first incoming event (of any type) triggers the registered
    first-event callbacks; tick callbacks run after every INPUT event.
    Handler and callback exceptions are logged, never propagated.
    """

    def __init__(self, adapter: NodePort, logger: Optional[VoiceControlLogger] = None) -> None:
        self._adapter = adapter
        self._logger = logger or VoiceControlLogger()
        self._handlers: Dict[str, EventHandler] = {}
        self._tick_callbacks: List[Callable[[], None]] = []
        self._first_event_callbacks: List[Callable[[], None]] = []
        self._first_event_fired = False

    def register(self, event_id: str, handler: EventHandler) -> None:
        """Register a handler for a specific event ID."""
        self._handlers[event_id] = handler

    def on_tick(self, callback: Callable[[], None]) -> None:
        """Register a callback to run after each INPUT event."""
        self._tick_callbacks.append(callback)

    def on_first_event(self, callback: Callable[[], None]) -> None:
        """Register a callback to run once, on the very first event."""
        self._first_event_callbacks.append(callback)

    def _run_callbacks(self, callbacks: List[Callable[[], None]], label: str) -> None:
        """Invoke every callback, logging (not raising) any exception."""
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                self._logger.error(f"{label} failed: {e}")

    def _dispatch(self, event: Dict[str, Any]) -> None:
        """Route one INPUT event to its registered handler, if any."""
        event_id = event.get("id")
        handler = self._handlers.get(event_id)
        if handler is None:
            return
        try:
            handler.handle(event)
        except Exception as e:
            self._logger.error(f"Handler for '{event_id}' failed: {e}")

    def run(self) -> None:
        """Main event loop: consume adapter events until the stream ends."""
        self._logger.log("Dora node created, waiting for events...")

        for event in self._adapter.events():
            # Fire first event callbacks (on any event type, once).
            if not self._first_event_fired:
                self._first_event_fired = True
                self._run_callbacks(self._first_event_callbacks, "First event callback")

            # Skip non-input events (ticks only run for INPUT events).
            if event.get("type") != "INPUT":
                continue

            self._dispatch(event)
            self._run_callbacks(self._tick_callbacks, "Tick callback")
class OutputPort:
    """Typed output port bound to a single Dora topic name."""

    def __init__(self, adapter: DoraNodeAdapter, name: str) -> None:
        self._adapter = adapter
        self._name = name

    @property
    def name(self) -> str:
        """The output topic name."""
        return self._name

    def send_json(self, data: Dict[str, Any]) -> None:
        """Serialize *data* as JSON and publish it on this topic."""
        self._adapter.send_json(self._name, data)

    def send_command(self, action: str, payload: Dict[str, Any]) -> str:
        """Publish a robot command on this topic; returns the command ID."""
        return self._adapter.send_command(self._name, action, payload)


@dataclass
class NodeContext:
    """Central context for Dora node I/O.

    Bundles the adapter, logger, and typed output ports into a single
    context object that can be passed to services and handlers.
    """

    adapter: DoraNodeAdapter
    logger: VoiceControlLogger

    # Typed output ports
    robot_out: OutputPort
    scene_out: OutputPort
    voice_out: OutputPort

    @classmethod
    def create(
        cls,
        adapter: DoraNodeAdapter,
        logger: VoiceControlLogger,
        robot_output: str = "robot_cmd",
        scene_output: str = "scene_update",
        voice_output: str = "voice_out",
    ) -> "NodeContext":
        """Build a NodeContext with one OutputPort per named topic.

        Args:
            adapter: The Dora node adapter.
            logger: The logger instance.
            robot_output: Topic name for robot commands.
            scene_output: Topic name for scene updates.
            voice_output: Topic name for voice responses.
        """
        ports = {
            "robot_out": OutputPort(adapter, robot_output),
            "scene_out": OutputPort(adapter, scene_output),
            "voice_out": OutputPort(adapter, voice_output),
        }
        return cls(adapter=adapter, logger=logger, **ports)


# --- core/robot.py: abstract robot adapter (verbatim) ---------------------

class RobotAdapter(ABC):
    """Abstract adapter for translating generic commands to robot-specific commands.

    Generic commands:
    - grab: Activate tool to grab object
    - release: Deactivate tool to release object
    - move: Move to position
    - reset_tool: Reset tool to safe state
    """

    @abstractmethod
    def grab(self) -> List[RobotStep]:
        """Translate grab command to robot steps."""
        pass

    @abstractmethod
    def release(self) -> List[RobotStep]:
        """Translate release command to robot steps."""
        pass

    @abstractmethod
    def move(self, payload: Dict[str, Any]) -> List[RobotStep]:
        """Translate move command to robot steps."""
        pass

    @abstractmethod
    def reset_tool(self) -> List[RobotStep]:
        """Translate reset_tool command to robot steps."""
        pass
class CommandQueueService:
    """Dispatches robot commands from the shared queue, one at a time.

    A new command is only sent once the previous one has been acknowledged
    (the pending command is cleared by StatusHandler), which serializes
    robot motion: one command in flight at any moment.
    """

    def __init__(
        self,
        state: "SharedState",
        port: "OutputPort",
        config: "VoiceConfig",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._port = port
        self._config = config
        self._logger = logger

    def dispatch_next(self) -> bool:
        """Dispatch the next command in the queue if possible.

        Returns True if a command was dispatched (or consumed in dry-run
        mode), False otherwise.
        """
        # Don't dispatch if there's a pending command awaiting robot status.
        if self._state.get_pending_command() is not None:
            return False

        # Don't dispatch if queue is empty
        if self._state.queue_size() == 0:
            return False

        step = self._state.pop_queue()
        if step is None:
            # Queue drained between the size check and the pop (racing thread).
            return False

        # Handle dry run mode: log the step but send nothing to the robot.
        if self._config.dry_run:
            self._logger.log(f"[DRY RUN] Would send: {step.action} {step.payload}")
            # Pending is already None here (checked above); defensive no-op.
            self._state.set_pending_command(None)
            return True

        # Send command and remember it as pending until status confirms it.
        cmd_id = self._port.send_command(step.action, step.payload)
        self._state.set_pending_command({"id": cmd_id, "action": step.action})
        self._logger.log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={self._state.queue_size()}")

        # Update debug state (exposed via the web UI).
        self._state.update_robot_command(
            {"id": cmd_id, "action": step.action, "payload": step.payload},
            time.monotonic(),
        )

        return True


def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
    """Decode the robot status payload (one-element JSON string array).

    Returns None for empty arrays, empty strings, or malformed JSON.
    """
    if len(value) == 0:
        return None
    raw = value[0].as_py()
    if not raw:
        return None
    try:
        return json.loads(raw)
    except Exception:
        # Malformed JSON is treated as "no status".
        return None


class StatusHandler:
    """Handles status events from the robot.

    Clears the pending command once the robot acknowledges the matching
    command_id, allowing CommandQueueService to dispatch the next step.
    """

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._logger = logger

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle a status event."""
        payload = _parse_status_payload(event["value"])
        pending = self._state.get_pending_command()

        # Only clear pending when the status refers to the in-flight command.
        if payload and pending:
            if payload.get("command_id") == pending.get("id"):
                self._logger.log(
                    f"Command completed: {pending.get('action')} (status={payload.get('status', 'ok')})"
                )
                self._state.set_pending_command(None)


class PoseHandler:
    """Handles tcp_pose events from the Dora node."""

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._logger = logger

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle a tcp_pose event.

        Stores the first six values as [x, y, z, roll, pitch, yaw];
        shorter payloads are silently ignored.
        """
        tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
        if tcp_pose.size >= 6:
            self._state.update_pose(tcp_pose[:6].tolist(), time.monotonic())
class ImageHandler:
    """Handles camera image events: decodes raw frames and caches JPEGs."""

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
        image_width: Optional[int] = None,
        image_height: Optional[int] = None,
    ) -> None:
        self._state = state
        self._logger = logger
        # Frame dimensions must match the producer's output; defaults come
        # from IMAGE_WIDTH/IMAGE_HEIGHT env vars (1280x720).
        self._image_width = image_width or int(os.getenv("IMAGE_WIDTH", "1280"))
        self._image_height = image_height or int(os.getenv("IMAGE_HEIGHT", "720"))

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle an image event by caching a JPEG copy in shared state.

        Decode/reshape failures are logged as warnings, never raised.
        """
        try:
            # Get raw image data
            img_data = event["value"].to_numpy()
            # Reshape to image (assuming BGR format) -- presumes the producer
            # emits packed HxWx3 8-bit data matching the configured dims;
            # TODO confirm against the camera node's output format.
            img = img_data.reshape((self._image_height, self._image_width, 3)).astype(np.uint8)
            # Encode to JPEG
            _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])
            self._state.update_image(jpeg_data.tobytes(), time.monotonic())
        except Exception as e:
            self._logger.warn(f"failed to process camera image: {e}")
@dataclass
class SceneObject:
    """An object in the scene with spatial properties."""

    id: str  # Unique identifier
    object_type: str  # cube, cylinder, box, etc.
    color: str  # red, blue, etc.
    size: str = "normal"  # big, small, normal

    # Spatial properties (millimeters; presumably robot base frame -- verify)
    position_mm: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
    bbox_mm: Optional[List[float]] = None  # [x1, y1, x2, y2] bounding box
    height_mm: float = 30.0  # Object height
    area_mm2: Optional[float] = None  # Footprint area

    # Relationships
    on_top_of: Optional[str] = None  # ID of object this is stacked on

    # Metadata
    source: str = "detected"  # "detected", "manual", or "config"
    confidence: float = 1.0  # Detection confidence
    last_seen_ns: int = 0  # Timestamp of last detection (nanoseconds)

    @property
    def center(self) -> List[float]:
        """Get center point (alias for position_mm)."""
        return self.position_mm

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization (shallow: lists are shared)."""
        return {
            "id": self.id,
            "object_type": self.object_type,
            "color": self.color,
            "size": self.size,
            "position_mm": self.position_mm,
            "bbox_mm": self.bbox_mm,
            "height_mm": self.height_mm,
            "area_mm2": self.area_mm2,
            "on_top_of": self.on_top_of,
            "source": self.source,
            "confidence": self.confidence,
            "last_seen_ns": self.last_seen_ns,
        }
+ """ + with self._lock: + if object_id not in self._objects: + return False + del self._objects[object_id] + # Clear relationships pointing to deleted object + for obj in self._objects.values(): + if obj.on_top_of == object_id: + obj.on_top_of = None + return True + + def update(self, object_id: str, **properties: Any) -> bool: + """Update object properties. + + Returns True if updated, False if not found. + """ + with self._lock: + if object_id not in self._objects: + return False + obj = self._objects[object_id] + for key, value in properties.items(): + if hasattr(obj, key): + setattr(obj, key, value) + return True + + def get(self, object_id: str) -> Optional[SceneObject]: + """Get object by ID.""" + with self._lock: + obj = self._objects.get(object_id) + return obj # Return reference (caller should not mutate) + + def get_all(self) -> List[SceneObject]: + """Get all objects.""" + with self._lock: + return list(self._objects.values()) + + def clear(self) -> None: + """Remove all objects.""" + with self._lock: + self._objects.clear() + + def clear_detected(self) -> None: + """Remove only detected objects (keep manual and config).""" + with self._lock: + to_delete = [ + obj_id + for obj_id, obj in self._objects.items() + if obj.source == "detected" + ] + for obj_id in to_delete: + del self._objects[obj_id] + # Clear broken relationships + for obj in self._objects.values(): + if obj.on_top_of and obj.on_top_of not in self._objects: + obj.on_top_of = None + + # === Query === + + def query( + self, + object_type: Optional[str] = None, + color: Optional[str] = None, + size: Optional[str] = None, + source: Optional[str] = None, + ) -> List[SceneObject]: + """Find objects matching criteria.""" + with self._lock: + results: List[SceneObject] = [] + for obj in self._objects.values(): + if object_type is not None and obj.object_type != object_type: + continue + if color is not None and obj.color != color: + continue + if size is not None and obj.size != size: + continue + 
if source is not None and obj.source != source: + continue + results.append(obj) + return results + + def count(self) -> int: + """Get total object count.""" + with self._lock: + return len(self._objects) + + # === Spatial Relationships === + + def set_on_top_of(self, object_id: str, below_id: Optional[str]) -> bool: + """Set stacking relationship. + + Returns True if set, False if object not found. + """ + with self._lock: + if object_id not in self._objects: + return False + if below_id is not None and below_id not in self._objects: + return False + self._objects[object_id].on_top_of = below_id + return True + + def infer_relationships(self) -> None: + """Infer stacking from positions. + + Call after replace_detected() to automatically detect stacking. + A is on B if: A.z ≈ B.z + B.height AND A overlaps B horizontally. + """ + with self._lock: + self._infer_relationships_unlocked() + + def _infer_relationships_unlocked(self) -> None: + """Infer relationships without holding lock (internal use).""" + # Reset all relationships first + for obj in self._objects.values(): + obj.on_top_of = None + + for obj in self._objects.values(): + best_match: Optional[SceneObject] = None + best_distance = float("inf") + + for candidate in self._objects.values(): + if candidate.id == obj.id: + continue + + # Check if obj is on top of candidate + expected_z = candidate.position_mm[2] + candidate.height_mm + actual_z = obj.position_mm[2] + z_diff = abs(actual_z - expected_z) + + if z_diff < self.Z_TOLERANCE_MM: + if self._overlaps_horizontally_unlocked(obj, candidate): + # Use closest match in z + if z_diff < best_distance: + best_match = candidate + best_distance = z_diff + + if best_match is not None: + obj.on_top_of = best_match.id + + def get_object_below(self, object_id: str) -> Optional[SceneObject]: + """Get object this one is stacked on.""" + with self._lock: + obj = self._objects.get(object_id) + if obj is None or obj.on_top_of is None: + return None + return 
self._objects.get(obj.on_top_of) + + def get_objects_above(self, object_id: str) -> List[SceneObject]: + """Get objects stacked directly on this one.""" + with self._lock: + results: List[SceneObject] = [] + for obj in self._objects.values(): + if obj.on_top_of == object_id: + results.append(obj) + return results + + def get_stack(self, object_id: str) -> List[SceneObject]: + """Get all objects in stack (bottom to top). + + Returns empty list if object not found. + """ + with self._lock: + obj = self._objects.get(object_id) + if obj is None: + return [] + + # Find bottom of stack + bottom = obj + while bottom.on_top_of is not None: + below = self._objects.get(bottom.on_top_of) + if below is None: + break + bottom = below + + # Build stack from bottom up + stack = [bottom] + visited = {bottom.id} + + while True: + # Find object on top of current top + current_top = stack[-1] + found_next = False + for candidate in self._objects.values(): + if candidate.on_top_of == current_top.id and candidate.id not in visited: + stack.append(candidate) + visited.add(candidate.id) + found_next = True + break + if not found_next: + break + + return stack + + def effective_height(self, object_id: str) -> float: + """Get height including objects below (if stacked). + + Returns 0 if object not found. 
+ """ + with self._lock: + obj = self._objects.get(object_id) + if obj is None: + return 0.0 + return self._effective_height_unlocked(obj) + + def _effective_height_unlocked(self, obj: SceneObject) -> float: + """Calculate effective height without holding lock.""" + if obj.on_top_of is None: + return obj.height_mm + below = self._objects.get(obj.on_top_of) + if below is None: + return obj.height_mm + return obj.height_mm + self._effective_height_unlocked(below) + + def _overlaps_horizontally(self, a: SceneObject, b: SceneObject) -> bool: + """Check if two objects overlap in x,y plane (thread-safe).""" + with self._lock: + return self._overlaps_horizontally_unlocked(a, b) + + def _overlaps_horizontally_unlocked(self, a: SceneObject, b: SceneObject) -> bool: + """Check horizontal overlap without lock. + + If bounding boxes are available, use them. + Otherwise, check if centers are within a distance threshold. + """ + # If both have bounding boxes, use proper overlap check + if a.bbox_mm is not None and b.bbox_mm is not None: + # bbox format: [x1, y1, x2, y2] + ax1, ay1, ax2, ay2 = a.bbox_mm + bx1, by1, bx2, by2 = b.bbox_mm + # Check for no overlap (if any of these is true, no overlap) + if ax2 < bx1 or bx2 < ax1: + return False + if ay2 < by1 or by2 < ay1: + return False + return True + + # Fall back to center distance check + # Default threshold: ~30mm (typical small object radius) + threshold = 40.0 + dx = abs(a.position_mm[0] - b.position_mm[0]) + dy = abs(a.position_mm[1] - b.position_mm[1]) + return dx < threshold and dy < threshold + + # === Detector Integration === + + def replace_detected(self, objects: List[Dict[str, Any]]) -> None: + """Replace detected objects (keeps manual and config objects). + + Automatically generates IDs and infers stacking relationships. 
+ """ + with self._lock: + # Remove existing detected objects + to_delete = [ + obj_id + for obj_id, obj in self._objects.items() + if obj.source == "detected" + ] + for obj_id in to_delete: + del self._objects[obj_id] + + # Add new detected objects + now_ns = time.time_ns() + counters: Dict[str, int] = {} + + for obj_data in objects: + obj_type = obj_data.get("object_type", "unknown") + color = obj_data.get("color", "unknown") + + # Generate unique ID + key = f"{obj_type}_{color}" + counters[key] = counters.get(key, 0) + 1 + obj_id = f"{key}_{counters[key]}" + + position = obj_data.get("position_mm", [0.0, 0.0, 0.0]) + if len(position) < 3: + position = position + [0.0] * (3 - len(position)) + + scene_obj = SceneObject( + id=obj_id, + object_type=obj_type, + color=color, + size=obj_data.get("size", "normal"), + position_mm=position, + bbox_mm=obj_data.get("bbox_mm"), + height_mm=obj_data.get("height_mm", 30.0), + area_mm2=obj_data.get("area_mm2"), + source="detected", + confidence=obj_data.get("confidence", 1.0), + last_seen_ns=now_ns, + ) + self._objects[obj_id] = scene_obj + + # Clear broken relationships + for obj in self._objects.values(): + if obj.on_top_of and obj.on_top_of not in self._objects: + obj.on_top_of = None + + # Infer stacking relationships + self._infer_relationships_unlocked() + + # === Serialization === + + def to_list(self) -> List[Dict[str, Any]]: + """Convert all objects to list of dictionaries.""" + with self._lock: + return [obj.to_dict() for obj in self._objects.values()] + + def get_objects_as_dicts(self) -> List[Dict[str, Any]]: + """Get all objects as dictionaries (for compatibility).""" + return self.to_list() + + # === Object Matching (voice criteria) === + + def find_by_criteria( + self, + obj_type: str, + color: str, + size: str, + class_map: Optional[Dict[str, str]] = None, + ) -> Optional[SceneObject]: + """Find object matching voice command criteria. 
+ + Args: + obj_type: Object type from voice command (e.g., "cubo", "cilindro") + color: Color from voice command (e.g., "rojo", "azul") + size: Size from voice command (e.g., "grande", "pequeno") + class_map: Optional mapping to translate voice terms to detector terms + + Returns: + First matching SceneObject, or None if not found. + """ + if obj_type == "no especificado": + return None + + class_map = class_map or {} + + def translate(token: str) -> str: + return class_map.get(token, token) + + target_type = translate(obj_type) + target_color = translate(color) if color != "no especificado" else None + target_size = translate(size) if size != "no especificado" else None + + results = self.query( + object_type=target_type, + color=target_color, + size=target_size, + ) + + return results[0] if results else None + + # === Config Loading === + + def load_from_config(self, config_path: Optional[str] = None) -> int: + """Load static objects from TOML config file. + + Config format: + [object_parameters] + normal_height = 220.0 + + [[static_objects]] + type = "box" + color = "blue" + position = [100.0, -200.0] + size = "big" + height = 50.0 + + Args: + config_path: Path to config file. Defaults to CONFIG_FILE env var or "config.toml". + + Returns: + Number of objects loaded. 
+ """ + if config_path is None: + config_path = os.getenv("CONFIG_FILE", "config.toml") + + cfg = self._load_toml(config_path) + if not cfg: + return 0 + + obj_cfg = cfg.get("object_parameters", {}) + base_z = float(obj_cfg.get("normal_height", 220.0)) + + count = 0 + counters: Dict[str, int] = {} + + for obj in cfg.get("static_objects", []): + obj_type = obj.get("type", "box") + color = obj.get("color", "unknown") + pos = obj.get("position", [0, 0]) + + # Generate unique ID + key = f"{obj_type}_{color}" + counters[key] = counters.get(key, 0) + 1 + obj_id = f"config_{key}_{counters[key]}" + + scene_obj = SceneObject( + id=obj_id, + object_type=obj_type, + color=color, + size=obj.get("size", "big"), + position_mm=[float(pos[0]), float(pos[1]), base_z], + height_mm=obj.get("height", 30.0), + source="config", + ) + if self.add(scene_obj): + count += 1 + + return count + + @staticmethod + def _load_toml(path: str) -> Dict[str, Any]: + """Load TOML file, returning empty dict on error.""" + if not path or not os.path.exists(path): + return {} + try: + with open(path, "rb") as f: + return tomllib.load(f) + except Exception: + return {} + + +class SceneNotifier: + """Emits scene updates to clients.""" + + def __init__( + self, + scene: SceneState, + port: "OutputPort", + logger: "VoiceControlLogger", + ) -> None: + self._scene = scene + self._port = port + self._logger = logger + + def send_scene_update(self) -> None: + """Send current scene state to clients.""" + self._port.send_json({"objects": self._scene.to_list()}) + + def send_scene_reset(self) -> None: + """Send scene reset notification to clients.""" + self._scene.clear_detected() + self._port.send_json({"objects": self._scene.to_list(), "reset": True}) + self._logger.log("Sent scene reset notification") + + def send_initial_scene(self) -> None: + """Send initial scene state on startup.""" + self._port.send_json({"objects": self._scene.to_list(), "reset": True}) + self._logger.log("Sent initial scene reset 
notification") + + +class ObjectsHandler: + """Handles objects detection events from the Dora node.""" + + def __init__( + self, + scene: SceneState, + notifier: SceneNotifier, + logger: "VoiceControlLogger", + ) -> None: + self._scene = scene + self._notifier = notifier + self._logger = logger + + def handle(self, event: Dict[str, Any]) -> None: + """Handle an objects detection event.""" + raw = event["value"][0].as_py() if len(event["value"]) else "" + if not raw: + return + + try: + payload = json.loads(raw) + objects = payload.get("objects", []) + except Exception as e: + self._logger.warn(f"failed to parse objects payload: {e}") + objects = [] + + self._scene.replace_detected(objects) + + # Emit scene update + self._notifier.send_scene_update() diff --git a/dora_voice_control/dora_voice_control/state.py b/dora_voice_control/dora_voice_control/core/state.py similarity index 54% rename from dora_voice_control/dora_voice_control/state.py rename to dora_voice_control/dora_voice_control/core/state.py index a68d1d6..00ce664 100644 --- a/dora_voice_control/dora_voice_control/state.py +++ b/dora_voice_control/dora_voice_control/core/state.py @@ -22,9 +22,6 @@ class VoiceState: latest_pose: Optional[List[float]] = None latest_pose_at: Optional[float] = None - latest_objects: List[Dict[str, Any]] = field(default_factory=list) - latest_objects_at: Optional[float] = None - static_objects: List[Dict[str, Any]] = field(default_factory=list) pending_command: Optional[Dict[str, Any]] = None queue: Deque[RobotStep] = field(default_factory=deque) @@ -45,12 +42,16 @@ class DebugState: class SharedState: - """Thread-safe shared state container.""" + """Thread-safe shared state container. + + Note: Object state (detected/static objects) has moved to SceneState. + This class now only handles robot queue, pose, and debug state. 
+ """ def __init__(self) -> None: self._lock = threading.Lock() - self.voice_state = VoiceState() - self.debug_state = DebugState() + self._voice_state = VoiceState() + self._debug_state = DebugState() self._command_callback: Optional[Any] = None def set_command_callback(self, callback: Any) -> None: @@ -66,14 +67,12 @@ class SharedState: def get_status(self) -> Dict[str, Any]: """Get current status for web interface.""" with self._lock: - vs = self.voice_state - ds = self.debug_state + vs = self._voice_state + ds = self._debug_state return { "has_pose": vs.latest_pose is not None, "pose": vs.latest_pose, "pose_age_ms": _age_ms(vs.latest_pose_at), - "object_count": len(vs.latest_objects), - "static_object_count": len(vs.static_objects), "queue_size": len(vs.queue), "has_pending_command": vs.pending_command is not None, "pending_command": vs.pending_command, @@ -82,71 +81,116 @@ class SharedState: "last_parse_result": ds.last_parse_result, } - def get_objects(self) -> Dict[str, Any]: - """Get detected and static objects.""" - with self._lock: - return { - "detected": list(self.voice_state.latest_objects), - "static": list(self.voice_state.static_objects), - } - def get_queue(self) -> List[Dict[str, Any]]: """Get the command queue.""" with self._lock: - return [{"action": s.action, "payload": s.payload} for s in self.voice_state.queue] + return [{"action": s.action, "payload": s.payload} for s in self._voice_state.queue] + + def clear_queue(self) -> None: + """Clear the command queue.""" + with self._lock: + self._voice_state.queue.clear() + + def pop_queue(self) -> Optional[RobotStep]: + """Pop the next step from the queue. 
Returns None if empty.""" + with self._lock: + if self._voice_state.queue: + return self._voice_state.queue.popleft() + return None + + def append_queue(self, step: RobotStep) -> None: + """Append a step to the command queue.""" + with self._lock: + self._voice_state.queue.append(step) + + def queue_size(self) -> int: + """Get the size of the command queue.""" + with self._lock: + return len(self._voice_state.queue) + + def get_pose(self) -> Optional[List[float]]: + """Get the latest TCP pose.""" + with self._lock: + return self._voice_state.latest_pose + + def update_pose(self, pose: List[float], timestamp: float) -> None: + """Update the latest TCP pose.""" + with self._lock: + self._voice_state.latest_pose = pose + self._voice_state.latest_pose_at = timestamp + + def get_pending_command(self) -> Optional[Dict[str, Any]]: + """Get the pending command.""" + with self._lock: + return self._voice_state.pending_command + + def set_pending_command(self, command: Optional[Dict[str, Any]]) -> None: + """Set the pending command.""" + with self._lock: + self._voice_state.pending_command = command def get_history(self) -> List[Dict[str, Any]]: """Get command history.""" with self._lock: - return list(self.debug_state.command_history[-50:]) + return list(self._debug_state.command_history[-50:]) def get_errors(self) -> List[Dict[str, Any]]: """Get error log.""" with self._lock: - return list(self.debug_state.error_log[-50:]) + return list(self._debug_state.error_log[-50:]) def add_to_history(self, entry: Dict[str, Any]) -> None: """Add entry to command history.""" with self._lock: - self.debug_state.command_history.append(entry) - if len(self.debug_state.command_history) > 100: - self.debug_state.command_history = self.debug_state.command_history[-100:] + self._debug_state.command_history.append(entry) + if len(self._debug_state.command_history) > 100: + self._debug_state.command_history = self._debug_state.command_history[-100:] def add_error(self, error: Dict[str, Any]) -> 
None: """Add entry to error log.""" with self._lock: - self.debug_state.error_log.append(error) - if len(self.debug_state.error_log) > 100: - self.debug_state.error_log = self.debug_state.error_log[-100:] + self._debug_state.error_log.append(error) + if len(self._debug_state.error_log) > 100: + self._debug_state.error_log = self._debug_state.error_log[-100:] def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None: """Update last voice input info.""" with self._lock: - self.debug_state.last_voice_input = text - self.debug_state.last_voice_input_at = timestamp - self.debug_state.last_parse_result = parse_result + self._debug_state.last_voice_input = text + self._debug_state.last_voice_input_at = timestamp + self._debug_state.last_parse_result = parse_result def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None: """Update last robot command info.""" with self._lock: - self.debug_state.last_robot_command = command - self.debug_state.last_robot_command_at = timestamp + self._debug_state.last_robot_command = command + self._debug_state.last_robot_command_at = timestamp def update_image(self, image_bytes: bytes, timestamp: float) -> None: """Update latest camera image.""" with self._lock: - self.debug_state.latest_image = image_bytes - self.debug_state.latest_image_at = timestamp + self._debug_state.latest_image = image_bytes + self._debug_state.latest_image_at = timestamp def get_image(self) -> Optional[bytes]: """Get latest camera image.""" with self._lock: - return self.debug_state.latest_image + return self._debug_state.latest_image def get_image_age_ms(self) -> Optional[int]: """Get age of latest image in milliseconds.""" with self._lock: - return _age_ms(self.debug_state.latest_image_at) + return _age_ms(self._debug_state.latest_image_at) + + def get_last_voice_input(self) -> Optional[str]: + """Get the last voice input text.""" + with self._lock: + return self._debug_state.last_voice_input + + def 
get_last_parse_result(self) -> Optional[Dict[str, Any]]: + """Get the last parse result.""" + with self._lock: + return self._debug_state.last_parse_result def _age_ms(timestamp: Optional[float]) -> Optional[int]: diff --git a/dora_voice_control/dora_voice_control/core/voice.py b/dora_voice_control/dora_voice_control/core/voice.py new file mode 100644 index 0000000..9d6a8d1 --- /dev/null +++ b/dora_voice_control/dora_voice_control/core/voice.py @@ -0,0 +1,231 @@ +"""Voice processing and intent handling.""" + +from __future__ import annotations + +import threading +import time +import unicodedata +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import Any, Callable, Deque, Dict, List, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from .behavior import RobotBehavior + from .state import SharedState + from .node import OutputPort, VoiceControlLogger + + +# Default action vocabulary (used when no behavior vocabulary is provided) +DEFAULT_ACTION_VOCABULARY: Dict[str, List[str]] = { + "reiniciar": ["reiniciar", "reinicia", "reset"], + "subir": ["subir", "sube", "arriba"], + "bajar": ["bajar", "baja", "abajo"], + "soltar": ["soltar", "deja", "dejar"], + "tomar": ["tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar"], + "ir": ["ir", "ve", "mover", "muevete", "acercar"], +} + + +def normalize(text: str) -> str: + """Normalize text: lowercase, remove accents.""" + text = text.lower().strip() + text = unicodedata.normalize("NFKD", text) + text = "".join([c for c in text if not unicodedata.combining(c)]) + return text + + +def rule_parse(transcript: str, vocabulary: Optional[Dict[str, List[str]]] = None) -> Dict[str, str]: + """Parse voice command using rule-based approach. + + Args: + transcript: The voice input text + vocabulary: Optional action vocabulary mapping action names to keywords. + If not provided, uses the default vocabulary. 
+ """ + text = normalize(transcript) + vocab = vocabulary if vocabulary is not None else DEFAULT_ACTION_VOCABULARY + + # Find matching action using vocabulary + action = "error" + for action_name, keywords in vocab.items(): + if any(normalize(w) in text for w in keywords): + action = action_name + break + + color = "no especificado" + if "rojo" in text: + color = "rojo" + elif "azul" in text: + color = "azul" + elif "amarillo" in text: + color = "amarillo" + elif "blanco" in text: + color = "blanco" + + obj = "no especificado" + if "estrella" in text: + obj = "estrella" + elif "cilindro" in text: + obj = "cilindro" + elif "cubo" in text: + obj = "cubo" + elif "caja" in text: + obj = "caja" + + size = "no especificado" + if "grande" in text: + size = "grande" + elif "pequeno" in text or "pequeño" in text or "chico" in text: + size = "pequeno" + + if action == "error": + return {"resultado": "error"} + return { + "resultado": "ok", + "accion": action, + "objeto": obj, + "color": color, + "tamano": size, + } + + +@dataclass +class Intent: + """Parsed voice command intent.""" + + action: str + obj: str = "no especificado" + color: str = "no especificado" + size: str = "no especificado" + + def to_dict(self) -> Dict[str, str]: + """Convert to dictionary.""" + return { + "action": self.action, + "obj": self.obj, + "color": self.color, + "size": self.size, + } + + @classmethod + def from_parse_result(cls, result: Dict[str, str]) -> Optional["Intent"]: + """Create intent from parse result. 
Returns None if parsing failed.""" + if result.get("resultado") != "ok": + return None + return cls( + action=result.get("accion", "error"), + obj=result.get("objeto", "no especificado"), + color=result.get("color", "no especificado"), + size=result.get("tamano", "no especificado"), + ) + + +class IntentQueue: + """Thread-safe queue for voice intents.""" + + def __init__(self) -> None: + self._queue: Deque[Intent] = deque() + self._lock = threading.Lock() + + def append(self, intent: Intent) -> None: + """Add intent to queue.""" + with self._lock: + self._queue.append(intent) + + def popleft(self) -> Optional[Intent]: + """Remove and return the oldest intent. Returns None if empty.""" + with self._lock: + if self._queue: + return self._queue.popleft() + return None + + def __len__(self) -> int: + """Return queue length.""" + with self._lock: + return len(self._queue) + + def __bool__(self) -> bool: + """Return True if queue has items.""" + with self._lock: + return bool(self._queue) + + def clear(self) -> None: + """Clear all intents.""" + with self._lock: + self._queue.clear() + + +class VoiceInputHandler: + """Handles voice_in events from the Dora node.""" + + def __init__( + self, + state: "SharedState", + behavior: "RobotBehavior", + intent_queue: IntentQueue, + port: "OutputPort", + logger: "VoiceControlLogger", + ) -> None: + self._state = state + self._behavior = behavior + self._intent_queue = intent_queue + self._port = port + self._logger = logger + self._executor = ThreadPoolExecutor(max_workers=1) + + def handle(self, event: Dict[str, Any]) -> None: + """Handle a voice_in event. 
LLM parsing runs in background thread.""" + raw = event["value"][0].as_py() if len(event["value"]) else "" + if not raw: + return + + # Run LLM parsing in background thread to not block event loop + self._executor.submit(self._process_and_respond, raw) + + def _process_and_respond(self, transcript: str) -> None: + """Process command in background thread and send response.""" + response = self._process_command(transcript) + self._port.send_json(response) + + def _process_command(self, transcript: str) -> Dict[str, str]: + """Process voice command and return response.""" + self._logger.log(f'Voice input received: "{transcript}"') + llm_result = self._behavior.parse_command(transcript) + self._logger.log(f"Parse result: {llm_result}") + + # Update debug state + self._state.update_voice_input(transcript, llm_result, time.monotonic()) + + if llm_result.get("resultado") != "ok": + self._logger.log("Command not understood") + return {"text": "No entendi el comando", "status": "error"} + + intent = Intent.from_parse_result(llm_result) + if intent: + self._logger.log( + "Intent: action=%s, object=%s, color=%s, size=%s" + % (intent.action, intent.obj, intent.color, intent.size) + ) + self._intent_queue.append(intent) + + # Add to history + self._state.add_to_history( + { + "timestamp": time.time(), + "input": transcript, + "action": intent.action, + "object": intent.obj, + "color": intent.color, + "size": intent.size, + } + ) + + return {"text": f"Ok, voy a {llm_result.get('accion')}", "status": "ok"} + + def create_command_callback(self) -> Callable[[str], Dict[str, str]]: + """Create a callback for the web interface to send commands.""" + def callback(transcript: str) -> Dict[str, str]: + return self._process_command(transcript) + + return callback diff --git a/dora_voice_control/dora_voice_control/main.py b/dora_voice_control/dora_voice_control/main.py index 277ba86..a15ae5e 100644 --- a/dora_voice_control/dora_voice_control/main.py +++ 
b/dora_voice_control/dora_voice_control/main.py @@ -1,500 +1,176 @@ -"""Dora node for voice control with safe robot commands.""" +"""Dora node for voice control - thin orchestrator.""" from __future__ import annotations -import json import os -import sys -import time -import uuid -from collections import deque -from typing import Any, Deque, Dict, List, Optional, Tuple -import cv2 -import numpy as np -import pyarrow as pa from dora import Node -try: - import tomllib -except ModuleNotFoundError: - import tomli as tomllib - -# Handle both package and direct script execution -# __package__ is None when run as script, '' when imported from a script -_RUNNING_AS_SCRIPT = not __package__ - -if _RUNNING_AS_SCRIPT: - # Running as script - use absolute imports - _pkg_dir = os.path.dirname(os.path.abspath(__file__)) - if _pkg_dir not in sys.path: - sys.path.insert(0, _pkg_dir) - from config import VoiceConfig, load_api_config, load_voice_config - from parser import normalize, parse_command - from state import RobotStep, SharedState - from api import start_api_server -else: - # Running as package - use relative imports - from .config import VoiceConfig, load_api_config, load_voice_config - from .parser import normalize, parse_command - from .state import RobotStep, SharedState - from .api import start_api_server - - -def _within_bounds( - point_mm: np.ndarray, - min_xyz: Tuple[Optional[float], Optional[float], Optional[float]], - max_xyz: Tuple[Optional[float], Optional[float], Optional[float]], -) -> bool: - """Check if point is within workspace bounds.""" - x, y, z = point_mm.tolist() - min_x, min_y, min_z = min_xyz - max_x, max_y, max_z = max_xyz - if min_x is not None and x < min_x: - return False - if max_x is not None and x > max_x: - return False - if min_y is not None and y < min_y: - return False - if max_y is not None and y > max_y: - return False - if min_z is not None and z < min_z: - return False - if max_z is not None and z > max_z: - return False - return True 
- - -def _translate_target(token: str, mapping: Dict[str, str]) -> str: - """Translate object name using class map.""" - if token in mapping: - return mapping[token] - return token - - -def _load_config_file(path: str) -> Dict[str, Any]: - """Load TOML configuration file.""" - if not path or not os.path.exists(path): - return {} - try: - with open(path, "rb") as handle: - return tomllib.load(handle) - except Exception: - return {} - - -def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]: - """Load bucket positions from config file.""" - cfg = _load_config_file(config_path) - buckets = cfg.get("bucket_positions", {}) - obj_cfg = cfg.get("object_parameters", {}) - base_z = float(obj_cfg.get("normal_height", 220.0)) - out = [] - for key, color in [ - ("blue_bucket_pos", "blue"), - ("red_bucket_pos", "red"), - ("yellow_bucket_pos", "yellow"), - ("white_bucket_pos", "white"), - ]: - pos = buckets.get(key) - if not isinstance(pos, list) or len(pos) < 2: - continue - out.append( - { - "object_type": "box", - "color": color, - "size": "big", - "position_mm": [float(pos[0]), float(pos[1]), base_z], - "source": "config", - } - ) - return out - - -def _send_dora_command( - node: Node, output_name: str, action: str, payload: Dict[str, Any] -) -> str: - """Send a robot command via Dora.""" - command_id = str(uuid.uuid4()) - message = {"id": command_id, "action": action, "payload": payload} - node.send_output( - output_name, - pa.array([json.dumps(message)]), - metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, - ) - return command_id - - -def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]: - """Parse status payload from robot.""" - if len(value) == 0: - return None - raw = value[0].as_py() - if not raw: - return None - try: - return json.loads(raw) - except Exception: - return None - - -def _log(msg: str) -> None: - """Print a timestamped log message.""" - timestamp = time.strftime("%H:%M:%S") - print(f"[voice_control {timestamp}] 
{msg}", flush=True) +from dora_voice_control.core import ( + load_voice_config, + load_llm_config, + load_api_config, + SharedState, + ActionContext, + VoiceInputHandler, + IntentQueue, + ObjectsHandler, + SceneNotifier, + SceneState, + PoseHandler, + StatusHandler, + CommandQueueService, + ImageHandler, + DoraNodeAdapter, + EventDispatcher, + VoiceControlLogger, + NodeContext, +) +from dora_voice_control.web import start_api_server +from dora_voice_control.robots import get_behavior, get_robot_adapter def main() -> None: """Main entry point for the voice control node.""" - _log("Starting voice control node...") + logger = VoiceControlLogger() + logger.log("Starting voice control node...") - # Load configuration + # Configuration cfg = load_voice_config() + llm_cfg = load_llm_config() api_cfg = load_api_config() # Environment variables for I/O topics objects_input = os.getenv("OBJECTS_INPUT", "objects") voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in") - voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out") - scene_output = os.getenv("SCENE_OUTPUT", "scene_update") pose_input = os.getenv("POSE_INPUT", "tcp_pose") status_input = os.getenv("STATUS_INPUT", "status") - command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd") image_input = os.getenv("IMAGE_INPUT", "image_annotated") - llm_provider = os.getenv("LLM_PROVIDER", "rules").lower() - config_file = os.getenv("CONFIG_FILE", "config.toml") - # Image dimensions (will be detected from first frame) - image_width = int(os.getenv("IMAGE_WIDTH", "1280")) - image_height = int(os.getenv("IMAGE_HEIGHT", "720")) + # Home position + home_pose = { + "x": float(os.getenv("INIT_X", "300.0")), + "y": float(os.getenv("INIT_Y", "0.0")), + "z": float(os.getenv("INIT_Z", "250.0")), + "roll": float(os.getenv("INIT_ROLL", "180.0")), + "pitch": float(os.getenv("INIT_PITCH", "0.0")), + "yaw": float(os.getenv("INIT_YAW", "0.0")), + } - # Initial/home position for reset command - init_x = float(os.getenv("INIT_X", 
"300.0")) - init_y = float(os.getenv("INIT_Y", "0.0")) - init_z = float(os.getenv("INIT_Z", "250.0")) - init_roll = float(os.getenv("INIT_ROLL", "180.0")) - init_pitch = float(os.getenv("INIT_PITCH", "0.0")) - init_yaw = float(os.getenv("INIT_YAW", "0.0")) + # State + state = SharedState() + scene = SceneState() + loaded = scene.load_from_config() + logger.log(f"Loaded {loaded} static objects from config") - _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm") - _log(f"Initial position: [{init_x}, {init_y}, {init_z}]") - _log(f"LLM provider: {llm_provider}") - _log(f"Dry run: {cfg.dry_run}") + # Robot adapter and behavior + robot_type = os.getenv("ROBOT_TYPE", "vacuum").lower() + robot_adapter = get_robot_adapter(robot_type, logger) + behavior = get_behavior(robot_type, cfg, llm_cfg) + logger.log(f"Robot type: {robot_type} ({type(robot_adapter).__name__})") + logger.log(f"Available actions: {list(behavior.get_all_keywords().keys())}") + logger.log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm") + logger.log(f"Initial position: [{home_pose['x']}, {home_pose['y']}, {home_pose['z']}]") + logger.log(f"LLM provider: {llm_cfg.provider}/{llm_cfg.model}") + logger.log(f"Dry run: {cfg.dry_run}") - # Initialize shared state - shared_state = SharedState() - state = shared_state.voice_state - state.static_objects = _load_bucket_objects(config_file) - pending_intents: Deque[Dict[str, Any]] = deque() + # Node context with output ports + ctx = NodeContext.create( + adapter=DoraNodeAdapter(Node()), + logger=logger, + robot_output=os.getenv("COMMAND_OUTPUT", "robot_cmd"), + scene_output=os.getenv("SCENE_OUTPUT", "scene_update"), + voice_output=os.getenv("VOICE_OUT_OUTPUT", "voice_out"), + ) - _log(f"Loaded {len(state.static_objects)} static objects from config") + # Services (using typed output ports) + intent_queue = IntentQueue() + scene_notifier = 
SceneNotifier(scene, ctx.scene_out, ctx.logger) + command_queue = CommandQueueService(state, ctx.robot_out, cfg, ctx.logger) - # Queue initial position movement on startup (same as reiniciar) - init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes") - send_init_scene_reset = init_on_start # Flag to send scene reset after node starts - if init_on_start: - _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]") - # Clear detected objects - state.latest_objects = [] - state.latest_objects_at = None - # Queue vacuum off and move to home - state.queue.append(RobotStep(action="vacuum_off", payload={})) - state.queue.append( - RobotStep( - action="move_to_pose", - payload={ - "x": init_x, - "y": init_y, - "z": init_z, - "roll": init_roll, - "pitch": init_pitch, - "yaw": init_yaw, - }, - ) - ) - - def command_handler(transcript: str) -> Dict[str, str]: - """Handle voice command and return response.""" - _log(f"Voice input received: \"{transcript}\"") - llm_result = parse_command(transcript, llm_provider) - _log(f"Parse result: {llm_result}") - - # Update debug state - shared_state.update_voice_input(transcript, llm_result, time.monotonic()) - - if llm_result.get("resultado") != "ok": - _log("Command not understood") - return {"text": "No entendi el comando", "status": "error"} - - action = llm_result.get("accion", "error") - obj = llm_result.get("objeto", "no especificado") - color = llm_result.get("color", "no especificado") - size = llm_result.get("tamano", "no especificado") - - _log(f"Intent: action={action}, object={obj}, color={color}, size={size}") - - pending_intents.append( - {"action": action, "obj": obj, "color": color, "size": size} - ) - - # Add to history - shared_state.add_to_history({ - "timestamp": time.time(), - "input": transcript, - "action": action, - "object": obj, - "color": color, - "size": size, - }) - - return {"text": f"Ok, voy a {action}", "status": "ok"} + # Handlers + voice_handler = 
VoiceInputHandler(state, behavior, intent_queue, ctx.voice_out, ctx.logger) # Set command callback for web interface - shared_state.set_command_callback(command_handler) + state.set_command_callback(voice_handler.create_command_callback()) - # Start web API server if enabled + # Queue initial movements on startup + init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes") + if init_on_start: + ctx.logger.log(f"Startup: resetting scene and moving to home [{home_pose['x']}, {home_pose['y']}, {home_pose['z']}]") + scene.clear_detected() + for step in robot_adapter.reset_tool(): + state.append_queue(step) + for step in robot_adapter.move(home_pose): + state.append_queue(step) + + # Event dispatcher + dispatcher = EventDispatcher(ctx.adapter, ctx.logger) + dispatcher.register(voice_in_input, voice_handler) + dispatcher.register(pose_input, PoseHandler(state, ctx.logger)) + dispatcher.register(objects_input, ObjectsHandler(scene, scene_notifier, ctx.logger)) + dispatcher.register(image_input, ImageHandler(state, ctx.logger)) + dispatcher.register(status_input, StatusHandler(state, ctx.logger)) + + # First event callback - send initial scene + if init_on_start: + dispatcher.on_first_event(scene_notifier.send_initial_scene) + + # Tick callbacks - process intents and dispatch commands + def process_intents() -> None: + """Process pending voice intents.""" + intent = intent_queue.popleft() + if not intent: + return + + ctx.logger.log(f"Processing intent: {intent.action} {intent.obj} {intent.color} {intent.size}") + + latest_pose = state.get_pose() + detected_count = len(scene.query(source="detected")) + config_count = len(scene.query(source="config")) + ctx.logger.log(f"Available objects: {detected_count} detected + {config_count} config") + + # Find target object if action requires it + action_info = behavior.get_action_info(intent.action) + target_obj = None + if action_info and action_info.requires_object: + target_obj = scene.find_by_criteria( + 
intent.obj, intent.color, intent.size, cfg.class_map + ) + if not target_obj: + ctx.logger.log(f"Target object not found: {intent.obj} {intent.color}") + return + ctx.logger.log(f"Found target: {target_obj.object_type} {target_obj.color} at {target_obj.position_mm}") + + # Build action context + context = ActionContext( + pose=latest_pose, + target=target_obj, + scene=scene, + config=cfg, + home_pose=home_pose, + shared_state=state, + ) + + # Execute action + if behavior.execute(intent.action, context): + ctx.logger.log(f"Executed action: {intent.action}") + if intent.action == "reiniciar": + scene_notifier.send_scene_reset() + else: + ctx.logger.log(f"Failed to execute action: {intent.action}") + + ctx.logger.log(f"Queue size: {state.queue_size()}") + + dispatcher.on_tick(process_intents) + dispatcher.on_tick(command_queue.dispatch_next) + + # Start API server if api_cfg.enabled: - start_api_server(shared_state, api_cfg) + start_api_server(state, scene, api_cfg, behavior.get_all_keywords) - # Create Dora node - node = Node() - _log("Dora node created, waiting for events...") - - first_event = True - for event in node: - # Send scene reset on first event (startup) - if first_event and send_init_scene_reset: - first_event = False - scene_payload = json.dumps( - {"objects": list(state.static_objects), "reset": True} - ) - node.send_output( - scene_output, - pa.array([scene_payload]), - metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, - ) - _log("Sent initial scene reset notification") - if event["type"] != "INPUT": - continue - - # Handle voice input - if event["id"] == voice_in_input: - raw = event["value"][0].as_py() if len(event["value"]) else "" - if not raw: - continue - response = command_handler(raw) - node.send_output( - voice_out_output, - pa.array([json.dumps(response)]), - metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, - ) - continue - - # Handle pose updates - if event["id"] == pose_input: - tcp_pose = 
event["value"].to_numpy().astype(np.float64).reshape(-1) - if tcp_pose.size >= 6: - state.latest_pose = tcp_pose[:6].tolist() - state.latest_pose_at = time.monotonic() - continue - - # Handle object detection updates - if event["id"] == objects_input: - raw = event["value"][0].as_py() if len(event["value"]) else "" - if raw: - try: - payload = json.loads(raw) - objects = payload.get("objects", []) - except Exception: - objects = [] - state.latest_objects = objects - state.latest_objects_at = time.monotonic() - continue - - # Handle camera image - if event["id"] == image_input: - try: - # Get raw image data - img_data = event["value"].to_numpy() - # Reshape to image (assuming BGR format) - img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8) - # Encode to JPEG - _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80]) - shared_state.update_image(jpeg_data.tobytes(), time.monotonic()) - except Exception as e: - # Log error but don't crash - pass - continue - - # Handle robot status updates - if event["id"] == status_input: - payload = _parse_status_payload(event["value"]) - if payload and state.pending_command: - if payload.get("command_id") == state.pending_command.get("id"): - _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})") - state.pending_command = None - continue - - # Process pending intents - if pending_intents: - intent = pending_intents.popleft() - action = intent["action"] - obj = intent["obj"] - color = intent["color"] - size = intent["size"] - - _log(f"Processing intent: {action} {obj} {color} {size}") - - latest_pose = state.latest_pose - objects = list(state.latest_objects) + list(state.static_objects) - _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static") - - if action in ("subir", "bajar") and latest_pose: - delta = cfg.step_mm if action == "subir" else -cfg.step_mm - target = np.array(latest_pose[:3], 
dtype=np.float64) - target[2] += delta - if _within_bounds(target, cfg.workspace_min, cfg.workspace_max): - step = RobotStep( - action="move_to_pose", - payload={ - "x": float(target[0]), - "y": float(target[1]), - "z": float(target[2]), - "roll": cfg.default_roll, - "pitch": cfg.default_pitch, - "yaw": cfg.default_yaw, - }, - ) - state.queue.append(step) - _log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})") - else: - _log(f"Target {target.tolist()} out of bounds, skipping") - - elif action in ("ir", "tomar", "soltar"): - target_obj = None - if obj != "no especificado": - target_name = _translate_target(obj, cfg.class_map) - target_color = _translate_target(color, cfg.class_map) - _log(f"Looking for: type={target_name}, color={target_color}") - # Log available objects for debugging - for o in objects: - _log(f" -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}") - for o in objects: - if o.get("object_type") == target_name: - if color == "no especificado" or o.get("color") == target_color: - if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map): - target_obj = o - break - if target_obj: - _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}") - pos = np.array(target_obj["position_mm"], dtype=np.float64) - approach = pos.copy() - approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm - target = pos.copy() - target[2] += cfg.tcp_offset_mm - if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max): - state.queue.append( - RobotStep( - action="move_to_pose", - payload={ - "x": float(approach[0]), - "y": float(approach[1]), - "z": float(approach[2]), - "roll": cfg.default_roll, - "pitch": cfg.default_pitch, - "yaw": cfg.default_yaw, - }, - ) - ) - _log(f"Queued: approach pose at Z={approach[2]:.1f}mm") - if _within_bounds(target, cfg.workspace_min, cfg.workspace_max): - state.queue.append( - RobotStep( - 
action="move_to_pose", - payload={ - "x": float(target[0]), - "y": float(target[1]), - "z": float(target[2]), - "roll": cfg.default_roll, - "pitch": cfg.default_pitch, - "yaw": cfg.default_yaw, - }, - ) - ) - _log(f"Queued: target pose at Z={target[2]:.1f}mm") - if action == "tomar": - state.queue.append(RobotStep(action="vacuum_on", payload={})) - _log("Queued: vacuum_on") - elif action == "soltar": - state.queue.append(RobotStep(action="vacuum_off", payload={})) - _log("Queued: vacuum_off") - else: - _log(f"Target object not found: {obj} {color}") - continue - - elif action == "reiniciar": - _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]") - # Turn off vacuum first - state.queue.append(RobotStep(action="vacuum_off", payload={})) - # Clear current detected objects (will be refreshed by detector) - state.latest_objects = [] - state.latest_objects_at = None - _log("Cleared detected objects - waiting for fresh detection") - # Move to initial position - state.queue.append( - RobotStep( - action="move_to_pose", - payload={ - "x": init_x, - "y": init_y, - "z": init_z, - "roll": init_roll, - "pitch": init_pitch, - "yaw": init_yaw, - }, - ) - ) - _log(f"Queued: vacuum_off + move to home") - # Send scene update to notify clients that scene was reset - scene_payload = json.dumps( - {"objects": list(state.static_objects), "reset": True} - ) - node.send_output( - scene_output, - pa.array([scene_payload]), - metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, - ) - _log("Sent scene reset notification") - - _log(f"Queue size: {len(state.queue)}") - - # Emit scene updates when objects change - if event["id"] == objects_input: - scene_payload = json.dumps( - {"objects": list(state.latest_objects) + list(state.static_objects)} - ) - node.send_output( - scene_output, - pa.array([scene_payload]), - metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, - ) - - # Send queued robot steps one at a time - if state.pending_command 
is None and state.queue: - step = state.queue.popleft() - if cfg.dry_run: - _log(f"[DRY RUN] Would send: {step.action} {step.payload}") - state.pending_command = None - continue - cmd_id = _send_dora_command(node, command_output, step.action, step.payload) - state.pending_command = {"id": cmd_id, "action": step.action} - _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}") - - # Update debug state - shared_state.update_robot_command( - {"id": cmd_id, "action": step.action, "payload": step.payload}, - time.monotonic(), - ) + # Run event loop + dispatcher.run() if __name__ == "__main__": diff --git a/dora_voice_control/dora_voice_control/parser.py b/dora_voice_control/dora_voice_control/parser.py deleted file mode 100644 index 231a710..0000000 --- a/dora_voice_control/dora_voice_control/parser.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Voice command parsing logic.""" - -from __future__ import annotations - -import json -import os -import unicodedata -from typing import Dict - - -def normalize(text: str) -> str: - """Normalize text: lowercase, remove accents.""" - text = text.lower().strip() - text = unicodedata.normalize("NFKD", text) - text = "".join([c for c in text if not unicodedata.combining(c)]) - return text - - -def rule_parse(transcript: str) -> Dict[str, str]: - """Parse voice command using rule-based approach.""" - text = normalize(transcript) - - action = "error" - if any(w in text for w in ["reiniciar", "reinicia", "reset"]): - action = "reiniciar" - elif any(w in text for w in ["sube", "subir", "arriba"]): - action = "subir" - elif any(w in text for w in ["baja", "bajar", "abajo"]): - action = "bajar" - elif any(w in text for w in ["soltar", "deja", "dejar"]): - action = "soltar" - elif any(w in text for w in ["tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar"]): - action = "tomar" - elif any(w in text for w in ["ir", "ve", "mover", "muevete", "acercar"]): - action = "ir" - - color = "no especificado" - if 
"rojo" in text: - color = "rojo" - elif "azul" in text: - color = "azul" - elif "amarillo" in text: - color = "amarillo" - elif "blanco" in text: - color = "blanco" - - obj = "no especificado" - if "estrella" in text: - obj = "estrella" - elif "cilindro" in text: - obj = "cilindro" - elif "cubo" in text: - obj = "cubo" - elif "caja" in text: - obj = "caja" - - size = "no especificado" - if "grande" in text: - size = "grande" - elif "pequeno" in text or "pequeño" in text or "chico" in text: - size = "pequeno" - - if action == "error": - return {"resultado": "error"} - return { - "resultado": "ok", - "accion": action, - "objeto": obj, - "color": color, - "tamano": size, - } - - -def build_gemini_prompt(transcript: str) -> str: - """Build prompt for Gemini LLM parsing.""" - return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar - un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color - o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase, - responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones - posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo', - 'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella, - cilindro, cubo y caja; cualquier otro objeto es error. - Comando: "{transcript}" - Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de - los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos - 'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto. 
- """ - - -def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]: - """Parse voice command using specified provider.""" - if llm_provider == "gemini": - try: - from google import genai - from google.genai import types - except Exception: - return rule_parse(transcript) - - api_key = os.getenv("GOOGLE_API_KEY") - if not api_key: - return rule_parse(transcript) - - try: - client = genai.Client(api_key=api_key) - prompt = build_gemini_prompt(transcript) - reply = client.models.generate_content( - model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"), - contents=prompt, - config=types.GenerateContentConfig(temperature=0.5), - ) - raw = str(reply.text).replace("```json", "").replace("```", "") - return json.loads(raw) - except json.JSONDecodeError: - return {"resultado": "error"} - except Exception: - return rule_parse(transcript) - else: - return rule_parse(transcript) diff --git a/dora_voice_control/dora_voice_control/robots/__init__.py b/dora_voice_control/dora_voice_control/robots/__init__.py new file mode 100644 index 0000000..e394b10 --- /dev/null +++ b/dora_voice_control/dora_voice_control/robots/__init__.py @@ -0,0 +1,50 @@ +"""Robot registry and factory helpers.""" + +from __future__ import annotations + +from typing import Optional + +from ..core.behavior import RobotBehavior +from ..core.config import LLMConfig, VoiceConfig +from ..core.node import VoiceControlLogger +from ..core.robot import RobotAdapter +from .littlehand.adapter import ( + ALIASES as _littlehand_aliases, + NAME as _littlehand_name, + VacuumGripperAdapter, +) +from .littlehand.behavior import LittlehandBehavior + + +def _resolve_robot_type(robot_type: str) -> tuple[str, bool]: + """Resolve a robot type or alias to a registered robot name.""" + token = robot_type.strip().lower() + if token in _littlehand_aliases or token == _littlehand_name: + return _littlehand_name, True + return _littlehand_name, False + + +def get_robot_adapter(robot_type: str, logger: 
Optional[VoiceControlLogger] = None) -> RobotAdapter: + """Factory to get robot adapter by type.""" + resolved, known = _resolve_robot_type(robot_type) + if not known and logger: + logger.warn(f"Unknown robot type '{robot_type}', defaulting to {resolved}") + return VacuumGripperAdapter() + + +def get_behavior( + robot_type: str, + config: VoiceConfig, + llm_config: Optional[LLMConfig] = None, +) -> RobotBehavior: + """Factory to get robot behavior by type.""" + resolved, _known = _resolve_robot_type(robot_type) + if resolved == _littlehand_name: + return LittlehandBehavior(config, VacuumGripperAdapter(), llm_config) + return LittlehandBehavior(config, VacuumGripperAdapter(), llm_config) + + +__all__ = [ + "get_robot_adapter", + "get_behavior", +] diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/__init__.py b/dora_voice_control/dora_voice_control/robots/littlehand/__init__.py new file mode 100644 index 0000000..48c3d81 --- /dev/null +++ b/dora_voice_control/dora_voice_control/robots/littlehand/__init__.py @@ -0,0 +1 @@ +"""Littlehand robot package.""" diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/actions.py b/dora_voice_control/dora_voice_control/robots/littlehand/actions.py new file mode 100644 index 0000000..6481e1b --- /dev/null +++ b/dora_voice_control/dora_voice_control/robots/littlehand/actions.py @@ -0,0 +1,48 @@ +"""Action definitions for Littlehand.""" + +from __future__ import annotations + +from ...core.behavior import ActionInfo + +# Littlehand starts from the default pick-and-place actions. 
+LITTLEHAND_ACTIONS: dict[str, ActionInfo] = { + "subir": ActionInfo( + name="subir", + aliases=["sube", "arriba"], + requires_pose=True, + description="Subir el robot", + ), + "bajar": ActionInfo( + name="bajar", + aliases=["baja", "abajo"], + requires_pose=True, + description="Bajar el robot", + ), + "ir": ActionInfo( + name="ir", + aliases=["ve", "mover", "muevete", "acercar"], + requires_object=True, + description="Ir hacia un objeto", + ), + "tomar": ActionInfo( + name="tomar", + aliases=["toma", "agarra", "agarrar", "coger", "chupar", "succionar"], + requires_pose=False, + requires_object=False, + description="Tomar un objeto", + ), + "soltar": ActionInfo( + name="soltar", + aliases=["deja", "dejar"], + requires_pose=False, + requires_object=False, + description="Soltar el objeto", + ), + "reiniciar": ActionInfo( + name="reiniciar", + aliases=["reinicia", "reset"], + requires_pose=False, + requires_object=False, + description="Reiniciar a posicion inicial", + ), +} diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/adapter.py b/dora_voice_control/dora_voice_control/robots/littlehand/adapter.py new file mode 100644 index 0000000..5414a8f --- /dev/null +++ b/dora_voice_control/dora_voice_control/robots/littlehand/adapter.py @@ -0,0 +1,28 @@ +"""Adapter for the Littlehand robot (vacuum gripper).""" + +from __future__ import annotations + +from typing import Any, Dict, List + +from ...core.robot import RobotAdapter +from ...core.state import RobotStep + + +class VacuumGripperAdapter(RobotAdapter): + """Adapter for robots with vacuum gripper (suction).""" + + def grab(self) -> List[RobotStep]: + return [RobotStep(action="vacuum_on", payload={})] + + def release(self) -> List[RobotStep]: + return [RobotStep(action="vacuum_off", payload={})] + + def move(self, payload: Dict[str, Any]) -> List[RobotStep]: + return [RobotStep(action="move_to_pose", payload=payload)] + + def reset_tool(self) -> List[RobotStep]: + return [RobotStep(action="vacuum_off", 
payload={})] + + +NAME = "littlehand" +ALIASES = {"vacuum", "littlehand"} diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py b/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py new file mode 100644 index 0000000..4bc6860 --- /dev/null +++ b/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py @@ -0,0 +1,57 @@ +"""Behavior for the Littlehand robot.""" + +from __future__ import annotations + +from typing import Callable + +from ...core.behavior import ActionContext, RobotBehavior +from .actions import LITTLEHAND_ACTIONS + + +class LittlehandBehavior(RobotBehavior): + """Littlehand behavior using the default pick-and-place actions.""" + + ACTIONS = LITTLEHAND_ACTIONS + + def action_handlers(self) -> dict[str, Callable[[ActionContext], bool]]: + return { + "subir": self.action_subir, + "bajar": self.action_bajar, + "ir": self.action_ir, + "tomar": self.action_tomar, + "soltar": self.action_soltar, + "reiniciar": self.action_reiniciar, + } + + def action_subir(self, ctx: ActionContext) -> bool: + """Move up by step_mm.""" + target_z = ctx.pose[2] + self.config.step_mm + return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z) + + def action_bajar(self, ctx: ActionContext) -> bool: + """Move down by step_mm.""" + target_z = ctx.pose[2] - self.config.step_mm + return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z) + + def action_ir(self, ctx: ActionContext) -> bool: + """Move to object position (approach + target).""" + pos = ctx.target.position_mm + self._queue_approach_sequence(ctx, pos) + return True + + def action_tomar(self, ctx: ActionContext) -> bool: + """Activate tool (low-level grab).""" + self._queue_steps(ctx, self.robot_adapter.grab()) + return True + + def action_soltar(self, ctx: ActionContext) -> bool: + """Deactivate tool (low-level release).""" + self._queue_steps(ctx, self.robot_adapter.release()) + return True + + def action_reiniciar(self, ctx: ActionContext) -> bool: + 
"""Reset: release tool, move home, clear objects.""" + self._queue_steps(ctx, self.robot_adapter.reset_tool()) + self._queue_steps(ctx, self.robot_adapter.move(ctx.home_pose)) + ctx.scene.clear_detected() + return True diff --git a/dora_voice_control/dora_voice_control/web/__init__.py b/dora_voice_control/dora_voice_control/web/__init__.py new file mode 100644 index 0000000..99f870d --- /dev/null +++ b/dora_voice_control/dora_voice_control/web/__init__.py @@ -0,0 +1,8 @@ +"""Web interface module.""" + +from .api import create_api, start_api_server + +__all__ = [ + "create_api", + "start_api_server", +] diff --git a/dora_voice_control/dora_voice_control/api.py b/dora_voice_control/dora_voice_control/web/api.py similarity index 62% rename from dora_voice_control/dora_voice_control/api.py rename to dora_voice_control/dora_voice_control/web/api.py index c2d1dc4..33b3399 100644 --- a/dora_voice_control/dora_voice_control/api.py +++ b/dora_voice_control/dora_voice_control/web/api.py @@ -2,31 +2,29 @@ from __future__ import annotations -import os -import sys import threading -from typing import Any +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING import uvicorn from fastapi import FastAPI, HTTPException from fastapi.responses import HTMLResponse, Response -# Handle both package and direct script execution -# __package__ is None when run as script, '' when imported from a script -if not __package__: - _pkg_dir = os.path.dirname(os.path.abspath(__file__)) - if _pkg_dir not in sys.path: - sys.path.insert(0, _pkg_dir) - from models import CommandRequest, CommandResponse - from state import SharedState - from templates import HTML_TEMPLATE -else: - from .models import CommandRequest, CommandResponse - from .state import SharedState - from .templates import HTML_TEMPLATE +from .models import CommandRequest, CommandResponse +from .templates import HTML_TEMPLATE +from ..core.state import SharedState + +if TYPE_CHECKING: + from ..core.scene import SceneState 
+ +# Type alias for vocabulary getter +VocabularyGetter = Callable[[], Dict[str, List[str]]] -def create_api(state: SharedState) -> FastAPI: +def create_api( + state: SharedState, + scene: "SceneState", + vocabulary_getter: Optional[VocabularyGetter] = None, +) -> FastAPI: """Create FastAPI application with voice control endpoints.""" app = FastAPI( title="Voice Control Debug API", @@ -49,9 +47,14 @@ def create_api(state: SharedState) -> FastAPI: @app.get("/api/objects") def get_objects() -> dict: - """Get detected and static objects.""" + """Get all scene objects.""" try: - return state.get_objects() + detected = [obj.to_dict() for obj in scene.query(source="detected")] + config_objs = [obj.to_dict() for obj in scene.query(source="config")] + return { + "detected": detected, + "static": config_objs, + } except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -67,8 +70,7 @@ def create_api(state: SharedState) -> FastAPI: def clear_queue() -> dict: """Clear the command queue.""" try: - with state._lock: - state.voice_state.queue.clear() + state.clear_queue() return {"ok": True} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -137,6 +139,35 @@ def create_api(state: SharedState) -> FastAPI: except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + @app.get("/api/debug/state") + def get_debug_state() -> dict: + """Get full system state for debugging.""" + try: + detected = [obj.to_dict() for obj in scene.query(source="detected")] + config_objs = [obj.to_dict() for obj in scene.query(source="config")] + return { + "pose": state.get_pose(), + "queue_size": state.queue_size(), + "pending_command": state.get_pending_command(), + "detected_objects": detected, + "static_objects": config_objs, + "scene_object_count": scene.count(), + "last_voice_input": state.get_last_voice_input(), + "last_parse_result": state.get_last_parse_result(), + } + except Exception as e: + raise HTTPException(status_code=500, 
detail=str(e)) + + @app.get("/api/debug/vocabulary") + def get_vocabulary() -> dict: + """Get all recognized keywords.""" + try: + if vocabulary_getter is None: + return {"error": "Vocabulary getter not configured"} + return vocabulary_getter() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + return app @@ -147,10 +178,23 @@ def run_uvicorn(app: FastAPI, host: str, port: int) -> None: server.run() -def start_api_server(state: SharedState, config: Any) -> threading.Thread: - """Start the API server in a background thread.""" +def start_api_server( + state: SharedState, + scene: "SceneState", + config: Any, + vocabulary_getter: Optional[VocabularyGetter] = None, +) -> threading.Thread: + """Start the API server in a background thread. + + Args: + state: Shared state container + scene: Scene state with object tracking + config: API configuration with host and port + vocabulary_getter: Optional function to get action keywords for debug endpoint + """ import time as _time - app = create_api(state) + + app = create_api(state, scene, vocabulary_getter) api_thread = threading.Thread( target=run_uvicorn, args=(app, config.host, config.port), diff --git a/dora_voice_control/dora_voice_control/models.py b/dora_voice_control/dora_voice_control/web/models.py similarity index 100% rename from dora_voice_control/dora_voice_control/models.py rename to dora_voice_control/dora_voice_control/web/models.py diff --git a/dora_voice_control/dora_voice_control/templates.py b/dora_voice_control/dora_voice_control/web/templates.py similarity index 100% rename from dora_voice_control/dora_voice_control/templates.py rename to dora_voice_control/dora_voice_control/web/templates.py diff --git a/dora_voice_control/pyproject.toml b/dora_voice_control/pyproject.toml index 92e6b1c..adde9c6 100644 --- a/dora_voice_control/pyproject.toml +++ b/dora_voice_control/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ ] [project.optional-dependencies] -llm = ["google-genai"] 
"""Tests for scene state with spatial relationships."""

import sys
import threading
from pathlib import Path

import pytest

# Make ``core/scene.py`` importable as a top-level ``scene`` module, and also
# add the package root so the tests run from a plain checkout (no install).
sys.path.insert(0, str(Path(__file__).parent.parent / "dora_voice_control" / "core"))
sys.path.insert(0, str(Path(__file__).parent.parent))

from scene import SceneState, SceneObject


class TestSceneObjectBasics:
    """Test SceneObject dataclass."""

    def test_create_scene_object(self):
        """Test basic SceneObject creation."""
        obj = SceneObject(
            id="cube_red_1",
            object_type="cube",
            color="red",
            position_mm=[100.0, 200.0, 50.0],
            height_mm=30.0,
        )
        assert obj.id == "cube_red_1"
        assert obj.object_type == "cube"
        assert obj.color == "red"
        assert obj.position_mm == [100.0, 200.0, 50.0]
        assert obj.height_mm == 30.0
        assert obj.source == "detected"  # default

    def test_center_property(self):
        """Test center property is alias for position_mm."""
        obj = SceneObject(
            id="obj1",
            object_type="cube",
            color="blue",
            position_mm=[10.0, 20.0, 30.0],
        )
        assert obj.center == obj.position_mm

    def test_to_dict(self):
        """Test to_dict serialization."""
        obj = SceneObject(
            id="test_1",
            object_type="cylinder",
            color="yellow",
            size="big",
            position_mm=[1.0, 2.0, 3.0],
            height_mm=40.0,
            source="config",
        )
        d = obj.to_dict()
        assert d["id"] == "test_1"
        assert d["object_type"] == "cylinder"
        assert d["color"] == "yellow"
        assert d["size"] == "big"
        assert d["position_mm"] == [1.0, 2.0, 3.0]
        assert d["height_mm"] == 40.0
        assert d["source"] == "config"


class TestSceneStateBasics:
    """Test basic SceneState operations."""

    def test_add_and_get(self):
        """Test adding and retrieving objects."""
        scene = SceneState()
        obj = SceneObject(
            id="cube_red_1",
            object_type="cube",
            color="red",
            position_mm=[100.0, 200.0, 50.0],
        )
        assert scene.add(obj) is True
        retrieved = scene.get("cube_red_1")
        assert retrieved is not None
        assert retrieved.id == "cube_red_1"

    def test_add_duplicate_fails(self):
        """Test adding duplicate ID fails."""
        scene = SceneState()
        obj1 = SceneObject(id="obj1", object_type="cube", color="red")
        obj2 = SceneObject(id="obj1", object_type="cylinder", color="blue")
        assert scene.add(obj1) is True
        assert scene.add(obj2) is False

    def test_delete(self):
        """Test deleting objects."""
        scene = SceneState()
        obj = SceneObject(id="to_delete", object_type="cube", color="red")
        scene.add(obj)
        assert scene.get("to_delete") is not None
        assert scene.delete("to_delete") is True
        assert scene.get("to_delete") is None

    def test_delete_nonexistent(self):
        """Test deleting non-existent object returns False."""
        scene = SceneState()
        assert scene.delete("nonexistent") is False

    def test_update(self):
        """Test updating object properties."""
        scene = SceneState()
        obj = SceneObject(
            id="to_update",
            object_type="cube",
            color="red",
            position_mm=[0.0, 0.0, 0.0],
        )
        scene.add(obj)
        assert scene.update("to_update", color="blue", position_mm=[1.0, 2.0, 3.0])
        updated = scene.get("to_update")
        assert updated.color == "blue"
        assert updated.position_mm == [1.0, 2.0, 3.0]

    def test_get_all(self):
        """Test getting all objects."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        scene.add(SceneObject(id="b", object_type="cube", color="blue"))
        all_objs = scene.get_all()
        assert len(all_objs) == 2
        ids = {o.id for o in all_objs}
        assert ids == {"a", "b"}

    def test_count(self):
        """Test object count."""
        scene = SceneState()
        assert scene.count() == 0
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        assert scene.count() == 1
        scene.add(SceneObject(id="b", object_type="cube", color="blue"))
        assert scene.count() == 2

    def test_clear(self):
        """Test clearing all objects."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        scene.add(SceneObject(id="b", object_type="cube", color="blue"))
        scene.clear()
        assert scene.count() == 0


class TestSceneStateQuery:
    """Test SceneState query operations."""

    def test_query_by_type(self):
        """Test querying by object type."""
        scene = SceneState()
        scene.add(SceneObject(id="cube1", object_type="cube", color="red"))
        scene.add(SceneObject(id="cyl1", object_type="cylinder", color="red"))
        scene.add(SceneObject(id="cube2", object_type="cube", color="blue"))

        cubes = scene.query(object_type="cube")
        assert len(cubes) == 2
        assert all(o.object_type == "cube" for o in cubes)

    def test_query_by_color(self):
        """Test querying by color."""
        scene = SceneState()
        scene.add(SceneObject(id="r1", object_type="cube", color="red"))
        scene.add(SceneObject(id="r2", object_type="cylinder", color="red"))
        scene.add(SceneObject(id="b1", object_type="cube", color="blue"))

        reds = scene.query(color="red")
        assert len(reds) == 2
        assert all(o.color == "red" for o in reds)

    def test_query_by_size(self):
        """Test querying by size."""
        scene = SceneState()
        scene.add(SceneObject(id="big1", object_type="cube", color="red", size="big"))
        scene.add(SceneObject(id="small1", object_type="cube", color="blue", size="small"))

        big = scene.query(size="big")
        assert len(big) == 1
        assert big[0].id == "big1"

    def test_query_by_source(self):
        """Test querying by source."""
        scene = SceneState()
        scene.add(SceneObject(id="d1", object_type="cube", color="red", source="detected"))
        scene.add(SceneObject(id="c1", object_type="box", color="blue", source="config"))

        detected = scene.query(source="detected")
        assert len(detected) == 1
        assert detected[0].id == "d1"

    def test_query_multiple_criteria(self):
        """Test querying with multiple criteria."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red", size="big"))
        scene.add(SceneObject(id="b", object_type="cube", color="red", size="small"))
        scene.add(SceneObject(id="c", object_type="cube", color="blue", size="big"))

        results = scene.query(object_type="cube", color="red", size="big")
        assert len(results) == 1
        assert results[0].id == "a"

    def test_query_no_matches(self):
        """Test query with no matches returns empty list."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        results = scene.query(color="purple")
        assert results == []


class TestSceneStateRelationships:
    """Test spatial relationship methods."""

    def test_set_on_top_of(self):
        """Test manually setting stacking relationship."""
        scene = SceneState()
        scene.add(SceneObject(id="bottom", object_type="cube", color="red"))
        scene.add(SceneObject(id="top", object_type="cube", color="blue"))

        assert scene.set_on_top_of("top", "bottom") is True
        top = scene.get("top")
        assert top.on_top_of == "bottom"

    def test_set_on_top_of_invalid(self):
        """Test setting relationship with invalid IDs."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))

        # Non-existent top object
        assert scene.set_on_top_of("nonexistent", "a") is False
        # Non-existent bottom object
        assert scene.set_on_top_of("a", "nonexistent") is False

    def test_get_object_below(self):
        """Test getting object below."""
        scene = SceneState()
        scene.add(SceneObject(id="bottom", object_type="cube", color="red"))
        scene.add(SceneObject(id="top", object_type="cube", color="blue", on_top_of="bottom"))

        below = scene.get_object_below("top")
        assert below is not None
        assert below.id == "bottom"

    def test_get_objects_above(self):
        """Test getting objects stacked on top."""
        scene = SceneState()
        scene.add(SceneObject(id="base", object_type="cube", color="red"))
        scene.add(SceneObject(id="middle", object_type="cube", color="blue", on_top_of="base"))
        scene.add(SceneObject(id="side", object_type="cube", color="yellow", on_top_of="base"))

        above = scene.get_objects_above("base")
        assert len(above) == 2
        ids = {o.id for o in above}
        assert ids == {"middle", "side"}

    def test_effective_height_no_stack(self):
        """Test effective height for single object."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red", height_mm=30.0))
        assert scene.effective_height("a") == 30.0

    def test_effective_height_stacked(self):
        """Test effective height for stacked objects."""
        scene = SceneState()
        scene.add(SceneObject(id="bottom", object_type="cube", color="red", height_mm=30.0))
        scene.add(SceneObject(id="top", object_type="cube", color="blue", height_mm=25.0, on_top_of="bottom"))

        assert scene.effective_height("bottom") == 30.0
        assert scene.effective_height("top") == 55.0  # 30 + 25

    def test_get_stack(self):
        """Test getting full stack."""
        scene = SceneState()
        scene.add(SceneObject(id="bottom", object_type="cube", color="red"))
        scene.add(SceneObject(id="middle", object_type="cube", color="blue", on_top_of="bottom"))
        scene.add(SceneObject(id="top", object_type="cube", color="yellow", on_top_of="middle"))

        stack = scene.get_stack("middle")
        assert len(stack) == 3
        assert stack[0].id == "bottom"
        assert stack[1].id == "middle"
        assert stack[2].id == "top"

    def test_delete_clears_relationships(self):
        """Test that deleting object clears relationships pointing to it."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        scene.add(SceneObject(id="b", object_type="cube", color="blue", on_top_of="a"))

        assert scene.get("b").on_top_of == "a"
        scene.delete("a")
        obj_b = scene.get("b")
        assert obj_b.on_top_of is None  # Relationship cleared


class TestSceneStateStackingInference:
    """Test automatic stacking inference from positions."""

    def test_infer_stacking_simple(self):
        """Test basic stacking inference."""
        scene = SceneState()

        # Bottom cube at z=30
        scene.add(SceneObject(
            id="bottom",
            object_type="cube",
            color="red",
            position_mm=[0.0, 0.0, 30.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))
        # Top cube at z=60 (30 + 30 = 60, sitting on bottom)
        scene.add(SceneObject(
            id="top",
            object_type="cube",
            color="blue",
            position_mm=[0.0, 0.0, 60.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))

        scene.infer_relationships()

        top = scene.get("top")
        assert top.on_top_of == "bottom"

    def test_infer_stacking_no_overlap(self):
        """Test that objects without horizontal overlap are not stacked."""
        scene = SceneState()

        # Object at x=0
        scene.add(SceneObject(
            id="a",
            object_type="cube",
            color="red",
            position_mm=[0.0, 0.0, 30.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))
        # Object at x=100 (no overlap), same z would indicate stack
        scene.add(SceneObject(
            id="b",
            object_type="cube",
            color="blue",
            position_mm=[100.0, 0.0, 60.0],
            height_mm=30.0,
            bbox_mm=[85.0, -15.0, 115.0, 15.0],
        ))

        scene.infer_relationships()

        b = scene.get("b")
        assert b.on_top_of is None  # No stacking - no overlap

    def test_infer_stacking_via_replace_detected(self):
        """Test stacking is inferred after replace_detected."""
        scene = SceneState()

        scene.replace_detected([
            {
                "object_type": "cube",
                "color": "red",
                "position_mm": [0.0, 0.0, 30.0],
                "height_mm": 30.0,
                "bbox_mm": [-15.0, -15.0, 15.0, 15.0],
            },
            {
                "object_type": "cube",
                "color": "blue",
                "position_mm": [0.0, 0.0, 60.0],  # z = 30 + 30
                "height_mm": 30.0,
                "bbox_mm": [-15.0, -15.0, 15.0, 15.0],
            },
        ])

        # Should have auto-inferred the stacking
        blues = scene.query(color="blue")
        assert len(blues) == 1
        assert blues[0].on_top_of is not None


class TestSceneStateDetectorIntegration:
    """Test detector integration methods."""

    def test_replace_detected_basic(self):
        """Test replacing detected objects."""
        scene = SceneState()

        scene.replace_detected([
            {"object_type": "cube", "color": "red", "position_mm": [1.0, 2.0, 3.0]},
            {"object_type": "cylinder", "color": "blue"},
        ])

        assert scene.count() == 2
        all_objs = scene.get_all()
        assert all(o.source == "detected" for o in all_objs)

    def test_replace_detected_keeps_config(self):
        """Test that replace_detected keeps config objects."""
        scene = SceneState()

        # Add a config object
        scene.add(SceneObject(
            id="config_box_1",
            object_type="box",
            color="yellow",
            source="config",
        ))

        # Replace detected
        scene.replace_detected([
            {"object_type": "cube", "color": "red"},
        ])

        # Config object should still be there
        assert scene.get("config_box_1") is not None
        config_objs = scene.query(source="config")
        assert len(config_objs) == 1

        # Detected object should be added
        detected = scene.query(source="detected")
        assert len(detected) == 1

    def test_replace_detected_replaces_old_detected(self):
        """Test that replace_detected replaces previous detected objects."""
        scene = SceneState()

        # First detection
        scene.replace_detected([
            {"object_type": "cube", "color": "red"},
            {"object_type": "cube", "color": "blue"},
        ])
        assert len(scene.query(source="detected")) == 2

        # Second detection
        scene.replace_detected([
            {"object_type": "cylinder", "color": "green"},
        ])
        detected = scene.query(source="detected")
        assert len(detected) == 1
        assert detected[0].object_type == "cylinder"

    def test_replace_detected_generates_ids(self):
        """Test that replace_detected generates unique IDs."""
        scene = SceneState()

        scene.replace_detected([
            {"object_type": "cube", "color": "red"},
            {"object_type": "cube", "color": "red"},
        ])

        all_objs = scene.get_all()
        ids = [o.id for o in all_objs]
        assert len(ids) == len(set(ids))  # All IDs should be unique

    def test_clear_detected(self):
        """Test clearing only detected objects."""
        scene = SceneState()

        scene.add(SceneObject(id="config1", object_type="box", color="green", source="config"))
        scene.replace_detected([
            {"object_type": "cube", "color": "red"},
        ])

        assert scene.count() == 2
        scene.clear_detected()
        assert scene.count() == 1
        assert scene.get("config1") is not None

    def test_to_list(self):
        """Test serializing all objects to list."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        scene.add(SceneObject(id="b", object_type="cube", color="blue"))

        lst = scene.to_list()
        assert len(lst) == 2
        assert all(isinstance(d, dict) for d in lst)
        ids = {d["id"] for d in lst}
        assert ids == {"a", "b"}


class TestSceneStateThreadSafety:
    """Test thread safety of SceneState."""

    def test_concurrent_add_and_query(self):
        """Test concurrent add and query operations."""
        scene = SceneState()
        errors = []

        def adder():
            try:
                for i in range(100):
                    scene.add(SceneObject(
                        id=f"obj_{threading.current_thread().name}_{i}",
                        object_type="cube",
                        color="red",
                    ))
            except Exception as e:
                errors.append(e)

        def querier():
            try:
                for _ in range(100):
                    scene.query(object_type="cube")
            except Exception as e:
                errors.append(e)

        threads = [
            threading.Thread(target=adder, name="adder1"),
            threading.Thread(target=adder, name="adder2"),
            threading.Thread(target=querier, name="querier"),
        ]

        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert not errors, f"Thread errors: {errors}"


class TestFindByCriteria:
    """Test find_by_criteria method (voice command matching)."""

    def test_find_by_type_and_color(self):
        """Test finding object by type and color."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        scene.add(SceneObject(id="b", object_type="cube", color="blue"))

        result = scene.find_by_criteria("cube", "red", "no especificado")
        assert result is not None
        assert result.id == "a"

    def test_find_with_class_map(self):
        """Test finding with class_map translation."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))

        # Spanish to English mapping
        class_map = {"cubo": "cube", "rojo": "red"}
        result = scene.find_by_criteria("cubo", "rojo", "no especificado", class_map)
        assert result is not None
        assert result.id == "a"

    def test_find_unspecified_returns_none(self):
        """Test that 'no especificado' object type returns None."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))

        result = scene.find_by_criteria("no especificado", "red", "no especificado")
        assert result is None

    def test_find_no_match(self):
        """Test finding with no matching object."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))

        result = scene.find_by_criteria("cylinder", "blue", "no especificado")
        assert result is None

    def test_find_with_size(self):
        """Test finding with size criteria."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red", size="big"))
        scene.add(SceneObject(id="b", object_type="cube", color="red", size="small"))

        result = scene.find_by_criteria("cube", "red", "small")
        assert result is not None
        assert result.id == "b"


class TestLoadFromConfig:
    """Test load_from_config method."""

    def test_load_nonexistent_file(self):
        """Test loading from nonexistent file returns 0."""
        scene = SceneState()
        count = scene.load_from_config("/nonexistent/path.toml")
        assert count == 0
        assert scene.count() == 0

    def test_load_from_config_with_objects(self, tmp_path):
        """Test loading objects from config file."""
        config_file = tmp_path / "config.toml"
        config_file.write_text("""
[object_parameters]
normal_height = 100.0

[[static_objects]]
type = "box"
color = "blue"
position = [150.0, -200.0]
size = "big"

[[static_objects]]
type = "box"
color = "red"
position = [250.0, -200.0]
""")

        scene = SceneState()
        count = scene.load_from_config(str(config_file))

        assert count == 2
        assert scene.count() == 2

        blues = scene.query(color="blue")
        assert len(blues) == 1
        assert blues[0].object_type == "box"
        assert blues[0].position_mm[2] == 100.0  # normal_height
        assert blues[0].source == "config"

    def test_load_empty_config(self, tmp_path):
        """Test loading from empty config file."""
        config_file = tmp_path / "empty.toml"
        config_file.write_text("")

        scene = SceneState()
        count = scene.load_from_config(str(config_file))
        assert count == 0
+DETECTION_BOX_THICKNESS = 2 + + +def _log(msg: str) -> None: + """Print a timestamped log message.""" + timestamp = time.strftime("%H:%M:%S") + print(f"[detector {timestamp}] {msg}", flush=True) + @dataclass class DetectionConfig: @@ -46,8 +57,8 @@ def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]: parts = [p.strip() for p in raw.split(",")] if len(parts) >= 2: return int(parts[0]), int(parts[1]) - except Exception: - pass + except Exception as e: + _log(f"Warning: failed to parse int pair '{raw}': {e}") return default @@ -56,8 +67,8 @@ def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, fl parts = [p.strip() for p in raw.split(",")] if len(parts) >= 2: return float(parts[0]), float(parts[1]) - except Exception: - pass + except Exception as e: + _log(f"Warning: failed to parse float pair '{raw}': {e}") return default @@ -66,8 +77,8 @@ def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int parts = [p.strip() for p in raw.split(",")] if len(parts) >= 3: return int(parts[0]), int(parts[1]), int(parts[2]) - except Exception: - pass + except Exception as e: + _log(f"Warning: failed to parse color '{raw}': {e}") return default @@ -159,10 +170,9 @@ def _sample_point( point_xyz = point_cloud[y, x, :3].astype(np.float64) if _valid_point(point_xyz, cfg): return point_xyz - radius = 3 samples = [] - for dy in range(-radius, radius + 1): - for dx in range(-radius, radius + 1): + for dy in range(-POINT_SEARCH_RADIUS, POINT_SEARCH_RADIUS + 1): + for dx in range(-POINT_SEARCH_RADIUS, POINT_SEARCH_RADIUS + 1): xx = x + dx yy = y + dy if xx < 0 or yy < 0 or xx >= w or yy >= h: @@ -218,7 +228,8 @@ def _load_config_file(path: str) -> Dict[str, Any]: try: with open(path, "rb") as handle: return tomllib.load(handle) - except Exception: + except Exception as e: + _log(f"Warning: failed to load config file '{path}': {e}") return {} @@ -351,11 +362,11 @@ def _draw_detections( color = color_map.get(color_name, 
(128, 128, 128)) # Draw bounding box - cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2) + cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, DETECTION_BOX_THICKNESS) # Draw label background label = f"{obj_type} {color_name} {size}" - (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, DEFAULT_FONT_SCALE, 1) cv2.rectangle( annotated, (bbox[0], bbox[1] - th - 8), @@ -368,7 +379,7 @@ def _draw_detections( label, (bbox[0] + 2, bbox[1] - 4), cv2.FONT_HERSHEY_SIMPLEX, - 0.5, + DEFAULT_FONT_SCALE, (0, 0, 0), 1, ) diff --git a/dora_detector/pyproject.toml b/dora_yolo_object_detector/pyproject.toml similarity index 77% rename from dora_detector/pyproject.toml rename to dora_yolo_object_detector/pyproject.toml index 15c1ce3..c26c7a3 100644 --- a/dora_detector/pyproject.toml +++ b/dora_yolo_object_detector/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "dora-detector" +name = "dora-yolo-object-detector" version = "0.1.0" license = { file = "MIT" } authors = [{ name = "Dora" }] @@ -16,4 +16,4 @@ dependencies = [ ] [project.scripts] -dora-detector = "dora_detector.main:main" +dora-yolo-object-detector = "dora_yolo_object_detector.main:main"