Refactor voice control core and robot behavior

This commit is contained in:
cristhian aguilera
2026-02-02 12:29:59 -03:00
parent b9798a2f46
commit 695d309816
36 changed files with 3436 additions and 1065 deletions

6
.markdownlint.json Normal file
View File

@@ -0,0 +1,6 @@
{
"MD013": {
"line_length": 120
},
"MD041": false
}

View File

@@ -38,6 +38,13 @@ Dora
- Prefer small nodes with clear inputs/outputs. - Prefer small nodes with clear inputs/outputs.
- Validate node I/O types and message schemas. - Validate node I/O types and message schemas.
Building and Running
- Build dataflows with: `dora build <dataflow.yml> --uv`
- The `--uv` flag uses uv for fast Python dependency management.
- Run dataflows with: `dora start <dataflow.yml>`
- Stop with: `dora stop`
- Check logs with: `dora logs`
Testing and Verification Testing and Verification
- Add or update tests when feasible. - Add or update tests when feasible.
- For robot-facing changes, provide a staged validation plan: - For robot-facing changes, provide a staged validation plan:

44
CLAUDE.md Normal file
View File

@@ -0,0 +1,44 @@
# CLAUDE.md
## Purpose
Project guidelines for Claude Code working in this repository.
## Project Overview
Robotics project using Dora to control a ULite6 robot arm with Spanish voice commands for children.
## Building and Running
- Build: `dora build <dataflow.yml> --uv`
- Start: `dora run <dataflow.yml> --uv`
## Key Packages
- `dora_voice_control`: Voice processing and robot behavior
- `dora_yolo_object_detector`: YOLO-based object detection
- `dora_ulite6`: Robot arm control
- `dora_zed_cpp`: ZED camera (C++)
- `dora_iobridge`: WebSocket bridge for external clients
## Robot Adapter
Voice control uses a robot adapter pattern to support different robots:
- `ROBOT_TYPE=vacuum`: Vacuum gripper (grab→vacuum_on, release→vacuum_off)
- `ROBOT_TYPE=gripper`: Parallel gripper (grab→gripper_close, release→gripper_open)
To add a new robot, create a new adapter in `robot/adapter.py`.
## Guidelines
- Keep the event loop responsive. Run slow operations (LLM) in threads.
- Robot commands are serialized: one command at a time, wait for status before next.
- State is shared via `SharedState` (thread-safe).
- Handlers process events, tick callbacks run after each event.
## Safety
- This controls a real robot. Test changes carefully.
- Workspace bounds are enforced in behaviors.
- Use `DRY_RUN=true` to test without robot motion.

View File

@@ -1,29 +1,27 @@
## Getting started ## Getting Started
- Install it with: Build and run:
```bash ```bash
uv venv -p 3.12 --seed dora build <dataflow.yml> --uv
dora build [dataflow.yml] --uv dora run <dataflow.yml> --uv
```
- Run it with:
```bash
dora run [dataflow.yml] --uv
``` ```
## Dataflows ## Dataflows
| Dataflow | Description | | Dataflow | Description |
|----------|-------------| |----------|-------------|
| `dataflow_ulite6.yml` | Ufactory ULite6 robot control with web UI| | `dataflow_voice_control_ulite6_zed.yml` | Voice-controlled robot with object detection |
| `dataflow_ulite6.yml` | ULite6 robot control with web UI |
| `dataflow_zed_cpp.yml` | ZED camera capture with image viewer | | `dataflow_zed_cpp.yml` | ZED camera capture with image viewer |
## Nodes ## Nodes
| Node | Description | | Node | Description |
|------|-------------| |------|-------------|
| `ulite6` | UFactory Lite6 robot controller with REST API and web UI | | `dora_voice_control` | Voice command processing and robot behavior |
| `zed_camera_cpp` | ZED stereo camera capture (C++) | | `dora_yolo_object_detector` | YOLO-based object detection with ZED point cloud |
| `image_viewer` | Display images from Dora stream | | `dora_ulite6` | UFactory Lite6 robot controller with REST API |
| `dora_zed_cpp` | ZED stereo camera capture (C++) |
| `dora_iobridge` | I/O bridge for voice websocket |
| `dora_image_viewer` | Display images from Dora stream |

View File

@@ -43,22 +43,19 @@ nodes:
path: dora_iobridge/dora_iobridge/main.py path: dora_iobridge/dora_iobridge/main.py
env: env:
VIRTUAL_ENV: ./.venv VIRTUAL_ENV: ./.venv
VOICE_HOST: "0.0.0.0" WS_HOST: "0.0.0.0"
VOICE_PORT: "8765" WS_PORT: "9001"
VOICE_IN_OUTPUT: "voice_in"
VOICE_OUT_INPUT: "voice_out"
SCENE_INPUT: "scene_update"
inputs: inputs:
voice_out: voice/voice_out text_in: voice/voice_out
scene_update: voice/scene_update data_in: voice/scene_update
tick: dora/timer/millis/100 tick: dora/timer/millis/100
outputs: outputs:
- voice_in - text_out
- id: detector - id: detector
build: | build: |
uv venv -p 3.12 --seed --allow-existing uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_detector uv pip install -e dora_yolo_object_detector
path: dora_detector/dora_detector/main.py path: dora_yolo_object_detector/dora_yolo_object_detector/main.py
env: env:
VIRTUAL_ENV: ./.venv VIRTUAL_ENV: ./.venv
IMAGE_INPUT: "image_bgr" IMAGE_INPUT: "image_bgr"
@@ -90,6 +87,7 @@ nodes:
path: dora_voice_control/dora_voice_control/main.py path: dora_voice_control/dora_voice_control/main.py
env: env:
VIRTUAL_ENV: ./.venv VIRTUAL_ENV: ./.venv
ROBOT_TYPE: "vacuum" # "vacuum" or "gripper"
OBJECTS_INPUT: "objects" OBJECTS_INPUT: "objects"
POSE_INPUT: "tcp_pose" POSE_INPUT: "tcp_pose"
STATUS_INPUT: "status" STATUS_INPUT: "status"
@@ -119,12 +117,12 @@ nodes:
IMAGE_WIDTH: "1280" IMAGE_WIDTH: "1280"
IMAGE_HEIGHT: "720" IMAGE_HEIGHT: "720"
API_ENABLED: "true" API_ENABLED: "true"
API_PORT: "8080" API_PORT: "9002"
inputs: inputs:
objects: detector/objects objects: detector/objects
tcp_pose: ulite6/tcp_pose tcp_pose: ulite6/tcp_pose
status: ulite6/status status: ulite6/status
voice_in: iobridge/voice_in voice_in: iobridge/text_out
image_annotated: detector/image_annotated image_annotated: detector/image_annotated
tick: dora/timer/millis/100 tick: dora/timer/millis/100
outputs: outputs:

View File

@@ -1,178 +1,106 @@
# Dora IOBridge Node # Dora IOBridge Node
A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates. Generic WebSocket bridge between web clients and Dora dataflow.
## Inputs/Outputs ## Dora Outputs (WebSocket → Dora)
| Input | Type | Description | | Output | Type | Description |
|----------------|--------|---------------------------------------| |-------------|--------|--------------------------|
| `voice_out` | JSON | Response from voice control node | | `text_out` | string | Text from clients |
| `scene_update` | JSON | Scene objects from voice control | | `audio_out` | bytes | WAV audio from clients |
| `data_out` | JSON | Generic data from clients|
| Output | Type | Description | ## Dora Inputs (Dora → WebSocket)
|----------------|--------|---------------------------------------|
| `voice_in` | string | Voice commands forwarded to Dora | | Input | Type | Description |
|-------------|--------|--------------------------|
| `text_in` | string | Text to broadcast |
| `audio_in` | bytes | WAV audio to broadcast |
| `data_in` | JSON | Generic data to broadcast|
## Environment Variables ## Environment Variables
```bash | Variable | Default | Description |
VOICE_HOST=0.0.0.0 # Bind address |-----------|-----------|--------------|
VOICE_PORT=8765 # Listen port | `WS_HOST` | `0.0.0.0` | Bind address |
| `WS_PORT` | `8765` | Listen port |
## WebSocket Endpoint
```text
ws://{WS_HOST}:{WS_PORT}
``` ```
## Installation Default: `ws://0.0.0.0:8765`
```bash ## WebSocket Protocol
cd dora_iobridge
pip install -e . ### Client → Server
| Type | Field | Description |
|---------|-----------|-----------------------|
| `text` | `content` | Text string |
| `audio` | `content` | Base64-encoded WAV |
| `data` | `payload` | Any JSON object |
| `ping` | - | Health check |
Examples:
```json
{"type": "text", "content": "hello world"}
{"type": "audio", "content": "UklGRi4AAABXQVZFZm10..."}
{"type": "data", "payload": {"key": "value"}}
{"type": "ping"}
```
### Server → Client
| Type | Field | Description |
|---------|-----------|-----------------------|
| `text` | `content` | Text string |
| `audio` | `content` | Base64-encoded WAV |
| `data` | `payload` | Any JSON object |
| `pong` | - | Response to ping |
| `error` | `message` | Error description |
Examples:
```json
{"type": "text", "content": "response text"}
{"type": "audio", "content": "UklGRi4A...", "format": "wav"}
{"type": "data", "payload": {"objects": [...]}}
{"type": "pong"}
{"type": "error", "message": "Invalid JSON"}
```
## Dataflow Example
```yaml
- id: iobridge
build: uv pip install -e dora_iobridge
path: dora_iobridge/dora_iobridge/main.py
env:
WS_HOST: "0.0.0.0"
WS_PORT: "8765"
inputs:
text_in: voice/voice_out
data_in: voice/scene_update
outputs:
- text_out
``` ```
## Testing ## Testing
### Test with WebSocket (wscat)
```bash
# Install wscat
npm install -g wscat
# Connect to the server
wscat -c ws://localhost:8765
```
### Test with curl (websocat)
```bash ```bash
# Install websocat # Install websocat
# Ubuntu: sudo apt install websocat sudo apt install websocat
# macOS: brew install websocat
# Send a ping # Connect
echo '{"type": "ping"}' | websocat ws://localhost:8765 websocat ws://localhost:8765
# Response: {"type": "pong"}
# Send a voice command # Send text
echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765 {"type": "text", "content": "hello"}
# Request scene refresh # Ping
echo '{"type": "scene_refresh"}' | websocat ws://localhost:8765
```
### Test with Python
```python
import asyncio
import websockets
import json
async def test_iobridge():
uri = "ws://localhost:8765"
async with websockets.connect(uri) as ws:
# Test ping
await ws.send(json.dumps({"type": "ping"}))
response = await ws.recv()
print(f"Ping response: {response}")
# Send command
await ws.send(json.dumps({
"type": "command",
"text": "agarra el cubo rojo"
}))
# Listen for responses
async for message in ws:
data = json.loads(message)
print(f"Received: {data}")
asyncio.run(test_iobridge())
```
### Test with curl (HTTP upgrade not supported directly)
Since WebSocket requires an upgrade handshake, use this shell script:
```bash
#!/bin/bash
# test_iobridge.sh
# Using websocat for interactive testing
websocat ws://localhost:8765 <<EOF
{"type": "ping"}
{"type": "command", "text": "sube"}
{"type": "scene_refresh"}
EOF
```
## WebSocket Message Types
### Client -> Server
**Command (voice input)**
```json
{"type": "command", "text": "agarra el cubo rojo"}
```
**Ping (health check)**
```json
{"type": "ping"} {"type": "ping"}
``` ```
Response: `{"type": "pong"}`
**Scene Refresh**
```json
{"type": "scene_refresh"}
```
### Server -> Client (Broadcasts)
**Command Response**
```json
{
"type": "response",
"text": "Ok, voy a tomar",
"status": "ok"
}
```
**Scene Update**
```json
{
"type": "scene_updated",
"objects": [
{
"object_type": "cubo",
"color": "rojo",
"size": "big",
"position_mm": [150.0, 200.0, 280.0],
"source": "detection"
}
]
}
```
## Dora Dataflow Configuration
```yaml
nodes:
- id: iobridge
build: pip install -e ./dora_iobridge
path: dora_iobridge
inputs:
voice_out: voice_control/voice_out
scene_update: voice_control/scene_update
outputs:
- voice_in
env:
VOICE_HOST: "0.0.0.0"
VOICE_PORT: "8765"
```
```bash
dora up
dora start dataflow.yml
```
## Dependencies
- dora-rs >= 0.3.9
- pyarrow >= 12.0.0
- websockets >= 12.0

View File

@@ -1,8 +1,14 @@
"""Dora node bridging WebSocket IO to Dora topics.""" """Dora node bridging WebSocket IO to Dora topics.
Generic I/O bridge with fixed endpoints:
- Outputs (WS → Dora): text_out, audio_out, data_out
- Inputs (Dora → WS): text_in, audio_in, data_in
"""
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import base64
import json import json
import os import os
import threading import threading
@@ -15,129 +21,164 @@ from websockets.server import serve, WebSocketServerProtocol
class IoBridgeServer: class IoBridgeServer:
"""WebSocket server that bridges clients to Dora dataflow."""
def __init__(self, host: str, port: int): def __init__(self, host: str, port: int):
self.host = host self.host = host
self.port = port self.port = port
self.clients: Set[WebSocketServerProtocol] = set() self.clients: Set[WebSocketServerProtocol] = set()
self.command_handler = None # Callbacks for sending to Dora
self.scene_refresh_handler = None self.on_text: Optional[callable] = None
self.on_audio: Optional[callable] = None
self.on_data: Optional[callable] = None
async def handler(self, websocket: WebSocketServerProtocol): async def handler(self, websocket: WebSocketServerProtocol):
"""Handle WebSocket connection."""
self.clients.add(websocket) self.clients.add(websocket)
try: try:
async for message in websocket: async for message in websocket:
try: try:
data = json.loads(message) data = json.loads(message)
except json.JSONDecodeError: except json.JSONDecodeError:
await websocket.send( await websocket.send(json.dumps({
json.dumps({"type": "error", "text": "Invalid JSON message"}) "type": "error",
) "message": "Invalid JSON"
}))
continue continue
response = await self._route_message(data, websocket) response = await self._route_message(data)
if response: if response:
await websocket.send(json.dumps(response)) await websocket.send(json.dumps(response))
finally: finally:
self.clients.discard(websocket) self.clients.discard(websocket)
async def _route_message( async def _route_message(self, data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
self, data: Dict[str, Any], websocket: WebSocketServerProtocol """Route incoming message to appropriate handler."""
) -> Optional[Dict[str, Any]]:
msg_type = data.get("type") msg_type = data.get("type")
if msg_type == "command":
text = data.get("text", "")
if self.command_handler:
await self.command_handler(text)
return None
return {"type": "error", "text": "No command handler registered"}
if msg_type == "ping": if msg_type == "ping":
return {"type": "pong"} return {"type": "pong"}
if msg_type == "scene_refresh":
if self.scene_refresh_handler: if msg_type == "text":
objects = await self.scene_refresh_handler() content = data.get("content", "")
return {"type": "scene_updated", "objects": objects} if self.on_text and content:
return {"type": "error", "text": "No scene handler registered"} self.on_text(content)
return {"type": "error", "text": f"Unknown message type: {msg_type}"} return None
if msg_type == "audio":
content = data.get("content", "") # base64 WAV
if self.on_audio and content:
self.on_audio(content)
return None
if msg_type == "data":
payload = data.get("payload", {})
if self.on_data:
self.on_data(payload)
return None
return {"type": "error", "message": f"Unknown type: {msg_type}"}
async def broadcast(self, message: Dict[str, Any]): async def broadcast(self, message: Dict[str, Any]):
"""Broadcast message to all connected clients."""
if not self.clients: if not self.clients:
return return
payload = json.dumps(message) payload = json.dumps(message)
await asyncio.gather( await asyncio.gather(
*[client.send(payload) for client in self.clients], return_exceptions=True *[client.send(payload) for client in self.clients],
return_exceptions=True
) )
async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol):
await websocket.send(json.dumps(message))
async def start(self): async def start(self):
"""Start the WebSocket server."""
async with serve(self.handler, self.host, self.port): async with serve(self.handler, self.host, self.port):
await asyncio.Future() await asyncio.Future()
def main() -> None: def main() -> None:
host = os.getenv("VOICE_HOST", "0.0.0.0") """Main entry point."""
port = int(os.getenv("VOICE_PORT", "8765")) host = os.getenv("WS_HOST", "0.0.0.0")
input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in") port = int(os.getenv("WS_PORT", "9000"))
response_input = os.getenv("VOICE_OUT_INPUT", "voice_out")
scene_input = os.getenv("SCENE_INPUT", "scene_update")
node = Node() node = Node()
server = IoBridgeServer(host, port) server = IoBridgeServer(host, port)
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
def push_command(text: str) -> None: # Callbacks: WebSocket → Dora
def send_text(content: str) -> None:
node.send_output( node.send_output(
input_topic, "text_out",
pa.array([text]), pa.array([content]),
metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()}, {"encoding": "utf-8", "timestamp_ns": time.time_ns()},
) )
async def handle_scene_refresh(): def send_audio(content_b64: str) -> None:
return [] audio_bytes = base64.b64decode(content_b64)
node.send_output(
"audio_out",
pa.array([audio_bytes]),
{"encoding": "wav", "timestamp_ns": time.time_ns()},
)
def command_handler(text: str): def send_data(payload: Dict[str, Any]) -> None:
push_command(text) node.send_output(
return None "data_out",
pa.array([json.dumps(payload)]),
{"encoding": "json", "timestamp_ns": time.time_ns()},
)
server.command_handler = command_handler server.on_text = send_text
server.scene_refresh_handler = handle_scene_refresh server.on_audio = send_audio
server.on_data = send_data
# Start WebSocket server in background thread
def run_server(): def run_server():
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
loop.run_until_complete(server.start()) loop.run_until_complete(server.start())
threading.Thread(target=run_server, daemon=True).start() threading.Thread(target=run_server, daemon=True).start()
timestamp = time.strftime("%H:%M:%S")
print(f"[iobridge {timestamp}] WebSocket server at ws://{host}:{port}", flush=True)
# Dora event loop: Dora → WebSocket
for event in node: for event in node:
if event["type"] != "INPUT": if event["type"] != "INPUT":
continue continue
if event["id"] == response_input: event_id = event["id"]
raw = event["value"][0].as_py() if len(event["value"]) else "" raw = event["value"][0].as_py() if len(event["value"]) else None
if not raw: if raw is None:
continue
try:
payload = json.loads(raw)
message = {
"type": "response",
"text": payload.get("text", ""),
"status": payload.get("status", "ok"),
}
except Exception:
message = {"type": "response", "text": raw, "status": "ok"}
asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
continue continue
if event["id"] == scene_input: if event_id == "text_in":
raw = event["value"][0].as_py() if len(event["value"]) else "" # Text from Dora to broadcast
if not raw: if isinstance(raw, str):
continue try:
# Try to parse as JSON for structured response
payload = json.loads(raw)
message = {"type": "text", "content": payload.get("text", raw)}
except json.JSONDecodeError:
message = {"type": "text", "content": raw}
else:
message = {"type": "text", "content": str(raw)}
asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
elif event_id == "audio_in":
# Audio from Dora to broadcast (binary → base64)
if isinstance(raw, bytes):
content_b64 = base64.b64encode(raw).decode("utf-8")
else:
content_b64 = raw
message = {"type": "audio", "content": content_b64, "format": "wav"}
asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
elif event_id == "data_in":
# Generic JSON data from Dora to broadcast
try: try:
payload = json.loads(raw) payload = json.loads(raw) if isinstance(raw, str) else raw
objects = payload.get("objects", []) except json.JSONDecodeError:
message = {"type": "scene_updated", "objects": objects} payload = {"raw": raw}
except Exception: message = {"type": "data", "payload": payload}
message = {"type": "scene_updated", "objects": []}
asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)

View File

@@ -840,6 +840,7 @@ def main() -> None:
target=run_uvicorn, args=(app, api_host, api_port), daemon=True target=run_uvicorn, args=(app, api_host, api_port), daemon=True
) )
api_thread.start() api_thread.start()
_log(f"Web interface at http://{api_host}:{api_port}")
# Dora event loop - only handles tick for state publishing # Dora event loop - only handles tick for state publishing
for event in node: for event in node:

236
dora_voice_control/PLAN.md Normal file
View File

@@ -0,0 +1,236 @@
# Voice Control Simplification Plan
## Current State
**30 files** across 7 directories (~3300 lines):
```
dora_voice_control/
├── behaviors/ # 2 files
│ ├── __init__.py
│ └── base.py
├── node/ # 5 files
│ ├── __init__.py
│ ├── adapter.py
│ ├── context.py
│ ├── dispatcher.py
│ └── logging.py
├── robot/ # 6 files
│ ├── __init__.py
│ ├── adapter.py # VacuumGripperAdapter, ParallelGripperAdapter
│ ├── command_queue.py
│ ├── image_handler.py
│ ├── pose_handler.py
│ └── status_handler.py
├── scene/ # 4 files
│ ├── __init__.py
│ ├── notifier.py
│ ├── objects_handler.py
│ └── state.py
├── utils/ # 3 files
│ ├── __init__.py
│ ├── config.py
│ └── state.py
├── voice/ # 4 files
│ ├── __init__.py
│ ├── handler.py
│ ├── intent.py
│ └── parser.py
├── web/ # 4 files
│ ├── __init__.py
│ ├── api.py
│ ├── models.py
│ └── templates.py
├── __init__.py
└── main.py
```
## Problems
1. **Too many small files** - Many files are <50 lines
2. **Robot-specific code scattered** - Adapter in robot/, behavior in behaviors/
3. **Hard to add new robots** - Need to touch multiple directories
4. **Deep import chains** - `from ..node import ...`
---
## Proposed Structure
**12 files** across 3 directories:
```
dora_voice_control/
├── __init__.py
├── main.py
├── node.py # Dora infrastructure (adapter, context, dispatcher, logger)
├── handlers.py # All event handlers (voice, robot, scene)
├── scene.py # SceneState, SceneObject
├── state.py # SharedState, RobotStep, config loading
├── web.py # API server (api + models + templates)
└── behaviors/
├── __init__.py # get_behavior() factory
├── base.py # ActionContext, ActionInfo, RobotBehavior base
└── littlehand/
├── __init__.py
└── robot.py # LittlehandBehavior + VacuumGripperAdapter
```
---
## File Consolidation Map
### `node.py` (merge 5 files)
| From | Content |
|------|---------|
| `node/logging.py` | VoiceControlLogger |
| `node/adapter.py` | NodePort, DoraNodeAdapter, MockNodeAdapter |
| `node/context.py` | OutputPort, NodeContext |
| `node/dispatcher.py` | EventHandler, EventDispatcher |
### `handlers.py` (merge 8 files)
| From | Content |
|------|---------|
| `voice/handler.py` | VoiceInputHandler |
| `voice/intent.py` | Intent, IntentQueue |
| `robot/pose_handler.py` | PoseHandler |
| `robot/status_handler.py` | StatusHandler |
| `robot/image_handler.py` | ImageHandler |
| `robot/command_queue.py` | CommandQueueService |
| `scene/objects_handler.py` | ObjectsHandler |
| `scene/notifier.py` | SceneNotifier |
### `scene.py` (move 1 file)
| From | Content |
|------|---------|
| `scene/state.py` | SceneState, SceneObject |
Already consolidated - just move to root.
### `state.py` (merge 2 files)
| From | Content |
|------|---------|
| `utils/state.py` | SharedState, RobotStep, VoiceState, DebugState |
| `utils/config.py` | VoiceConfig, ApiConfig, LLMConfig, loaders |
### `web.py` (merge 3 files)
| From | Content |
|------|---------|
| `web/api.py` | create_api(), start_api_server() |
| `web/models.py` | CommandRequest, CommandResponse |
| `web/templates.py` | HTML_TEMPLATE |
### `behaviors/littlehand/robot.py` (new)
| From | Content |
|------|---------|
| `robot/adapter.py` | VacuumGripperAdapter (robot-specific) |
| `behaviors/base.py` | Action implementations (robot-specific) |
Robot adapter moves INTO the behavior folder.
---
## Per-Robot Folder Structure
Each robot gets its own folder with everything it needs:
```
behaviors/
├── __init__.py # get_behavior(robot_type) factory
├── base.py # Shared base classes
│ - ActionContext
│ - ActionInfo
│ - RobotBehavior (abstract)
└── littlehand/ # Vacuum gripper robot
├── __init__.py
└── robot.py
- VacuumGripperAdapter
- LittlehandBehavior (actions, LLM signature)
- ACTIONS dict
- CommandSignature (DSPy)
```
### Adding a New Robot (e.g., "gripper_arm")
1. Create `behaviors/gripper_arm/`
2. Create `robot.py` with:
- `ParallelGripperAdapter`
- `GripperArmBehavior`
- Custom actions
3. Register in `behaviors/__init__.py`
No need to touch other files.
---
## Implementation Steps
### Phase 1: Consolidate node/ → node.py
1. Create `node.py` merging adapter, context, dispatcher, logging
2. Update imports in all files
3. Delete `node/` directory
### Phase 2: Consolidate handlers
1. Create `handlers.py` merging all handlers
2. Update imports
3. Delete `voice/`, `robot/` directories (except adapter)
### Phase 3: Consolidate utils/ → state.py
1. Create `state.py` merging config and state
2. Update imports
3. Delete `utils/` directory
### Phase 4: Move scene/ → scene.py
1. Move `scene/state.py` to `scene.py`
2. Update imports
3. Delete `scene/` directory
### Phase 5: Consolidate web/ → web.py
1. Create `web.py` merging api, models, templates
2. Update imports
3. Delete `web/` directory
### Phase 6: Restructure behaviors per robot
1. Create `behaviors/littlehand/robot.py`
2. Move VacuumGripperAdapter from robot/adapter.py
3. Move action implementations from behaviors/base.py
4. Update `behaviors/__init__.py` factory
5. Delete old robot/adapter.py
### Phase 7: Update main.py
1. Update all imports to new structure
2. Simplify initialization
---
## Final Structure Comparison
| Before | After |
|--------|-------|
| 30 files | 12 files |
| 7 directories | 3 directories |
| Robot code scattered | Robot code in one folder |
| Deep imports | Flat imports |
---
## Benefits
1. **Easier navigation** - Fewer files to search
2. **Self-contained robots** - One folder per robot type
3. **Simpler imports** - `from .node import ...` instead of `from ..node import ...`
4. **Easier testing** - Mock one module instead of many
5. **Clear ownership** - Robot-specific code in robot folder

View File

@@ -1,131 +1,129 @@
# Dora Voice Control Node # Dora Voice Control Node
A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface. Dora node that processes Spanish voice commands and translates them into robot actions. Supports multiple robot types via robot subfolders.
## Features ## Features
- Spanish voice command parsing (rule-based or Gemini LLM) - Spanish voice command parsing (rule-based or LLM)
- Robot adapter pattern for different gripper types
- Real-time web debug interface - Real-time web debug interface
- Command queue management - Command queue management
- Workspace bounds validation - Workspace bounds validation
- Object detection integration
## File Structure ## File Structure
``` ```text
dora_voice_control/ dora_voice_control/
├── __init__.py ├── main.py # Thin orchestrator
├── main.py # Main Dora node entry point
├── api.py # FastAPI web server ├── core/ # Shared logic
├── config.py # Configuration management │ ├── behavior.py # RobotBehavior with actions
├── models.py # Pydantic request/response models │ ├── config.py # Configuration classes
├── parser.py # Voice command parsing logic │ ├── node.py # Dora adapter + dispatcher + context
├── state.py # Shared state management │ ├── robot.py # RobotAdapter base
└── templates.py # HTML template for web interface │ ├── robot_io.py # Pose/status/image handlers + command queue
│ ├── scene.py # Scene state + notifier + objects handler
│ ├── state.py # Thread-safe shared state
│ └── voice.py # Voice input + parsing + intents
├── robots/ # Robot-specific implementations
│ └── littlehand/ # Vacuum gripper robot
│ ├── adapter.py # Vacuum adapter
│ ├── actions.py # Action vocabulary
│ └── behavior.py # Behavior binding
└── web/ # Web interface
├── api.py # FastAPI server
├── models.py # Pydantic models
└── templates.py # HTML template
``` ```
## Web Debug Interface ## Robot Adapters
Access the debug interface at `http://localhost:8080` (default). Set `ROBOT_TYPE` to select the robot package:
Features: | Type | Grab Command | Release Command |
- Real-time status monitoring (pose, objects, queue) |------|--------------|-----------------|
- Send manual voice commands | `littlehand` (alias: `vacuum`) | `vacuum_on` | `vacuum_off` |
- Quick command buttons
- View parse results To add a new robot, create a new subfolder under `robots/` with its adapter and behavior, then register it in `robots/__init__.py`.
- Command history
- Clear queue
## Inputs/Outputs ## Inputs/Outputs
| Input | Type | Description | | Input | Type | Description |
|---------------|--------|------------------------------------------| |-------|------|-------------|
| `voice_in` | string | Text transcription of voice command | | `voice_in` | string | Voice command text |
| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] | | `tcp_pose` | array | Robot pose [x, y, z, roll, pitch, yaw] |
| `objects` | JSON | Detected objects from vision system | | `objects` | JSON | Detected objects |
| `status` | JSON | Command execution status from robot | | `status` | JSON | Command execution status |
| `image_annotated` | array | Camera image |
| Output | Type | Description | | Output | Type | Description |
|---------------|--------|------------------------------------------| |--------|------|-------------|
| `robot_cmd` | JSON | Robot command with action and payload | | `robot_cmd` | JSON | Robot command |
| `voice_out` | JSON | Response confirmation to user | | `voice_out` | JSON | Response to user |
| `scene_update`| JSON | Updated scene with all visible objects | | `scene_update` | JSON | Scene state |
## Supported Commands (Spanish) ## Supported Commands (Spanish)
| Command | Action | Example | | Command | Action | Example |
|---------------|----------------|--------------------------------| |---------|--------|---------|
| `subir` | Move up | "sube" | | `subir` | Move up | "sube" |
| `bajar` | Move down | "baja" | | `bajar` | Move down | "baja" |
| `tomar` | Grab object | "agarra el cubo rojo" | | `tomar` | Grab object | "agarra el cubo rojo" |
| `soltar` | Release object | "suelta en la caja azul" | | `soltar` | Release object | "suelta en la caja azul" |
| `ir` | Go to object | "ve al cilindro" | | `ir` | Go to object | "ve al cilindro" |
| `reiniciar` | Reset | "reinicia" | | `reiniciar` | Reset | "reinicia" |
## Environment Variables ## Environment Variables
```bash ```bash
# Web API Server # Robot Configuration
API_ENABLED=true # Enable/disable web interface ROBOT_TYPE=littlehand # "littlehand" (alias: "vacuum")
API_HOST=0.0.0.0 # Bind address
API_PORT=8080 # Listen port # Web API
API_ENABLED=true
API_PORT=9001
# TCP Parameters # TCP Parameters
TCP_OFFSET_MM=63.0 # Z-offset to object surface TCP_OFFSET_MM=63.0
APPROACH_OFFSET_MM=50.0 # Safe approach distance above object APPROACH_OFFSET_MM=50.0
STEP_MM=20.0 # Distance for up/down increments STEP_MM=20.0
# LLM Configuration (optional) # LLM (optional)
LLM_PROVIDER=rules # "rules" or "gemini" LLM_PROVIDER=rules # "rules", "gemini", "ollama"
GOOGLE_API_KEY=your_key # Required if using gemini GOOGLE_API_KEY=your_key
GEMINI_MODEL=gemini-2.0-flash
# Workspace Safety (optional) # Initial Position
WORKSPACE_MIN_X=-300 INIT_ON_START=true
WORKSPACE_MAX_X=300 INIT_X=300.0
WORKSPACE_MIN_Y=-300 INIT_Y=0.0
WORKSPACE_MAX_Y=300 INIT_Z=350.0
# Safety
DRY_RUN=false
WORKSPACE_MIN_Z=0 WORKSPACE_MIN_Z=0
WORKSPACE_MAX_Z=500 WORKSPACE_MAX_Z=500
# Misc
DRY_RUN=false # Skip sending robot commands
``` ```
## Installation ## Web Debug Interface
Access at `http://localhost:8080`:
- Camera view with detections
- Real-time status (pose, objects, queue)
- Send manual commands
- View parse results
## API Endpoints
```bash ```bash
cd dora_voice_control # Status
pip install -e .
# With LLM support
pip install -e ".[llm]"
```
## Testing
### Web Interface
```bash
# Start the node (standalone for testing)
python -m dora_voice_control.main
# Open in browser
open http://localhost:8080
```
### API Endpoints
```bash
# Get status
curl http://localhost:8080/api/status curl http://localhost:8080/api/status
# Get objects # Objects
curl http://localhost:8080/api/objects curl http://localhost:8080/api/objects
# Get queue
curl http://localhost:8080/api/queue
# Send command # Send command
curl -X POST http://localhost:8080/api/command \ curl -X POST http://localhost:8080/api/command \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@@ -135,77 +133,30 @@ curl -X POST http://localhost:8080/api/command \
curl -X POST http://localhost:8080/api/queue/clear curl -X POST http://localhost:8080/api/queue/clear
``` ```
### Python Test ## Dataflow Example
```python
from dora_voice_control.parser import rule_parse, normalize
# Test command parsing
text = "agarra el cubo rojo grande"
result = rule_parse(normalize(text))
print(result)
# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'}
```
## Dora Dataflow Configuration
```yaml ```yaml
nodes: - id: voice
- id: voice_control build: uv pip install -e dora_voice_control
build: pip install -e ./dora_voice_control path: dora_voice_control/dora_voice_control/main.py
path: dora_voice_control env:
inputs: ROBOT_TYPE: "vacuum"
voice_in: iobridge/voice_in API_ENABLED: "true"
tcp_pose: robot/tcp_pose inputs:
objects: detector/objects voice_in: iobridge/text_out
status: robot/status tcp_pose: robot/tcp_pose
outputs: objects: detector/objects
- robot_cmd status: robot/status
- voice_out outputs:
- scene_update - robot_cmd
env: - voice_out
API_ENABLED: "true" - scene_update
API_PORT: "8080"
DRY_RUN: "false"
``` ```
## Message Examples ## Adding a New Robot
### Input: voice_in 1) Create `dora_voice_control/dora_voice_control/robots/<robot_name>/` with:
``` - `adapter.py` implementing a `RobotAdapter`
"sube" - `actions.py` defining action aliases (can reuse defaults)
"agarra el cubo rojo" - `behavior.py` binding the behavior class
"suelta en la caja azul" 2) Register it in `dora_voice_control/dora_voice_control/robots/__init__.py`
```
### Output: robot_cmd
```json
{
"id": "550e8400-e29b-41d4-a716-446655440000",
"action": "move_to_pose",
"payload": {
"x": 150.0,
"y": 200.0,
"z": 280.0,
"roll": 180.0,
"pitch": 0.0,
"yaw": 0.0
}
}
```
### Output: voice_out
```json
{"text": "Ok, voy a subir", "status": "ok"}
{"text": "No entendi el comando", "status": "error"}
```
## Dependencies
- dora-rs >= 0.3.9
- numpy < 2.0.0
- pyarrow >= 12.0.0
- fastapi >= 0.109.0
- uvicorn >= 0.27.0
- pydantic >= 2.0.0
- google-genai (optional, for Gemini mode)

View File

@@ -0,0 +1,65 @@
"""Core modules for voice control node."""
from .config import (
VoiceConfig,
ApiConfig,
LLMConfig,
JPEG_QUALITY,
load_voice_config,
load_api_config,
load_llm_config,
)
from .state import SharedState, RobotStep, VoiceState, DebugState
from .node import (
VoiceControlLogger,
NodePort,
DoraNodeAdapter,
MockNodeAdapter,
EventDispatcher,
OutputPort,
NodeContext,
)
from .scene import SceneState, SceneObject, SceneNotifier, ObjectsHandler
from .voice import Intent, IntentQueue, VoiceInputHandler, normalize, rule_parse
from .behavior import ActionContext, ActionInfo, RobotBehavior, DEFAULT_ACTIONS
from .robot import RobotAdapter
from .robot_io import PoseHandler, StatusHandler, ImageHandler, CommandQueueService
__all__ = [
"VoiceConfig",
"ApiConfig",
"LLMConfig",
"JPEG_QUALITY",
"load_voice_config",
"load_api_config",
"load_llm_config",
"SharedState",
"RobotStep",
"VoiceState",
"DebugState",
"VoiceControlLogger",
"NodePort",
"DoraNodeAdapter",
"MockNodeAdapter",
"EventDispatcher",
"OutputPort",
"NodeContext",
"SceneState",
"SceneObject",
"SceneNotifier",
"ObjectsHandler",
"Intent",
"IntentQueue",
"VoiceInputHandler",
"normalize",
"rule_parse",
"ActionContext",
"ActionInfo",
"RobotBehavior",
"DEFAULT_ACTIONS",
"RobotAdapter",
"PoseHandler",
"StatusHandler",
"ImageHandler",
"CommandQueueService",
]

View File

@@ -0,0 +1,339 @@
"""Base classes for robot behaviors."""
from __future__ import annotations
import os
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, TYPE_CHECKING
from .config import VoiceConfig, LLMConfig
from .robot import RobotAdapter
from .state import RobotStep, SharedState
from .voice import rule_parse
if TYPE_CHECKING:
from .scene import SceneState, SceneObject
# Try to import dspy (optional dependency)
try:
import dspy
DSPY_AVAILABLE = True
except ImportError:
dspy = None # type: ignore
DSPY_AVAILABLE = False
def _log(msg: str) -> None:
    """Emit a timestamped console line for the behaviors module."""
    stamp = time.strftime("%H:%M:%S")
    print("[behaviors {}] {}".format(stamp, msg), flush=True)
@dataclass
class ActionContext:
    """Bundle of everything an action handler needs to run.

    Passed to each RobotBehavior action handler so handlers stay
    stateless and easy to test.
    """
    pose: Optional[List[float]]  # Current TCP pose [x, y, z, roll, pitch, yaw], or None if unknown
    target: Optional["SceneObject"]  # Object the command refers to, if any
    scene: "SceneState"  # Scene state with all known objects
    config: VoiceConfig  # Motion tuning parameters (offsets, workspace bounds)
    home_pose: Dict[str, float]  # Pose payload used by the "reiniciar" action
    shared_state: SharedState  # Shared state holding the robot step queue
@dataclass
class ActionInfo:
    """Metadata describing one voice-command action.

    Used to validate preconditions (pose/object availability) before a
    handler runs, and to build the alias -> canonical-name map.
    """
    name: str  # Canonical action name (Spanish infinitive, e.g. "tomar")
    aliases: List[str] = field(default_factory=list)  # Alternate spoken forms mapping to this action
    requires_object: bool = False  # True if a target object must be resolved first
    requires_pose: bool = False  # True if the current TCP pose must be known
    description: str = ""  # Human-readable summary for debugging/UIs
# Default actions for pick-and-place robots.
# Keys are canonical Spanish action names; the aliases list common spoken
# variants so loosely transcribed commands still resolve to an action.
DEFAULT_ACTIONS: Dict[str, ActionInfo] = {
    "subir": ActionInfo(
        name="subir",
        aliases=["sube", "arriba"],
        requires_pose=True,
        description="Subir el robot",
    ),
    "bajar": ActionInfo(
        name="bajar",
        aliases=["baja", "abajo"],
        requires_pose=True,
        description="Bajar el robot",
    ),
    "ir": ActionInfo(
        name="ir",
        aliases=["ve", "mover", "muevete", "acercar"],
        requires_object=True,
        description="Ir hacia un objeto",
    ),
    "tomar": ActionInfo(
        name="tomar",
        aliases=["toma", "agarra", "agarrar", "coger", "chupar", "succionar"],
        requires_object=True,
        description="Tomar un objeto",
    ),
    "soltar": ActionInfo(
        name="soltar",
        aliases=["deja", "dejar"],
        requires_object=True,
        description="Soltar el objeto",
    ),
    "reiniciar": ActionInfo(
        name="reiniciar",
        aliases=["reinicia", "reset"],
        requires_pose=False,
        requires_object=False,
        description="Reiniciar a posicion inicial",
    ),
}
class RobotBehavior(ABC):
    """
    Robot behavior with configurable robot adapter.
    Uses a RobotAdapter to translate generic commands (grab, release, move)
    to robot-specific commands (vacuum_on, gripper_close, etc.).
    Subclasses must implement action_handlers() and may override ACTIONS,
    LANGUAGE, and CommandSignature to customize vocabulary and parsing.
    """
    # Class-level action definitions
    ACTIONS: ClassVar[Dict[str, ActionInfo]] = DEFAULT_ACTIONS
    # Supported language
    LANGUAGE: ClassVar[str] = "es"
    # DSPy signature class (override in subclasses)
    CommandSignature: ClassVar[type] = None  # type: ignore
    def __init__(
        self,
        config: VoiceConfig,
        robot_adapter: RobotAdapter,
        llm_config: Optional[LLMConfig] = None,
    ) -> None:
        """Bind the behavior to a robot adapter and optional LLM parser.
        Args:
            config: Motion parameters (offsets, step size, workspace bounds).
            robot_adapter: Translates generic tool commands to robot steps.
            llm_config: LLM settings; provider "rules" (or None) keeps the
                rule-based parser only.
        """
        self.config = config
        self.robot_adapter = robot_adapter
        self._alias_map: Dict[str, str] = {}
        self._predictor: Optional[Any] = None
        self._build_alias_map()
        if llm_config and llm_config.provider != "rules":
            self._init_dspy(llm_config)
    def _init_dspy(self, llm_config: LLMConfig) -> None:
        """Initialize DSPy predictor for this behavior.
        Silently leaves rule-based parsing active when dspy is not
        installed, no CommandSignature is defined, or setup fails.
        """
        if not DSPY_AVAILABLE:
            _log("DSPy not available, falling back to rules")
            return
        if self.CommandSignature is None:
            _log("No CommandSignature defined, falling back to rules")
            return
        try:
            lm = self._create_lm(llm_config)
            if lm:
                # NOTE(review): dspy.configure sets a process-global LM;
                # multiple behaviors with different providers would clash.
                dspy.configure(lm=lm)
                self._predictor = dspy.Predict(self.CommandSignature)
                _log(f"DSPy initialized with {llm_config.provider}/{llm_config.model}")
        except Exception as e:
            _log(f"Failed to initialize DSPy: {e}")
    def _create_lm(self, config: LLMConfig) -> Optional[Any]:
        """Create DSPy language model.
        Returns None for unknown providers; the caller treats that as
        "no LLM available" and stays on rule-based parsing.
        """
        if not DSPY_AVAILABLE:
            return None
        if config.provider == "gemini":
            # Either env var works; GEMINI_API_KEY takes precedence.
            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            return dspy.LM(
                f"gemini/{config.model}",
                api_key=api_key,
                temperature=config.temperature,
            )
        if config.provider == "ollama":
            return dspy.LM(
                f"ollama_chat/{config.model}",
                api_base=config.api_base,
                api_key="",
            )
        if config.provider == "openai":
            # NOTE(review): no explicit api_key here — presumably the client
            # reads OPENAI_API_KEY from the environment; confirm.
            return dspy.LM(
                f"openai/{config.model}",
                temperature=config.temperature,
            )
        return None
    def parse_command(self, transcript: str) -> Dict[str, str]:
        """Parse voice command using DSPy or fallback to rules.
        Returns a dict with keys: resultado, accion, objeto, color, tamano.
        Any predictor failure (including missing result attributes) falls
        back to rule-based parsing.
        """
        if self._predictor:
            try:
                result = self._predictor(comando=transcript)
                return {
                    "resultado": "ok" if result.accion != "error" else "error",
                    "accion": result.accion,
                    "objeto": result.objeto,
                    "color": result.color,
                    "tamano": result.tamano,
                }
            except Exception as e:
                _log(f"DSPy parsing failed: {e}, falling back to rules")
        return self.rule_parse(transcript)
    def rule_parse(self, transcript: str) -> Dict[str, str]:
        """Rule-based parsing fallback using behavior's vocabulary."""
        return rule_parse(transcript, self.get_all_keywords())
    def _build_alias_map(self) -> None:
        """Build reverse mapping from aliases (and canonical names) to action names."""
        self._alias_map = {}
        for action_name, info in self.ACTIONS.items():
            self._alias_map[action_name] = action_name
            for alias in info.aliases:
                self._alias_map[alias] = action_name
    def resolve_action(self, name: str) -> Optional[str]:
        """Resolve action name or alias to canonical name (None if unknown)."""
        return self._alias_map.get(name)
    def get_action_info(self, name: str) -> Optional[ActionInfo]:
        """Get action info by name or alias (None if unknown)."""
        canonical = self.resolve_action(name)
        return self.ACTIONS.get(canonical) if canonical else None
    def get_all_keywords(self) -> Dict[str, List[str]]:
        """Get all action names and aliases for the rule parser."""
        return {
            name: [name] + info.aliases
            for name, info in self.ACTIONS.items()
        }
    def execute(self, action_name: str, context: ActionContext) -> bool:
        """Execute action by name using explicit handler mapping.
        Returns False when the action is unknown, its preconditions
        (pose/object availability) are unmet, or no handler is registered.
        """
        canonical = self.resolve_action(action_name)
        if not canonical:
            return False
        info = self.ACTIONS.get(canonical)
        if not info:
            return False
        # Check preconditions
        if info.requires_pose and not context.pose:
            return False
        if info.requires_object and not context.target:
            return False
        handler = self.action_handlers().get(canonical)
        if not handler:
            return False
        return handler(context)
    @abstractmethod
    def action_handlers(self) -> Dict[str, Callable[[ActionContext], bool]]:
        """Return mapping of action names to handler functions."""
        pass
    def _within_bounds(
        self,
        point_mm: List[float],
        workspace_min: Tuple[Optional[float], Optional[float], Optional[float]],
        workspace_max: Tuple[Optional[float], Optional[float], Optional[float]],
    ) -> bool:
        """Check if position is within workspace bounds.
        A None bound on any axis means "unbounded" on that side.
        """
        x, y, z = point_mm[:3]
        min_x, min_y, min_z = workspace_min
        max_x, max_y, max_z = workspace_max
        if min_x is not None and x < min_x:
            return False
        if max_x is not None and x > max_x:
            return False
        if min_y is not None and y < min_y:
            return False
        if max_y is not None and y > max_y:
            return False
        if min_z is not None and z < min_z:
            return False
        if max_z is not None and z > max_z:
            return False
        return True
    def _queue_steps(self, ctx: ActionContext, steps: List[RobotStep]) -> None:
        """Append multiple robot steps to the shared command queue."""
        for step in steps:
            ctx.shared_state.append_queue(step)
    def _make_pose_payload(self, x: float, y: float, z: float) -> Dict[str, float]:
        """Create pose payload with the configured default orientation."""
        return {
            "x": x,
            "y": y,
            "z": z,
            "roll": self.config.default_roll,
            "pitch": self.config.default_pitch,
            "yaw": self.config.default_yaw,
        }
    def _queue_move(self, ctx: ActionContext, x: float, y: float, z: float) -> bool:
        """Queue a move command if within bounds; False if rejected."""
        if not self._within_bounds([x, y, z], self.config.workspace_min, self.config.workspace_max):
            return False
        payload = self._make_pose_payload(x, y, z)
        self._queue_steps(ctx, self.robot_adapter.move(payload))
        return True
    def _queue_approach_sequence(self, ctx: ActionContext, pos: List[float]) -> None:
        """Queue approach pose then target pose above/at the object.
        NOTE(review): the return values of _queue_move are ignored here, so
        out-of-bounds poses are skipped silently while callers such as
        action_tomar still report success — confirm this is intended.
        """
        approach_z = pos[2] + self.config.tcp_offset_mm + self.config.approach_offset_mm
        target_z = pos[2] + self.config.tcp_offset_mm
        # Approach
        self._queue_move(ctx, pos[0], pos[1], approach_z)
        # Target
        self._queue_move(ctx, pos[0], pos[1], target_z)
    # =========================================================
    # Action implementations using robot adapter
    # =========================================================
    def action_subir(self, ctx: ActionContext) -> bool:
        """Move up by step_mm relative to the current pose."""
        target_z = ctx.pose[2] + self.config.step_mm
        return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z)
    def action_bajar(self, ctx: ActionContext) -> bool:
        """Move down by step_mm relative to the current pose."""
        target_z = ctx.pose[2] - self.config.step_mm
        return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z)
    def action_ir(self, ctx: ActionContext) -> bool:
        """Move to the target object position (approach + target)."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        return True
    def action_tomar(self, ctx: ActionContext) -> bool:
        """Pick the target object: approach, descend, then grab via the adapter."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        self._queue_steps(ctx, self.robot_adapter.grab())
        return True
    def action_soltar(self, ctx: ActionContext) -> bool:
        """Release the held object at the target position via the adapter."""
        pos = ctx.target.position_mm
        self._queue_approach_sequence(ctx, pos)
        self._queue_steps(ctx, self.robot_adapter.release())
        return True
    def action_reiniciar(self, ctx: ActionContext) -> bool:
        """Reset: release tool, move home, clear detected scene objects."""
        self._queue_steps(ctx, self.robot_adapter.reset_tool())
        self._queue_steps(ctx, self.robot_adapter.move(ctx.home_pose))
        ctx.scene.clear_detected()
        return True

View File

@@ -6,6 +6,9 @@ import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import Dict, Optional, Tuple from typing import Dict, Optional, Tuple
# Image encoding constants
JPEG_QUALITY = 80
@dataclass @dataclass
class VoiceConfig: class VoiceConfig:
@@ -34,6 +37,16 @@ class ApiConfig:
enabled: bool enabled: bool
@dataclass
class LLMConfig:
    """Configuration for the LLM-based command parser.

    Provider "rules" disables the LLM and keeps rule-based parsing only.
    """
    provider: str  # "rules", "gemini", "ollama", "openai"
    model: str  # Model name/path understood by the selected provider
    api_base: Optional[str] = None  # Base URL for self-hosted providers (Ollama)
    temperature: float = 0.3  # Sampling temperature passed to the LM
def _parse_float_env(name: str) -> Optional[float]: def _parse_float_env(name: str) -> Optional[float]:
"""Parse an optional float from environment variable.""" """Parse an optional float from environment variable."""
raw = os.getenv(name) raw = os.getenv(name)
@@ -48,6 +61,7 @@ def _parse_float_env(name: str) -> Optional[float]:
def _parse_class_map(raw: str) -> Dict[str, str]: def _parse_class_map(raw: str) -> Dict[str, str]:
"""Parse JSON class mapping from string.""" """Parse JSON class mapping from string."""
import json import json
import time
if not raw: if not raw:
return {} return {}
@@ -55,8 +69,9 @@ def _parse_class_map(raw: str) -> Dict[str, str]:
data = json.loads(raw) data = json.loads(raw)
if isinstance(data, dict): if isinstance(data, dict):
return {str(k): str(v) for k, v in data.items()} return {str(k): str(v) for k, v in data.items()}
except Exception: except Exception as e:
pass timestamp = time.strftime("%H:%M:%S")
print(f"[voice_control {timestamp}] Warning: failed to parse CLASS_MAP: {e}", flush=True)
return {} return {}
@@ -90,6 +105,16 @@ def load_api_config() -> ApiConfig:
"""Load API server configuration from environment variables.""" """Load API server configuration from environment variables."""
return ApiConfig( return ApiConfig(
host=os.getenv("API_HOST", "0.0.0.0"), host=os.getenv("API_HOST", "0.0.0.0"),
port=int(os.getenv("API_PORT", "8080")), port=int(os.getenv("API_PORT", "9001")),
enabled=os.getenv("API_ENABLED", "true").lower() in ("true", "1", "yes"), enabled=os.getenv("API_ENABLED", "true").lower() in ("true", "1", "yes"),
) )
def load_llm_config() -> LLMConfig:
    """Build an LLMConfig from environment variables, with defaults."""
    provider = os.getenv("LLM_PROVIDER", "rules").lower()
    model = os.getenv("LLM_MODEL", "gemma2")
    api_base = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
    temperature = float(os.getenv("LLM_TEMPERATURE", "0.3"))
    return LLMConfig(
        provider=provider,
        model=model,
        api_base=api_base,
        temperature=temperature,
    )

View File

@@ -0,0 +1,260 @@
"""Dora node infrastructure abstractions."""
from __future__ import annotations
import json
import time
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterator, List, Optional, Protocol
import pyarrow as pa
class VoiceControlLogger:
    """Console logger that prefixes messages with a tag and wall-clock time."""

    def __init__(self, prefix: str = "voice_control") -> None:
        self._prefix = prefix

    def log(self, msg: str) -> None:
        """Write a timestamped line to stdout (flushed immediately)."""
        stamp = time.strftime("%H:%M:%S")
        print("[{} {}] {}".format(self._prefix, stamp, msg), flush=True)

    def warn(self, msg: str) -> None:
        """Log *msg* prefixed with "Warning:"."""
        self.log("Warning: " + msg)

    def error(self, msg: str) -> None:
        """Log *msg* prefixed with "Error:"."""
        self.log("Error: " + msg)
class NodePort(ABC):
    """Abstract interface for Dora node operations.

    Decouples handlers and services from the concrete Dora runtime so
    they can be exercised with MockNodeAdapter in tests.
    """
    @abstractmethod
    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Send output to a Dora topic."""
        pass
    @abstractmethod
    def events(self) -> Iterator[Dict[str, Any]]:
        """Iterate over incoming events."""
        pass
class DoraNodeAdapter(NodePort):
    """NodePort implementation backed by a real Dora node."""

    def __init__(self, node: Any) -> None:
        self._node = node

    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Forward output to the underlying Dora node."""
        self._node.send_output(output_name, data, metadata)

    def events(self) -> Iterator[Dict[str, Any]]:
        """Yield incoming events from the underlying node."""
        yield from self._node

    def send_command(self, output_name: str, action: str, payload: Dict[str, Any]) -> str:
        """Publish a robot command envelope as JSON; returns the generated ID."""
        cmd_id = str(uuid.uuid4())
        envelope = {"id": cmd_id, "action": action, "payload": payload}
        self.send_output(
            output_name,
            pa.array([json.dumps(envelope)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        return cmd_id

    def send_json(self, output_name: str, data: Dict[str, Any]) -> None:
        """Publish a JSON payload on the given topic."""
        self.send_output(
            output_name,
            pa.array([json.dumps(data)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
class MockNodeAdapter(NodePort):
    """In-memory NodePort for tests: records outputs, replays queued events."""

    def __init__(self) -> None:
        self._outputs: list[tuple[str, pa.Array, Dict[str, Any]]] = []
        self._events: list[Dict[str, Any]] = []

    def send_output(self, output_name: str, data: pa.Array, metadata: Dict[str, Any]) -> None:
        """Record the output instead of publishing it."""
        self._outputs.append((output_name, data, metadata))

    def events(self) -> Iterator[Dict[str, Any]]:
        """Replay events previously added with queue_event()."""
        yield from self._events

    def queue_event(self, event: Dict[str, Any]) -> None:
        """Append an event to be replayed by events()."""
        self._events.append(event)

    def get_outputs(self) -> list[tuple[str, pa.Array, Dict[str, Any]]]:
        """Return all recorded (name, data, metadata) tuples."""
        return self._outputs

    def clear(self) -> None:
        """Forget all recorded outputs and queued events."""
        self._outputs.clear()
        self._events.clear()

    def send_command(self, output_name: str, action: str, payload: Dict[str, Any]) -> str:
        """Record a robot command envelope; returns the generated command ID."""
        cmd_id = str(uuid.uuid4())
        envelope = {"id": cmd_id, "action": action, "payload": payload}
        self.send_output(
            output_name,
            pa.array([json.dumps(envelope)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        return cmd_id

    def send_json(self, output_name: str, data: Dict[str, Any]) -> None:
        """Record a JSON payload for the given topic."""
        self.send_output(
            output_name,
            pa.array([json.dumps(data)]),
            {"encoding": "json", "timestamp_ns": time.time_ns()},
        )
class EventHandler(Protocol):
    """Structural protocol for event handlers registered with EventDispatcher."""
    def handle(self, event: Dict[str, Any]) -> None:
        """Handle an incoming event."""
        ...
class EventDispatcher:
    """Routes incoming Dora events to registered per-topic handlers.

    Supports one-shot "first event" callbacks and "tick" callbacks that
    run after every INPUT event has been dispatched. Callback and handler
    exceptions are logged, never propagated.
    """

    def __init__(self, adapter: NodePort, logger: Optional[VoiceControlLogger] = None) -> None:
        self._adapter = adapter
        self._logger = logger or VoiceControlLogger()
        self._handlers: Dict[str, EventHandler] = {}
        self._tick_callbacks: List[Callable[[], None]] = []
        self._first_event_callbacks: List[Callable[[], None]] = []
        self._first_event_fired = False

    def register(self, event_id: str, handler: EventHandler) -> None:
        """Bind *handler* to events whose id equals *event_id*."""
        self._handlers[event_id] = handler

    def on_tick(self, callback: Callable[[], None]) -> None:
        """Add a callback invoked after each INPUT event is processed."""
        self._tick_callbacks.append(callback)

    def on_first_event(self, callback: Callable[[], None]) -> None:
        """Add a callback invoked once, when the very first event arrives."""
        self._first_event_callbacks.append(callback)

    def _fire_first_event(self) -> None:
        """Run first-event callbacks exactly once, logging any failure."""
        if self._first_event_fired:
            return
        self._first_event_fired = True
        for cb in self._first_event_callbacks:
            try:
                cb()
            except Exception as exc:
                self._logger.error(f"First event callback failed: {exc}")

    def run(self) -> None:
        """Consume events from the adapter until its stream ends."""
        self._logger.log("Dora node created, waiting for events...")
        for event in self._adapter.events():
            self._fire_first_event()
            # Only INPUT events are dispatched (and tick'd).
            if event.get("type") != "INPUT":
                continue
            event_id = event.get("id")
            handler = self._handlers.get(event_id)
            if handler:
                try:
                    handler.handle(event)
                except Exception as exc:
                    self._logger.error(f"Handler for '{event_id}' failed: {exc}")
            for cb in self._tick_callbacks:
                try:
                    cb()
                except Exception as exc:
                    self._logger.error(f"Tick callback failed: {exc}")
class OutputPort:
    """Thin wrapper binding one topic name to an adapter's send methods."""

    def __init__(self, adapter: DoraNodeAdapter, name: str) -> None:
        self._adapter = adapter
        self._name = name

    @property
    def name(self) -> str:
        """Topic name this port publishes to."""
        return self._name

    def send_json(self, data: Dict[str, Any]) -> None:
        """Publish *data* as JSON on this port's topic."""
        self._adapter.send_json(self._name, data)

    def send_command(self, action: str, payload: Dict[str, Any]) -> str:
        """Publish a robot command on this port's topic; returns command ID."""
        return self._adapter.send_command(self._name, action, payload)
@dataclass
class NodeContext:
    """Central context for Dora node I/O.

    Bundles the adapter, logger, and typed output ports into one object
    that can be handed to services and handlers as a single dependency.
    """
    adapter: DoraNodeAdapter
    logger: VoiceControlLogger
    # Typed output ports
    robot_out: OutputPort
    scene_out: OutputPort
    voice_out: OutputPort

    @classmethod
    def create(
        cls,
        adapter: DoraNodeAdapter,
        logger: VoiceControlLogger,
        robot_output: str = "robot_cmd",
        scene_output: str = "scene_update",
        voice_output: str = "voice_out",
    ) -> "NodeContext":
        """Build a NodeContext, wrapping each output topic in an OutputPort.

        Args:
            adapter: The Dora node adapter.
            logger: The logger instance.
            robot_output: Topic name for robot commands.
            scene_output: Topic name for scene updates.
            voice_output: Topic name for voice responses.
        """
        ports = {
            "robot_out": OutputPort(adapter, robot_output),
            "scene_out": OutputPort(adapter, scene_output),
            "voice_out": OutputPort(adapter, voice_output),
        }
        return cls(adapter=adapter, logger=logger, **ports)

View File

@@ -0,0 +1,39 @@
"""Robot adapter base types."""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from .state import RobotStep
class RobotAdapter(ABC):
    """Abstract adapter for translating generic commands to robot-specific commands.
    Generic commands:
    - grab: Activate tool to grab object
    - release: Deactivate tool to release object
    - move: Move to position
    - reset_tool: Reset tool to safe state
    Implementations return lists of RobotStep so one generic command can
    expand into several robot-specific commands.
    """
    @abstractmethod
    def grab(self) -> List[RobotStep]:
        """Translate grab command to robot steps (e.g. vacuum_on or gripper_close)."""
        pass
    @abstractmethod
    def release(self) -> List[RobotStep]:
        """Translate release command to robot steps (e.g. vacuum_off or gripper_open)."""
        pass
    @abstractmethod
    def move(self, payload: Dict[str, Any]) -> List[RobotStep]:
        """Translate move command to robot steps; *payload* carries the target pose."""
        pass
    @abstractmethod
    def reset_tool(self) -> List[RobotStep]:
        """Translate reset_tool command to robot steps (return tool to a safe state)."""
        pass

View File

@@ -0,0 +1,155 @@
"""Robot I/O handlers and command queue service."""
from __future__ import annotations
import json
import os
import time
from typing import Any, Dict, Optional, TYPE_CHECKING
import cv2
import numpy as np
import pyarrow as pa
from .config import JPEG_QUALITY
if TYPE_CHECKING:
from .config import VoiceConfig
from .state import SharedState
from .node import OutputPort, VoiceControlLogger
class CommandQueueService:
    """Pops queued robot steps and publishes them, one in flight at a time."""

    def __init__(
        self,
        state: "SharedState",
        port: "OutputPort",
        config: "VoiceConfig",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._port = port
        self._config = config
        self._logger = logger

    def dispatch_next(self) -> bool:
        """Publish the next queued step unless a command is already pending.

        Returns True when a step was consumed (sent, or logged in dry-run
        mode), False otherwise.
        """
        # At most one command in flight: wait for the robot's ack first,
        # and do nothing when the queue is empty.
        if self._state.get_pending_command() is not None or self._state.queue_size() == 0:
            return False
        queued = self._state.pop_queue()
        if queued is None:
            return False
        if self._config.dry_run:
            # Dry-run mode: log instead of sending; nothing becomes pending.
            self._logger.log(f"[DRY RUN] Would send: {queued.action} {queued.payload}")
            self._state.set_pending_command(None)
            return True
        command_id = self._port.send_command(queued.action, queued.payload)
        self._state.set_pending_command({"id": command_id, "action": queued.action})
        self._logger.log(f"Sent command: {queued.action} (id={command_id[:8]}...) remaining={self._state.queue_size()}")
        # Record the full command for the debug/status view.
        self._state.update_robot_command(
            {"id": command_id, "action": queued.action, "payload": queued.payload},
            time.monotonic(),
        )
        return True
def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
    """Decode the first array element as a JSON object.

    Returns None when the array is empty, the element is falsy, or the
    payload is not valid JSON.
    """
    if len(value) == 0:
        return None
    raw = value[0].as_py()
    if not raw:
        return None
    try:
        parsed = json.loads(raw)
    except Exception:
        return None
    return parsed
class StatusHandler:
    """Consumes robot status events and clears the matching pending command."""

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._logger = logger

    def handle(self, event: Dict[str, Any]) -> None:
        """Mark the pending command done when this status acknowledges it."""
        status = _parse_status_payload(event["value"])
        pending = self._state.get_pending_command()
        if not status or not pending:
            return
        if status.get("command_id") != pending.get("id"):
            return
        self._logger.log(
            f"Command completed: {pending.get('action')} (status={status.get('status', 'ok')})"
        )
        self._state.set_pending_command(None)
class PoseHandler:
    """Consumes tcp_pose events and records the latest 6-DOF pose."""

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._logger = logger

    def handle(self, event: Dict[str, Any]) -> None:
        """Store [x, y, z, roll, pitch, yaw] from the event, if present."""
        values = event["value"].to_numpy().astype(np.float64).reshape(-1)
        if values.size >= 6:
            # Keep only the first six components; any extras are ignored.
            self._state.update_pose(values[:6].tolist(), time.monotonic())
class ImageHandler:
    """Consumes raw camera frames, JPEG-encodes them, and caches the bytes."""

    def __init__(
        self,
        state: "SharedState",
        logger: "VoiceControlLogger",
        image_width: Optional[int] = None,
        image_height: Optional[int] = None,
    ) -> None:
        self._state = state
        self._logger = logger
        # Dimensions fall back to env vars so the dataflow YAML can set them.
        self._image_width = image_width or int(os.getenv("IMAGE_WIDTH", "1280"))
        self._image_height = image_height or int(os.getenv("IMAGE_HEIGHT", "720"))

    def handle(self, event: Dict[str, Any]) -> None:
        """Encode the incoming frame as JPEG and store it in shared state."""
        try:
            raw = event["value"].to_numpy()
            # Frame arrives flat; assumed BGR at the configured resolution
            # (reshape fails and is logged if sizes don't match).
            frame = raw.reshape((self._image_height, self._image_width, 3)).astype(np.uint8)
            _, encoded = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])
            self._state.update_image(encoded.tobytes(), time.monotonic())
        except Exception as exc:
            self._logger.warn(f"failed to process camera image: {exc}")

View File

@@ -0,0 +1,573 @@
"""Scene state and handlers for object management."""
from __future__ import annotations
import json
import os
import threading
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, TYPE_CHECKING
try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib # type: ignore
if TYPE_CHECKING:
from .node import OutputPort, VoiceControlLogger
@dataclass
class SceneObject:
    """An object in the scene with spatial properties."""
    id: str  # Unique identifier
    object_type: str  # cube, cylinder, box, etc.
    color: str  # red, blue, etc.
    size: str = "normal"  # big, small, normal
    # Spatial properties
    position_mm: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
    bbox_mm: Optional[List[float]] = None  # [x1, y1, x2, y2] bounding box
    height_mm: float = 30.0  # Object height
    area_mm2: Optional[float] = None  # Footprint area
    # Relationships
    on_top_of: Optional[str] = None  # ID of object this is stacked on
    # Metadata
    source: str = "detected"  # "detected", "manual", or "config"
    confidence: float = 1.0
    last_seen_ns: int = 0

    @property
    def center(self) -> List[float]:
        """Center point of the object (the same list as position_mm)."""
        return self.position_mm

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this object to a plain dict (e.g. for JSON output)."""
        return dict(
            id=self.id,
            object_type=self.object_type,
            color=self.color,
            size=self.size,
            position_mm=self.position_mm,
            bbox_mm=self.bbox_mm,
            height_mm=self.height_mm,
            area_mm2=self.area_mm2,
            on_top_of=self.on_top_of,
            source=self.source,
            confidence=self.confidence,
            last_seen_ns=self.last_seen_ns,
        )
class SceneState:
"""Thread-safe scene state with spatial relationships."""
# Tolerance for stacking detection (mm)
Z_TOLERANCE_MM: float = 15.0
    def __init__(self) -> None:
        """Create an empty scene; the lock guards all access to _objects."""
        self._lock = threading.Lock()
        self._objects: Dict[str, SceneObject] = {}
# === Core Operations ===
def add(self, obj: SceneObject) -> bool:
"""Add object to scene.
Returns True if added, False if ID already exists.
"""
with self._lock:
if obj.id in self._objects:
return False
self._objects[obj.id] = obj
return True
def delete(self, object_id: str) -> bool:
"""Remove object and clear relationships pointing to it.
Returns True if deleted, False if not found.
"""
with self._lock:
if object_id not in self._objects:
return False
del self._objects[object_id]
# Clear relationships pointing to deleted object
for obj in self._objects.values():
if obj.on_top_of == object_id:
obj.on_top_of = None
return True
def update(self, object_id: str, **properties: Any) -> bool:
"""Update object properties.
Returns True if updated, False if not found.
"""
with self._lock:
if object_id not in self._objects:
return False
obj = self._objects[object_id]
for key, value in properties.items():
if hasattr(obj, key):
setattr(obj, key, value)
return True
    def get(self, object_id: str) -> Optional[SceneObject]:
        """Get object by ID (None if absent)."""
        with self._lock:
            obj = self._objects.get(object_id)
            return obj  # Return reference (caller should not mutate)
    def get_all(self) -> List[SceneObject]:
        """Get all objects (the list is a snapshot; elements are live references)."""
        with self._lock:
            return list(self._objects.values())
    def clear(self) -> None:
        """Remove all objects, regardless of source."""
        with self._lock:
            self._objects.clear()
def clear_detected(self) -> None:
"""Remove only detected objects (keep manual and config)."""
with self._lock:
to_delete = [
obj_id
for obj_id, obj in self._objects.items()
if obj.source == "detected"
]
for obj_id in to_delete:
del self._objects[obj_id]
# Clear broken relationships
for obj in self._objects.values():
if obj.on_top_of and obj.on_top_of not in self._objects:
obj.on_top_of = None
# === Query ===
def query(
self,
object_type: Optional[str] = None,
color: Optional[str] = None,
size: Optional[str] = None,
source: Optional[str] = None,
) -> List[SceneObject]:
"""Find objects matching criteria."""
with self._lock:
results: List[SceneObject] = []
for obj in self._objects.values():
if object_type is not None and obj.object_type != object_type:
continue
if color is not None and obj.color != color:
continue
if size is not None and obj.size != size:
continue
if source is not None and obj.source != source:
continue
results.append(obj)
return results
    def count(self) -> int:
        """Get total object count across all sources."""
        with self._lock:
            return len(self._objects)
# === Spatial Relationships ===
def set_on_top_of(self, object_id: str, below_id: Optional[str]) -> bool:
"""Set stacking relationship.
Returns True if set, False if object not found.
"""
with self._lock:
if object_id not in self._objects:
return False
if below_id is not None and below_id not in self._objects:
return False
self._objects[object_id].on_top_of = below_id
return True
def infer_relationships(self) -> None:
"""Infer stacking from positions.
Call after replace_detected() to automatically detect stacking.
A is on B if: A.z ≈ B.z + B.height AND A overlaps B horizontally.
"""
with self._lock:
self._infer_relationships_unlocked()
def _infer_relationships_unlocked(self) -> None:
"""Infer relationships without holding lock (internal use)."""
# Reset all relationships first
for obj in self._objects.values():
obj.on_top_of = None
for obj in self._objects.values():
best_match: Optional[SceneObject] = None
best_distance = float("inf")
for candidate in self._objects.values():
if candidate.id == obj.id:
continue
# Check if obj is on top of candidate
expected_z = candidate.position_mm[2] + candidate.height_mm
actual_z = obj.position_mm[2]
z_diff = abs(actual_z - expected_z)
if z_diff < self.Z_TOLERANCE_MM:
if self._overlaps_horizontally_unlocked(obj, candidate):
# Use closest match in z
if z_diff < best_distance:
best_match = candidate
best_distance = z_diff
if best_match is not None:
obj.on_top_of = best_match.id
def get_object_below(self, object_id: str) -> Optional[SceneObject]:
"""Get object this one is stacked on."""
with self._lock:
obj = self._objects.get(object_id)
if obj is None or obj.on_top_of is None:
return None
return self._objects.get(obj.on_top_of)
def get_objects_above(self, object_id: str) -> List[SceneObject]:
"""Get objects stacked directly on this one."""
with self._lock:
results: List[SceneObject] = []
for obj in self._objects.values():
if obj.on_top_of == object_id:
results.append(obj)
return results
def get_stack(self, object_id: str) -> List[SceneObject]:
"""Get all objects in stack (bottom to top).
Returns empty list if object not found.
"""
with self._lock:
obj = self._objects.get(object_id)
if obj is None:
return []
# Find bottom of stack
bottom = obj
while bottom.on_top_of is not None:
below = self._objects.get(bottom.on_top_of)
if below is None:
break
bottom = below
# Build stack from bottom up
stack = [bottom]
visited = {bottom.id}
while True:
# Find object on top of current top
current_top = stack[-1]
found_next = False
for candidate in self._objects.values():
if candidate.on_top_of == current_top.id and candidate.id not in visited:
stack.append(candidate)
visited.add(candidate.id)
found_next = True
break
if not found_next:
break
return stack
def effective_height(self, object_id: str) -> float:
"""Get height including objects below (if stacked).
Returns 0 if object not found.
"""
with self._lock:
obj = self._objects.get(object_id)
if obj is None:
return 0.0
return self._effective_height_unlocked(obj)
def _effective_height_unlocked(self, obj: SceneObject) -> float:
"""Calculate effective height without holding lock."""
if obj.on_top_of is None:
return obj.height_mm
below = self._objects.get(obj.on_top_of)
if below is None:
return obj.height_mm
return obj.height_mm + self._effective_height_unlocked(below)
def _overlaps_horizontally(self, a: SceneObject, b: SceneObject) -> bool:
"""Check if two objects overlap in x,y plane (thread-safe)."""
with self._lock:
return self._overlaps_horizontally_unlocked(a, b)
def _overlaps_horizontally_unlocked(self, a: SceneObject, b: SceneObject) -> bool:
"""Check horizontal overlap without lock.
If bounding boxes are available, use them.
Otherwise, check if centers are within a distance threshold.
"""
# If both have bounding boxes, use proper overlap check
if a.bbox_mm is not None and b.bbox_mm is not None:
# bbox format: [x1, y1, x2, y2]
ax1, ay1, ax2, ay2 = a.bbox_mm
bx1, by1, bx2, by2 = b.bbox_mm
# Check for no overlap (if any of these is true, no overlap)
if ax2 < bx1 or bx2 < ax1:
return False
if ay2 < by1 or by2 < ay1:
return False
return True
# Fall back to center distance check
# Default threshold: ~30mm (typical small object radius)
threshold = 40.0
dx = abs(a.position_mm[0] - b.position_mm[0])
dy = abs(a.position_mm[1] - b.position_mm[1])
return dx < threshold and dy < threshold
# === Detector Integration ===
def replace_detected(self, objects: List[Dict[str, Any]]) -> None:
"""Replace detected objects (keeps manual and config objects).
Automatically generates IDs and infers stacking relationships.
"""
with self._lock:
# Remove existing detected objects
to_delete = [
obj_id
for obj_id, obj in self._objects.items()
if obj.source == "detected"
]
for obj_id in to_delete:
del self._objects[obj_id]
# Add new detected objects
now_ns = time.time_ns()
counters: Dict[str, int] = {}
for obj_data in objects:
obj_type = obj_data.get("object_type", "unknown")
color = obj_data.get("color", "unknown")
# Generate unique ID
key = f"{obj_type}_{color}"
counters[key] = counters.get(key, 0) + 1
obj_id = f"{key}_{counters[key]}"
position = obj_data.get("position_mm", [0.0, 0.0, 0.0])
if len(position) < 3:
position = position + [0.0] * (3 - len(position))
scene_obj = SceneObject(
id=obj_id,
object_type=obj_type,
color=color,
size=obj_data.get("size", "normal"),
position_mm=position,
bbox_mm=obj_data.get("bbox_mm"),
height_mm=obj_data.get("height_mm", 30.0),
area_mm2=obj_data.get("area_mm2"),
source="detected",
confidence=obj_data.get("confidence", 1.0),
last_seen_ns=now_ns,
)
self._objects[obj_id] = scene_obj
# Clear broken relationships
for obj in self._objects.values():
if obj.on_top_of and obj.on_top_of not in self._objects:
obj.on_top_of = None
# Infer stacking relationships
self._infer_relationships_unlocked()
# === Serialization ===
def to_list(self) -> List[Dict[str, Any]]:
"""Convert all objects to list of dictionaries."""
with self._lock:
return [obj.to_dict() for obj in self._objects.values()]
def get_objects_as_dicts(self) -> List[Dict[str, Any]]:
"""Get all objects as dictionaries (for compatibility)."""
return self.to_list()
# === Object Matching (voice criteria) ===
def find_by_criteria(
self,
obj_type: str,
color: str,
size: str,
class_map: Optional[Dict[str, str]] = None,
) -> Optional[SceneObject]:
"""Find object matching voice command criteria.
Args:
obj_type: Object type from voice command (e.g., "cubo", "cilindro")
color: Color from voice command (e.g., "rojo", "azul")
size: Size from voice command (e.g., "grande", "pequeno")
class_map: Optional mapping to translate voice terms to detector terms
Returns:
First matching SceneObject, or None if not found.
"""
if obj_type == "no especificado":
return None
class_map = class_map or {}
def translate(token: str) -> str:
return class_map.get(token, token)
target_type = translate(obj_type)
target_color = translate(color) if color != "no especificado" else None
target_size = translate(size) if size != "no especificado" else None
results = self.query(
object_type=target_type,
color=target_color,
size=target_size,
)
return results[0] if results else None
# === Config Loading ===
    def load_from_config(self, config_path: Optional[str] = None) -> int:
        """Load static objects from TOML config file.

        Config format:

            [object_parameters]
            normal_height = 220.0

            [[static_objects]]
            type = "box"
            color = "blue"
            position = [100.0, -200.0]
            size = "big"
            height = 50.0

        Args:
            config_path: Path to config file. Defaults to CONFIG_FILE env var
                or "config.toml".

        Returns:
            Number of objects loaded (only those accepted by ``self.add``).
        """
        if config_path is None:
            config_path = os.getenv("CONFIG_FILE", "config.toml")
        cfg = self._load_toml(config_path)
        if not cfg:
            # Missing or unparseable file: load nothing, report zero.
            return 0
        obj_cfg = cfg.get("object_parameters", {})
        # Static objects are configured with (x, y) only; z comes from the
        # configured surface height (defaults to 220 mm).
        base_z = float(obj_cfg.get("normal_height", 220.0))
        count = 0
        counters: Dict[str, int] = {}
        for obj in cfg.get("static_objects", []):
            obj_type = obj.get("type", "box")
            color = obj.get("color", "unknown")
            pos = obj.get("position", [0, 0])
            # Generate unique ID, e.g. "config_box_blue_1" — the per-(type,
            # color) counter disambiguates duplicates.
            key = f"{obj_type}_{color}"
            counters[key] = counters.get(key, 0) + 1
            obj_id = f"config_{key}_{counters[key]}"
            scene_obj = SceneObject(
                id=obj_id,
                object_type=obj_type,
                color=color,
                size=obj.get("size", "big"),
                position_mm=[float(pos[0]), float(pos[1]), base_z],
                height_mm=obj.get("height", 30.0),
                source="config",
            )
            # NOTE(review): presumably add() returns False when it rejects an
            # object (e.g. duplicate ID) — only accepted objects are counted.
            if self.add(scene_obj):
                count += 1
        return count
@staticmethod
def _load_toml(path: str) -> Dict[str, Any]:
"""Load TOML file, returning empty dict on error."""
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as f:
return tomllib.load(f)
except Exception:
return {}
class SceneNotifier:
    """Pushes scene snapshots to connected clients through an output port."""

    def __init__(
        self,
        scene: SceneState,
        port: "OutputPort",
        logger: "VoiceControlLogger",
    ) -> None:
        self._scene = scene
        self._port = port
        self._logger = logger

    def _emit(self, reset: bool) -> None:
        """Serialize the scene and send it, optionally flagged as a reset."""
        payload: Dict[str, Any] = {"objects": self._scene.to_list()}
        if reset:
            payload["reset"] = True
        self._port.send_json(payload)

    def send_scene_update(self) -> None:
        """Broadcast the current scene contents."""
        self._emit(reset=False)

    def send_scene_reset(self) -> None:
        """Drop detected objects, then broadcast the cleared scene."""
        self._scene.clear_detected()
        self._emit(reset=True)
        self._logger.log("Sent scene reset notification")

    def send_initial_scene(self) -> None:
        """Broadcast the startup scene state as a reset."""
        self._emit(reset=True)
        self._logger.log("Sent initial scene reset notification")
class ObjectsHandler:
    """Handles objects detection events from the Dora node."""

    def __init__(
        self,
        scene: SceneState,
        notifier: SceneNotifier,
        logger: "VoiceControlLogger",
    ) -> None:
        self._scene = scene
        self._notifier = notifier
        self._logger = logger

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle an objects detection event.

        Parses the JSON payload, replaces the detector-sourced objects in
        the scene, and notifies clients. A malformed payload is logged and
        ignored (bug fix: it previously fell through with an empty list and
        wiped every detected object from the scene).
        """
        raw = event["value"][0].as_py() if len(event["value"]) else ""
        if not raw:
            return
        try:
            payload = json.loads(raw)
            objects = payload.get("objects", [])
        except Exception as e:
            self._logger.warn(f"failed to parse objects payload: {e}")
            # Keep the current scene rather than clearing it on bad input.
            return
        self._scene.replace_detected(objects)
        # Emit scene update so clients see the new detections.
        self._notifier.send_scene_update()

View File

@@ -22,9 +22,6 @@ class VoiceState:
latest_pose: Optional[List[float]] = None latest_pose: Optional[List[float]] = None
latest_pose_at: Optional[float] = None latest_pose_at: Optional[float] = None
latest_objects: List[Dict[str, Any]] = field(default_factory=list)
latest_objects_at: Optional[float] = None
static_objects: List[Dict[str, Any]] = field(default_factory=list)
pending_command: Optional[Dict[str, Any]] = None pending_command: Optional[Dict[str, Any]] = None
queue: Deque[RobotStep] = field(default_factory=deque) queue: Deque[RobotStep] = field(default_factory=deque)
@@ -45,12 +42,16 @@ class DebugState:
class SharedState: class SharedState:
"""Thread-safe shared state container.""" """Thread-safe shared state container.
Note: Object state (detected/static objects) has moved to SceneState.
This class now only handles robot queue, pose, and debug state.
"""
def __init__(self) -> None: def __init__(self) -> None:
self._lock = threading.Lock() self._lock = threading.Lock()
self.voice_state = VoiceState() self._voice_state = VoiceState()
self.debug_state = DebugState() self._debug_state = DebugState()
self._command_callback: Optional[Any] = None self._command_callback: Optional[Any] = None
def set_command_callback(self, callback: Any) -> None: def set_command_callback(self, callback: Any) -> None:
@@ -66,14 +67,12 @@ class SharedState:
def get_status(self) -> Dict[str, Any]: def get_status(self) -> Dict[str, Any]:
"""Get current status for web interface.""" """Get current status for web interface."""
with self._lock: with self._lock:
vs = self.voice_state vs = self._voice_state
ds = self.debug_state ds = self._debug_state
return { return {
"has_pose": vs.latest_pose is not None, "has_pose": vs.latest_pose is not None,
"pose": vs.latest_pose, "pose": vs.latest_pose,
"pose_age_ms": _age_ms(vs.latest_pose_at), "pose_age_ms": _age_ms(vs.latest_pose_at),
"object_count": len(vs.latest_objects),
"static_object_count": len(vs.static_objects),
"queue_size": len(vs.queue), "queue_size": len(vs.queue),
"has_pending_command": vs.pending_command is not None, "has_pending_command": vs.pending_command is not None,
"pending_command": vs.pending_command, "pending_command": vs.pending_command,
@@ -82,71 +81,116 @@ class SharedState:
"last_parse_result": ds.last_parse_result, "last_parse_result": ds.last_parse_result,
} }
def get_objects(self) -> Dict[str, Any]:
"""Get detected and static objects."""
with self._lock:
return {
"detected": list(self.voice_state.latest_objects),
"static": list(self.voice_state.static_objects),
}
def get_queue(self) -> List[Dict[str, Any]]: def get_queue(self) -> List[Dict[str, Any]]:
"""Get the command queue.""" """Get the command queue."""
with self._lock: with self._lock:
return [{"action": s.action, "payload": s.payload} for s in self.voice_state.queue] return [{"action": s.action, "payload": s.payload} for s in self._voice_state.queue]
def clear_queue(self) -> None:
"""Clear the command queue."""
with self._lock:
self._voice_state.queue.clear()
def pop_queue(self) -> Optional[RobotStep]:
"""Pop the next step from the queue. Returns None if empty."""
with self._lock:
if self._voice_state.queue:
return self._voice_state.queue.popleft()
return None
def append_queue(self, step: RobotStep) -> None:
"""Append a step to the command queue."""
with self._lock:
self._voice_state.queue.append(step)
def queue_size(self) -> int:
"""Get the size of the command queue."""
with self._lock:
return len(self._voice_state.queue)
def get_pose(self) -> Optional[List[float]]:
"""Get the latest TCP pose."""
with self._lock:
return self._voice_state.latest_pose
def update_pose(self, pose: List[float], timestamp: float) -> None:
"""Update the latest TCP pose."""
with self._lock:
self._voice_state.latest_pose = pose
self._voice_state.latest_pose_at = timestamp
def get_pending_command(self) -> Optional[Dict[str, Any]]:
"""Get the pending command."""
with self._lock:
return self._voice_state.pending_command
def set_pending_command(self, command: Optional[Dict[str, Any]]) -> None:
"""Set the pending command."""
with self._lock:
self._voice_state.pending_command = command
def get_history(self) -> List[Dict[str, Any]]: def get_history(self) -> List[Dict[str, Any]]:
"""Get command history.""" """Get command history."""
with self._lock: with self._lock:
return list(self.debug_state.command_history[-50:]) return list(self._debug_state.command_history[-50:])
def get_errors(self) -> List[Dict[str, Any]]: def get_errors(self) -> List[Dict[str, Any]]:
"""Get error log.""" """Get error log."""
with self._lock: with self._lock:
return list(self.debug_state.error_log[-50:]) return list(self._debug_state.error_log[-50:])
def add_to_history(self, entry: Dict[str, Any]) -> None: def add_to_history(self, entry: Dict[str, Any]) -> None:
"""Add entry to command history.""" """Add entry to command history."""
with self._lock: with self._lock:
self.debug_state.command_history.append(entry) self._debug_state.command_history.append(entry)
if len(self.debug_state.command_history) > 100: if len(self._debug_state.command_history) > 100:
self.debug_state.command_history = self.debug_state.command_history[-100:] self._debug_state.command_history = self._debug_state.command_history[-100:]
def add_error(self, error: Dict[str, Any]) -> None: def add_error(self, error: Dict[str, Any]) -> None:
"""Add entry to error log.""" """Add entry to error log."""
with self._lock: with self._lock:
self.debug_state.error_log.append(error) self._debug_state.error_log.append(error)
if len(self.debug_state.error_log) > 100: if len(self._debug_state.error_log) > 100:
self.debug_state.error_log = self.debug_state.error_log[-100:] self._debug_state.error_log = self._debug_state.error_log[-100:]
def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None: def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
"""Update last voice input info.""" """Update last voice input info."""
with self._lock: with self._lock:
self.debug_state.last_voice_input = text self._debug_state.last_voice_input = text
self.debug_state.last_voice_input_at = timestamp self._debug_state.last_voice_input_at = timestamp
self.debug_state.last_parse_result = parse_result self._debug_state.last_parse_result = parse_result
def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None: def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
"""Update last robot command info.""" """Update last robot command info."""
with self._lock: with self._lock:
self.debug_state.last_robot_command = command self._debug_state.last_robot_command = command
self.debug_state.last_robot_command_at = timestamp self._debug_state.last_robot_command_at = timestamp
def update_image(self, image_bytes: bytes, timestamp: float) -> None: def update_image(self, image_bytes: bytes, timestamp: float) -> None:
"""Update latest camera image.""" """Update latest camera image."""
with self._lock: with self._lock:
self.debug_state.latest_image = image_bytes self._debug_state.latest_image = image_bytes
self.debug_state.latest_image_at = timestamp self._debug_state.latest_image_at = timestamp
def get_image(self) -> Optional[bytes]: def get_image(self) -> Optional[bytes]:
"""Get latest camera image.""" """Get latest camera image."""
with self._lock: with self._lock:
return self.debug_state.latest_image return self._debug_state.latest_image
def get_image_age_ms(self) -> Optional[int]: def get_image_age_ms(self) -> Optional[int]:
"""Get age of latest image in milliseconds.""" """Get age of latest image in milliseconds."""
with self._lock: with self._lock:
return _age_ms(self.debug_state.latest_image_at) return _age_ms(self._debug_state.latest_image_at)
def get_last_voice_input(self) -> Optional[str]:
"""Get the last voice input text."""
with self._lock:
return self._debug_state.last_voice_input
def get_last_parse_result(self) -> Optional[Dict[str, Any]]:
"""Get the last parse result."""
with self._lock:
return self._debug_state.last_parse_result
def _age_ms(timestamp: Optional[float]) -> Optional[int]: def _age_ms(timestamp: Optional[float]) -> Optional[int]:

View File

@@ -0,0 +1,231 @@
"""Voice processing and intent handling."""
from __future__ import annotations
import threading
import time
import unicodedata
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Any, Callable, Deque, Dict, List, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from .behavior import RobotBehavior
from .state import SharedState
from .node import OutputPort, VoiceControlLogger
# Default action vocabulary (used when no behavior vocabulary is provided).
# Maps a canonical action name to the Spanish trigger words/conjugations that
# rule_parse() looks for via substring match on the normalized transcript.
# Dict order matters: earlier entries win when several keywords match.
DEFAULT_ACTION_VOCABULARY: Dict[str, List[str]] = {
    "reiniciar": ["reiniciar", "reinicia", "reset"],
    "subir": ["subir", "sube", "arriba"],
    "bajar": ["bajar", "baja", "abajo"],
    "soltar": ["soltar", "deja", "dejar"],
    "tomar": ["tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar"],
    "ir": ["ir", "ve", "mover", "muevete", "acercar"],
}
def normalize(text: str) -> str:
    """Return *text* lowercased, stripped, and with accents removed.

    Decomposes to NFKD and drops combining marks, so e.g. "Pequeño"
    becomes "pequeno".
    """
    text = text.lower().strip()
    text = unicodedata.normalize("NFKD", text)
    return "".join(c for c in text if not unicodedata.combining(c))


# Attribute keywords searched by rule_parse(), in priority order (first match
# wins). Entries are accent-free because they are matched against
# normalize()d text. Size entries map trigger token -> reported label.
_COLOR_KEYWORDS = ["rojo", "azul", "amarillo", "blanco"]
_OBJECT_KEYWORDS = ["estrella", "cilindro", "cubo", "caja"]
_SIZE_KEYWORDS = [
    ("grande", "grande"),
    ("pequeno", "pequeno"),
    # Generalization: feminine forms were previously never matched because
    # normalize() folds "pequeña" to "pequena", which the old substring
    # check for "pequeno" missed. The explicit "pequeño" check was dead
    # code for the same reason (the text is already accent-free).
    ("pequena", "pequeno"),
    ("chico", "pequeno"),
    ("chica", "pequeno"),
]


def rule_parse(transcript: str, vocabulary: Optional[Dict[str, List[str]]] = None) -> Dict[str, str]:
    """Parse a Spanish voice command using simple keyword rules.

    Args:
        transcript: The raw voice input text.
        vocabulary: Optional action vocabulary mapping action names to
            trigger keywords. Defaults to DEFAULT_ACTION_VOCABULARY.
            Earlier entries take priority when several match.

    Returns:
        ``{"resultado": "error"}`` when no action keyword is found,
        otherwise a dict with resultado/accion/objeto/color/tamano where
        unrecognized attributes are reported as "no especificado".
    """
    text = normalize(transcript)
    vocab = vocabulary if vocabulary is not None else DEFAULT_ACTION_VOCABULARY

    # First vocabulary entry with any keyword present in the transcript.
    action = next(
        (
            name
            for name, keywords in vocab.items()
            if any(normalize(word) in text for word in keywords)
        ),
        "error",
    )
    if action == "error":
        return {"resultado": "error"}

    color = next((c for c in _COLOR_KEYWORDS if c in text), "no especificado")
    obj = next((o for o in _OBJECT_KEYWORDS if o in text), "no especificado")
    size = next(
        (label for token, label in _SIZE_KEYWORDS if token in text),
        "no especificado",
    )

    return {
        "resultado": "ok",
        "accion": action,
        "objeto": obj,
        "color": color,
        "tamano": size,
    }
@dataclass
class Intent:
    """Parsed voice command intent."""

    action: str
    obj: str = "no especificado"
    color: str = "no especificado"
    size: str = "no especificado"

    def to_dict(self) -> Dict[str, str]:
        """Serialize the intent as a plain dict."""
        return dict(
            action=self.action,
            obj=self.obj,
            color=self.color,
            size=self.size,
        )

    @classmethod
    def from_parse_result(cls, result: Dict[str, str]) -> Optional["Intent"]:
        """Build an Intent from a parse result, or None when parsing failed."""
        if result.get("resultado") != "ok":
            return None
        unspecified = "no especificado"
        return cls(
            action=result.get("accion", "error"),
            obj=result.get("objeto", unspecified),
            color=result.get("color", unspecified),
            size=result.get("tamano", unspecified),
        )
class IntentQueue:
    """Thread-safe FIFO of parsed voice intents."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._queue: Deque[Intent] = deque()

    def append(self, intent: Intent) -> None:
        """Enqueue an intent at the back."""
        with self._lock:
            self._queue.append(intent)

    def popleft(self) -> Optional[Intent]:
        """Dequeue and return the oldest intent, or None when empty."""
        with self._lock:
            return self._queue.popleft() if self._queue else None

    def __len__(self) -> int:
        """Number of queued intents."""
        with self._lock:
            return len(self._queue)

    def __bool__(self) -> bool:
        """True when at least one intent is queued."""
        with self._lock:
            return len(self._queue) > 0

    def clear(self) -> None:
        """Discard every queued intent."""
        with self._lock:
            self._queue.clear()
class VoiceInputHandler:
    """Handles voice_in events from the Dora node."""

    def __init__(
        self,
        state: "SharedState",
        behavior: "RobotBehavior",
        intent_queue: IntentQueue,
        port: "OutputPort",
        logger: "VoiceControlLogger",
    ) -> None:
        self._state = state
        self._behavior = behavior
        self._intent_queue = intent_queue
        self._port = port
        self._logger = logger
        # Single worker: commands are parsed off the event loop, in order.
        self._executor = ThreadPoolExecutor(max_workers=1)

    def handle(self, event: Dict[str, Any]) -> None:
        """Handle a voice_in event; LLM parsing runs on a worker thread."""
        value = event["value"]
        transcript = value[0].as_py() if len(value) else ""
        if transcript:
            self._executor.submit(self._process_and_respond, transcript)

    def _process_and_respond(self, transcript: str) -> None:
        """Worker-thread entry: parse the command and publish the reply."""
        self._port.send_json(self._process_command(transcript))

    def _process_command(self, transcript: str) -> Dict[str, str]:
        """Parse a transcript, queue its intent, and build the spoken reply."""
        self._logger.log(f'Voice input received: "{transcript}"')
        llm_result = self._behavior.parse_command(transcript)
        self._logger.log(f"Parse result: {llm_result}")
        # Record for the debug/web UI.
        self._state.update_voice_input(transcript, llm_result, time.monotonic())
        if llm_result.get("resultado") != "ok":
            self._logger.log("Command not understood")
            return {"text": "No entendi el comando", "status": "error"}
        intent = Intent.from_parse_result(llm_result)
        if intent:
            self._logger.log(
                "Intent: action=%s, object=%s, color=%s, size=%s"
                % (intent.action, intent.obj, intent.color, intent.size)
            )
            self._intent_queue.append(intent)
            # Add to history
            self._state.add_to_history(
                {
                    "timestamp": time.time(),
                    "input": transcript,
                    "action": intent.action,
                    "object": intent.obj,
                    "color": intent.color,
                    "size": intent.size,
                }
            )
        return {"text": f"Ok, voy a {llm_result.get('accion')}", "status": "ok"}

    def create_command_callback(self) -> Callable[[str], Dict[str, str]]:
        """Build a closure the web interface can call with raw transcripts."""

        def run(transcript: str) -> Dict[str, str]:
            return self._process_command(transcript)

        return run

View File

@@ -1,500 +1,176 @@
"""Dora node for voice control with safe robot commands.""" """Dora node for voice control - thin orchestrator."""
from __future__ import annotations from __future__ import annotations
import json
import os import os
import sys
import time
import uuid
from collections import deque
from typing import Any, Deque, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node from dora import Node
try: from dora_voice_control.core import (
import tomllib load_voice_config,
except ModuleNotFoundError: load_llm_config,
import tomli as tomllib load_api_config,
SharedState,
# Handle both package and direct script execution ActionContext,
# __package__ is None when run as script, '' when imported from a script VoiceInputHandler,
_RUNNING_AS_SCRIPT = not __package__ IntentQueue,
ObjectsHandler,
if _RUNNING_AS_SCRIPT: SceneNotifier,
# Running as script - use absolute imports SceneState,
_pkg_dir = os.path.dirname(os.path.abspath(__file__)) PoseHandler,
if _pkg_dir not in sys.path: StatusHandler,
sys.path.insert(0, _pkg_dir) CommandQueueService,
from config import VoiceConfig, load_api_config, load_voice_config ImageHandler,
from parser import normalize, parse_command DoraNodeAdapter,
from state import RobotStep, SharedState EventDispatcher,
from api import start_api_server VoiceControlLogger,
else: NodeContext,
# Running as package - use relative imports )
from .config import VoiceConfig, load_api_config, load_voice_config from dora_voice_control.web import start_api_server
from .parser import normalize, parse_command from dora_voice_control.robots import get_behavior, get_robot_adapter
from .state import RobotStep, SharedState
from .api import start_api_server
def _within_bounds(
point_mm: np.ndarray,
min_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
max_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
) -> bool:
"""Check if point is within workspace bounds."""
x, y, z = point_mm.tolist()
min_x, min_y, min_z = min_xyz
max_x, max_y, max_z = max_xyz
if min_x is not None and x < min_x:
return False
if max_x is not None and x > max_x:
return False
if min_y is not None and y < min_y:
return False
if max_y is not None and y > max_y:
return False
if min_z is not None and z < min_z:
return False
if max_z is not None and z > max_z:
return False
return True
def _translate_target(token: str, mapping: Dict[str, str]) -> str:
"""Translate object name using class map."""
if token in mapping:
return mapping[token]
return token
def _load_config_file(path: str) -> Dict[str, Any]:
"""Load TOML configuration file."""
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception:
return {}
def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]:
"""Load bucket positions from config file."""
cfg = _load_config_file(config_path)
buckets = cfg.get("bucket_positions", {})
obj_cfg = cfg.get("object_parameters", {})
base_z = float(obj_cfg.get("normal_height", 220.0))
out = []
for key, color in [
("blue_bucket_pos", "blue"),
("red_bucket_pos", "red"),
("yellow_bucket_pos", "yellow"),
("white_bucket_pos", "white"),
]:
pos = buckets.get(key)
if not isinstance(pos, list) or len(pos) < 2:
continue
out.append(
{
"object_type": "box",
"color": color,
"size": "big",
"position_mm": [float(pos[0]), float(pos[1]), base_z],
"source": "config",
}
)
return out
def _send_dora_command(
node: Node, output_name: str, action: str, payload: Dict[str, Any]
) -> str:
"""Send a robot command via Dora."""
command_id = str(uuid.uuid4())
message = {"id": command_id, "action": action, "payload": payload}
node.send_output(
output_name,
pa.array([json.dumps(message)]),
metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
)
return command_id
def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
"""Parse status payload from robot."""
if len(value) == 0:
return None
raw = value[0].as_py()
if not raw:
return None
try:
return json.loads(raw)
except Exception:
return None
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[voice_control {timestamp}] {msg}", flush=True)
def main() -> None: def main() -> None:
"""Main entry point for the voice control node.""" """Main entry point for the voice control node."""
_log("Starting voice control node...") logger = VoiceControlLogger()
logger.log("Starting voice control node...")
# Load configuration # Configuration
cfg = load_voice_config() cfg = load_voice_config()
llm_cfg = load_llm_config()
api_cfg = load_api_config() api_cfg = load_api_config()
# Environment variables for I/O topics # Environment variables for I/O topics
objects_input = os.getenv("OBJECTS_INPUT", "objects") objects_input = os.getenv("OBJECTS_INPUT", "objects")
voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in") voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in")
voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out")
scene_output = os.getenv("SCENE_OUTPUT", "scene_update")
pose_input = os.getenv("POSE_INPUT", "tcp_pose") pose_input = os.getenv("POSE_INPUT", "tcp_pose")
status_input = os.getenv("STATUS_INPUT", "status") status_input = os.getenv("STATUS_INPUT", "status")
command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd")
image_input = os.getenv("IMAGE_INPUT", "image_annotated") image_input = os.getenv("IMAGE_INPUT", "image_annotated")
llm_provider = os.getenv("LLM_PROVIDER", "rules").lower()
config_file = os.getenv("CONFIG_FILE", "config.toml")
# Image dimensions (will be detected from first frame) # Home position
image_width = int(os.getenv("IMAGE_WIDTH", "1280")) home_pose = {
image_height = int(os.getenv("IMAGE_HEIGHT", "720")) "x": float(os.getenv("INIT_X", "300.0")),
"y": float(os.getenv("INIT_Y", "0.0")),
"z": float(os.getenv("INIT_Z", "250.0")),
"roll": float(os.getenv("INIT_ROLL", "180.0")),
"pitch": float(os.getenv("INIT_PITCH", "0.0")),
"yaw": float(os.getenv("INIT_YAW", "0.0")),
}
# Initial/home position for reset command # State
init_x = float(os.getenv("INIT_X", "300.0")) state = SharedState()
init_y = float(os.getenv("INIT_Y", "0.0")) scene = SceneState()
init_z = float(os.getenv("INIT_Z", "250.0")) loaded = scene.load_from_config()
init_roll = float(os.getenv("INIT_ROLL", "180.0")) logger.log(f"Loaded {loaded} static objects from config")
init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
init_yaw = float(os.getenv("INIT_YAW", "0.0"))
_log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm") # Robot adapter and behavior
_log(f"Initial position: [{init_x}, {init_y}, {init_z}]") robot_type = os.getenv("ROBOT_TYPE", "vacuum").lower()
_log(f"LLM provider: {llm_provider}") robot_adapter = get_robot_adapter(robot_type, logger)
_log(f"Dry run: {cfg.dry_run}") behavior = get_behavior(robot_type, cfg, llm_cfg)
logger.log(f"Robot type: {robot_type} ({type(robot_adapter).__name__})")
logger.log(f"Available actions: {list(behavior.get_all_keywords().keys())}")
logger.log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm")
logger.log(f"Initial position: [{home_pose['x']}, {home_pose['y']}, {home_pose['z']}]")
logger.log(f"LLM provider: {llm_cfg.provider}/{llm_cfg.model}")
logger.log(f"Dry run: {cfg.dry_run}")
# Initialize shared state # Node context with output ports
shared_state = SharedState() ctx = NodeContext.create(
state = shared_state.voice_state adapter=DoraNodeAdapter(Node()),
state.static_objects = _load_bucket_objects(config_file) logger=logger,
pending_intents: Deque[Dict[str, Any]] = deque() robot_output=os.getenv("COMMAND_OUTPUT", "robot_cmd"),
scene_output=os.getenv("SCENE_OUTPUT", "scene_update"),
voice_output=os.getenv("VOICE_OUT_OUTPUT", "voice_out"),
)
_log(f"Loaded {len(state.static_objects)} static objects from config") # Services (using typed output ports)
intent_queue = IntentQueue()
scene_notifier = SceneNotifier(scene, ctx.scene_out, ctx.logger)
command_queue = CommandQueueService(state, ctx.robot_out, cfg, ctx.logger)
# Queue initial position movement on startup (same as reiniciar) # Handlers
init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes") voice_handler = VoiceInputHandler(state, behavior, intent_queue, ctx.voice_out, ctx.logger)
send_init_scene_reset = init_on_start # Flag to send scene reset after node starts
if init_on_start:
_log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
# Clear detected objects
state.latest_objects = []
state.latest_objects_at = None
# Queue vacuum off and move to home
state.queue.append(RobotStep(action="vacuum_off", payload={}))
state.queue.append(
RobotStep(
action="move_to_pose",
payload={
"x": init_x,
"y": init_y,
"z": init_z,
"roll": init_roll,
"pitch": init_pitch,
"yaw": init_yaw,
},
)
)
def command_handler(transcript: str) -> Dict[str, str]:
"""Handle voice command and return response."""
_log(f"Voice input received: \"{transcript}\"")
llm_result = parse_command(transcript, llm_provider)
_log(f"Parse result: {llm_result}")
# Update debug state
shared_state.update_voice_input(transcript, llm_result, time.monotonic())
if llm_result.get("resultado") != "ok":
_log("Command not understood")
return {"text": "No entendi el comando", "status": "error"}
action = llm_result.get("accion", "error")
obj = llm_result.get("objeto", "no especificado")
color = llm_result.get("color", "no especificado")
size = llm_result.get("tamano", "no especificado")
_log(f"Intent: action={action}, object={obj}, color={color}, size={size}")
pending_intents.append(
{"action": action, "obj": obj, "color": color, "size": size}
)
# Add to history
shared_state.add_to_history({
"timestamp": time.time(),
"input": transcript,
"action": action,
"object": obj,
"color": color,
"size": size,
})
return {"text": f"Ok, voy a {action}", "status": "ok"}
# Set command callback for web interface # Set command callback for web interface
shared_state.set_command_callback(command_handler) state.set_command_callback(voice_handler.create_command_callback())
# Start web API server if enabled # Queue initial movements on startup
init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes")
if init_on_start:
ctx.logger.log(f"Startup: resetting scene and moving to home [{home_pose['x']}, {home_pose['y']}, {home_pose['z']}]")
scene.clear_detected()
for step in robot_adapter.reset_tool():
state.append_queue(step)
for step in robot_adapter.move(home_pose):
state.append_queue(step)
# Event dispatcher
dispatcher = EventDispatcher(ctx.adapter, ctx.logger)
dispatcher.register(voice_in_input, voice_handler)
dispatcher.register(pose_input, PoseHandler(state, ctx.logger))
dispatcher.register(objects_input, ObjectsHandler(scene, scene_notifier, ctx.logger))
dispatcher.register(image_input, ImageHandler(state, ctx.logger))
dispatcher.register(status_input, StatusHandler(state, ctx.logger))
# First event callback - send initial scene
if init_on_start:
dispatcher.on_first_event(scene_notifier.send_initial_scene)
# Tick callbacks - process intents and dispatch commands
def process_intents() -> None:
"""Process pending voice intents."""
intent = intent_queue.popleft()
if not intent:
return
ctx.logger.log(f"Processing intent: {intent.action} {intent.obj} {intent.color} {intent.size}")
latest_pose = state.get_pose()
detected_count = len(scene.query(source="detected"))
config_count = len(scene.query(source="config"))
ctx.logger.log(f"Available objects: {detected_count} detected + {config_count} config")
# Find target object if action requires it
action_info = behavior.get_action_info(intent.action)
target_obj = None
if action_info and action_info.requires_object:
target_obj = scene.find_by_criteria(
intent.obj, intent.color, intent.size, cfg.class_map
)
if not target_obj:
ctx.logger.log(f"Target object not found: {intent.obj} {intent.color}")
return
ctx.logger.log(f"Found target: {target_obj.object_type} {target_obj.color} at {target_obj.position_mm}")
# Build action context
context = ActionContext(
pose=latest_pose,
target=target_obj,
scene=scene,
config=cfg,
home_pose=home_pose,
shared_state=state,
)
# Execute action
if behavior.execute(intent.action, context):
ctx.logger.log(f"Executed action: {intent.action}")
if intent.action == "reiniciar":
scene_notifier.send_scene_reset()
else:
ctx.logger.log(f"Failed to execute action: {intent.action}")
ctx.logger.log(f"Queue size: {state.queue_size()}")
dispatcher.on_tick(process_intents)
dispatcher.on_tick(command_queue.dispatch_next)
# Start API server
if api_cfg.enabled: if api_cfg.enabled:
start_api_server(shared_state, api_cfg) start_api_server(state, scene, api_cfg, behavior.get_all_keywords)
# Create Dora node # Run event loop
node = Node() dispatcher.run()
_log("Dora node created, waiting for events...")
first_event = True
for event in node:
# Send scene reset on first event (startup)
if first_event and send_init_scene_reset:
first_event = False
scene_payload = json.dumps(
{"objects": list(state.static_objects), "reset": True}
)
node.send_output(
scene_output,
pa.array([scene_payload]),
metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
)
_log("Sent initial scene reset notification")
if event["type"] != "INPUT":
continue
# Handle voice input
if event["id"] == voice_in_input:
raw = event["value"][0].as_py() if len(event["value"]) else ""
if not raw:
continue
response = command_handler(raw)
node.send_output(
voice_out_output,
pa.array([json.dumps(response)]),
metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
)
continue
# Handle pose updates
if event["id"] == pose_input:
tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
if tcp_pose.size >= 6:
state.latest_pose = tcp_pose[:6].tolist()
state.latest_pose_at = time.monotonic()
continue
# Handle object detection updates
if event["id"] == objects_input:
raw = event["value"][0].as_py() if len(event["value"]) else ""
if raw:
try:
payload = json.loads(raw)
objects = payload.get("objects", [])
except Exception:
objects = []
state.latest_objects = objects
state.latest_objects_at = time.monotonic()
continue
# Handle camera image
if event["id"] == image_input:
try:
# Get raw image data
img_data = event["value"].to_numpy()
# Reshape to image (assuming BGR format)
img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8)
# Encode to JPEG
_, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
shared_state.update_image(jpeg_data.tobytes(), time.monotonic())
except Exception as e:
# Log error but don't crash
pass
continue
# Handle robot status updates
if event["id"] == status_input:
payload = _parse_status_payload(event["value"])
if payload and state.pending_command:
if payload.get("command_id") == state.pending_command.get("id"):
_log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})")
state.pending_command = None
continue
# Process pending intents
if pending_intents:
intent = pending_intents.popleft()
action = intent["action"]
obj = intent["obj"]
color = intent["color"]
size = intent["size"]
_log(f"Processing intent: {action} {obj} {color} {size}")
latest_pose = state.latest_pose
objects = list(state.latest_objects) + list(state.static_objects)
_log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static")
if action in ("subir", "bajar") and latest_pose:
delta = cfg.step_mm if action == "subir" else -cfg.step_mm
target = np.array(latest_pose[:3], dtype=np.float64)
target[2] += delta
if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
step = RobotStep(
action="move_to_pose",
payload={
"x": float(target[0]),
"y": float(target[1]),
"z": float(target[2]),
"roll": cfg.default_roll,
"pitch": cfg.default_pitch,
"yaw": cfg.default_yaw,
},
)
state.queue.append(step)
_log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})")
else:
_log(f"Target {target.tolist()} out of bounds, skipping")
elif action in ("ir", "tomar", "soltar"):
target_obj = None
if obj != "no especificado":
target_name = _translate_target(obj, cfg.class_map)
target_color = _translate_target(color, cfg.class_map)
_log(f"Looking for: type={target_name}, color={target_color}")
# Log available objects for debugging
for o in objects:
_log(f" -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}")
for o in objects:
if o.get("object_type") == target_name:
if color == "no especificado" or o.get("color") == target_color:
if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map):
target_obj = o
break
if target_obj:
_log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}")
pos = np.array(target_obj["position_mm"], dtype=np.float64)
approach = pos.copy()
approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm
target = pos.copy()
target[2] += cfg.tcp_offset_mm
if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max):
state.queue.append(
RobotStep(
action="move_to_pose",
payload={
"x": float(approach[0]),
"y": float(approach[1]),
"z": float(approach[2]),
"roll": cfg.default_roll,
"pitch": cfg.default_pitch,
"yaw": cfg.default_yaw,
},
)
)
_log(f"Queued: approach pose at Z={approach[2]:.1f}mm")
if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
state.queue.append(
RobotStep(
action="move_to_pose",
payload={
"x": float(target[0]),
"y": float(target[1]),
"z": float(target[2]),
"roll": cfg.default_roll,
"pitch": cfg.default_pitch,
"yaw": cfg.default_yaw,
},
)
)
_log(f"Queued: target pose at Z={target[2]:.1f}mm")
if action == "tomar":
state.queue.append(RobotStep(action="vacuum_on", payload={}))
_log("Queued: vacuum_on")
elif action == "soltar":
state.queue.append(RobotStep(action="vacuum_off", payload={}))
_log("Queued: vacuum_off")
else:
_log(f"Target object not found: {obj} {color}")
continue
elif action == "reiniciar":
_log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
# Turn off vacuum first
state.queue.append(RobotStep(action="vacuum_off", payload={}))
# Clear current detected objects (will be refreshed by detector)
state.latest_objects = []
state.latest_objects_at = None
_log("Cleared detected objects - waiting for fresh detection")
# Move to initial position
state.queue.append(
RobotStep(
action="move_to_pose",
payload={
"x": init_x,
"y": init_y,
"z": init_z,
"roll": init_roll,
"pitch": init_pitch,
"yaw": init_yaw,
},
)
)
_log(f"Queued: vacuum_off + move to home")
# Send scene update to notify clients that scene was reset
scene_payload = json.dumps(
{"objects": list(state.static_objects), "reset": True}
)
node.send_output(
scene_output,
pa.array([scene_payload]),
metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
)
_log("Sent scene reset notification")
_log(f"Queue size: {len(state.queue)}")
# Emit scene updates when objects change
if event["id"] == objects_input:
scene_payload = json.dumps(
{"objects": list(state.latest_objects) + list(state.static_objects)}
)
node.send_output(
scene_output,
pa.array([scene_payload]),
metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
)
# Send queued robot steps one at a time
if state.pending_command is None and state.queue:
step = state.queue.popleft()
if cfg.dry_run:
_log(f"[DRY RUN] Would send: {step.action} {step.payload}")
state.pending_command = None
continue
cmd_id = _send_dora_command(node, command_output, step.action, step.payload)
state.pending_command = {"id": cmd_id, "action": step.action}
_log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}")
# Update debug state
shared_state.update_robot_command(
{"id": cmd_id, "action": step.action, "payload": step.payload},
time.monotonic(),
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,118 +0,0 @@
"""Voice command parsing logic."""
from __future__ import annotations
import json
import os
import unicodedata
from typing import Dict
def normalize(text: str) -> str:
    """Lowercase *text*, trim surrounding whitespace, and strip accents."""
    decomposed = unicodedata.normalize("NFKD", text.lower().strip())
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def rule_parse(transcript: str) -> Dict[str, str]:
    """Parse a Spanish voice command with simple keyword rules.

    Returns a dict with ``resultado`` set to ``"error"`` when no action
    keyword is found, otherwise ``"ok"`` plus the ``accion``, ``objeto``,
    ``color`` and ``tamano`` fields (unmatched attributes default to
    ``"no especificado"``).
    """
    text = normalize(transcript)

    # Ordered action table: the first action with a matching keyword wins.
    # Order matters (e.g. "subir" must be tested before "ir", which it contains).
    _ACTION_KEYWORDS = (
        ("reiniciar", ("reiniciar", "reinicia", "reset")),
        ("subir", ("sube", "subir", "arriba")),
        ("bajar", ("baja", "bajar", "abajo")),
        ("soltar", ("soltar", "deja", "dejar")),
        ("tomar", ("tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar")),
        ("ir", ("ir", "ve", "mover", "muevete", "acercar")),
    )
    action = next(
        (name for name, words in _ACTION_KEYWORDS if any(w in text for w in words)),
        "error",
    )
    if action == "error":
        return {"resultado": "error"}

    def first_present(candidates: tuple) -> str:
        """Return the first candidate keyword found in the text, if any."""
        for word in candidates:
            if word in text:
                return word
        return "no especificado"

    color = first_present(("rojo", "azul", "amarillo", "blanco"))
    obj = first_present(("estrella", "cilindro", "cubo", "caja"))

    if "grande" in text:
        size = "grande"
    elif any(w in text for w in ("pequeno", "pequeño", "chico")):
        # "pequeño" is kept for parity even though normalize() strips the tilde.
        size = "pequeno"
    else:
        size = "no especificado"

    return {
        "resultado": "ok",
        "accion": action,
        "objeto": obj,
        "color": color,
        "tamano": size,
    }
def build_gemini_prompt(transcript: str) -> str:
    """Build prompt for Gemini LLM parsing.

    The prompt asks the model for a JSON object with the keys 'resultado',
    'accion', 'objeto', 'color' and 'tamano', normalized to the closed
    vocabularies enumerated in the prompt text itself.
    """
    # NOTE: the Spanish text below is runtime model input tuned for
    # children's speech — do not translate, reflow, or "fix" its wording.
    return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar
un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color
o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase,
responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones
posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo',
'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella,
cilindro, cubo y caja; cualquier otro objeto es error.
Comando: "{transcript}"
Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de
los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos
'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto.
"""
def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]:
    """Parse a voice command using the specified provider.

    Args:
        transcript: Raw transcribed voice command (Spanish).
        llm_provider: "gemini" to use the Gemini API; any other value uses
            the rule-based parser. The Gemini path degrades gracefully to
            rules when the SDK or API key is missing, or on API errors.

    Returns:
        Dict with "resultado" ("ok"/"error") and, on success, the
        "accion"/"objeto"/"color"/"tamano" fields.
    """
    if llm_provider != "gemini":
        return rule_parse(transcript)

    try:
        from google import genai
        from google.genai import types
    except Exception:
        # SDK not installed: fall back to the rule-based parser.
        return rule_parse(transcript)

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        return rule_parse(transcript)

    try:
        client = genai.Client(api_key=api_key)
        reply = client.models.generate_content(
            model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"),
            contents=build_gemini_prompt(transcript),
            config=types.GenerateContentConfig(temperature=0.5),
        )
        # reply.text can be None; guard before str() so an empty answer is
        # treated as a parse failure, not the literal string "None".
        # The model also sometimes wraps its JSON in markdown code fences.
        raw = str(reply.text or "").replace("```json", "").replace("```", "").strip()
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The model answered, but not with valid JSON: report a parse error.
        return {"resultado": "error"}
    except Exception:
        # Network/API failures: degrade gracefully to the rule-based parser.
        return rule_parse(transcript)

    # Well-formed JSON that is not the expected object shape is still an error.
    if not isinstance(parsed, dict) or "resultado" not in parsed:
        return {"resultado": "error"}
    return parsed

View File

@@ -0,0 +1,50 @@
"""Robot registry and factory helpers."""
from __future__ import annotations
from typing import Optional
from ..core.behavior import RobotBehavior
from ..core.config import LLMConfig, VoiceConfig
from ..core.node import VoiceControlLogger
from ..core.robot import RobotAdapter
from .littlehand.adapter import (
ALIASES as _littlehand_aliases,
NAME as _littlehand_name,
VacuumGripperAdapter,
)
from .littlehand.behavior import LittlehandBehavior
def _resolve_robot_type(robot_type: str) -> tuple[str, bool]:
    """Resolve a robot type or alias to a registered robot name.

    Returns ``(resolved_name, known)`` where *known* is False when the
    input matched no registered robot and the default was used instead.
    """
    normalized = robot_type.strip().lower()
    known = normalized == _littlehand_name or normalized in _littlehand_aliases
    # Littlehand is currently the only registered robot, so it is always
    # the resolution target — even for unknown inputs.
    return _littlehand_name, known
def get_robot_adapter(robot_type: str, logger: Optional[VoiceControlLogger] = None) -> RobotAdapter:
    """Factory to get robot adapter by type.

    Unknown types are logged (when a logger is given) and mapped to the
    default robot.
    """
    resolved, known = _resolve_robot_type(robot_type)
    if logger is not None and not known:
        logger.warn(f"Unknown robot type '{robot_type}', defaulting to {resolved}")
    # Only one adapter is registered at the moment.
    return VacuumGripperAdapter()
def get_behavior(
    robot_type: str,
    config: VoiceConfig,
    llm_config: Optional[LLMConfig] = None,
) -> RobotBehavior:
    """Factory to get robot behavior by type.

    Every resolvable type currently maps to the Littlehand behavior, so the
    resolution result only documents intent until more robots are registered.
    """
    _resolved, _known = _resolve_robot_type(robot_type)
    return LittlehandBehavior(config, VacuumGripperAdapter(), llm_config)
__all__ = [
"get_robot_adapter",
"get_behavior",
]

View File

@@ -0,0 +1 @@
"""Littlehand robot package."""

View File

@@ -0,0 +1,48 @@
"""Action definitions for Littlehand."""
from __future__ import annotations
from ...core.behavior import ActionInfo
# Littlehand starts from the default pick-and-place actions.
# Keys are the canonical Spanish action names used by the voice parser;
# aliases list alternative wordings children commonly use.
LITTLEHAND_ACTIONS: dict[str, ActionInfo] = {
    # Vertical jogs: need the current TCP pose to offset Z
    # (see LittlehandBehavior.action_subir / action_bajar).
    "subir": ActionInfo(
        name="subir",
        aliases=["sube", "arriba"],
        requires_pose=True,
        description="Subir el robot",
    ),
    "bajar": ActionInfo(
        name="bajar",
        aliases=["baja", "abajo"],
        requires_pose=True,
        description="Bajar el robot",
    ),
    # Needs a resolved target object to move toward.
    "ir": ActionInfo(
        name="ir",
        aliases=["ve", "mover", "muevete", "acercar"],
        requires_object=True,
        description="Ir hacia un objeto",
    ),
    # Tool actions only queue a grab/release step (see LittlehandBehavior),
    # so they require neither a pose nor a target object.
    "tomar": ActionInfo(
        name="tomar",
        aliases=["toma", "agarra", "agarrar", "coger", "chupar", "succionar"],
        requires_pose=False,
        requires_object=False,
        description="Tomar un objeto",
    ),
    "soltar": ActionInfo(
        name="soltar",
        aliases=["deja", "dejar"],
        requires_pose=False,
        requires_object=False,
        description="Soltar el objeto",
    ),
    "reiniciar": ActionInfo(
        name="reiniciar",
        aliases=["reinicia", "reset"],
        requires_pose=False,
        requires_object=False,
        description="Reiniciar a posicion inicial",
    ),
}

View File

@@ -0,0 +1,28 @@
"""Adapter for the Littlehand robot (vacuum gripper)."""
from __future__ import annotations
from typing import Any, Dict, List
from ...core.robot import RobotAdapter
from ...core.state import RobotStep
class VacuumGripperAdapter(RobotAdapter):
    """Adapter for robots with vacuum gripper (suction).

    Translates the generic grab/release/move/reset verbs into the concrete
    RobotStep actions understood by the vacuum-equipped arm.
    """

    def grab(self) -> List[RobotStep]:
        """Grab means switching the suction on for a vacuum tool."""
        return [RobotStep(action="vacuum_on", payload={})]

    def release(self) -> List[RobotStep]:
        """Release means cutting the suction."""
        return [RobotStep(action="vacuum_off", payload={})]

    def move(self, payload: Dict[str, Any]) -> List[RobotStep]:
        """Forward the pose payload as a single move_to_pose step."""
        return [RobotStep(action="move_to_pose", payload=payload)]

    def reset_tool(self) -> List[RobotStep]:
        """Ensure the vacuum is off (safe tool state before homing)."""
        return [RobotStep(action="vacuum_off", payload={})]
NAME = "littlehand"
ALIASES = {"vacuum", "littlehand"}

View File

@@ -0,0 +1,57 @@
"""Behavior for the Littlehand robot."""
from __future__ import annotations
from typing import Callable
from ...core.behavior import ActionContext, RobotBehavior
from .actions import LITTLEHAND_ACTIONS
class LittlehandBehavior(RobotBehavior):
    """Littlehand behavior using the default pick-and-place actions."""

    ACTIONS = LITTLEHAND_ACTIONS

    # Canonical action names; each maps to a method named action_<name>.
    _ACTION_NAMES = ("subir", "bajar", "ir", "tomar", "soltar", "reiniciar")

    def action_handlers(self) -> dict[str, Callable[[ActionContext], bool]]:
        """Map every supported action name to its bound handler method."""
        return {name: getattr(self, f"action_{name}") for name in self._ACTION_NAMES}

    def action_subir(self, ctx: ActionContext) -> bool:
        """Jog the arm up by the configured step."""
        x, y, z = ctx.pose[0], ctx.pose[1], ctx.pose[2]
        return self._queue_move(ctx, x, y, z + self.config.step_mm)

    def action_bajar(self, ctx: ActionContext) -> bool:
        """Jog the arm down by the configured step."""
        x, y, z = ctx.pose[0], ctx.pose[1], ctx.pose[2]
        return self._queue_move(ctx, x, y, z - self.config.step_mm)

    def action_ir(self, ctx: ActionContext) -> bool:
        """Queue the approach-then-reach sequence toward the target object."""
        self._queue_approach_sequence(ctx, ctx.target.position_mm)
        return True

    def action_tomar(self, ctx: ActionContext) -> bool:
        """Engage the tool (low-level grab) at the current location."""
        self._queue_steps(ctx, self.robot_adapter.grab())
        return True

    def action_soltar(self, ctx: ActionContext) -> bool:
        """Disengage the tool (low-level release)."""
        self._queue_steps(ctx, self.robot_adapter.release())
        return True

    def action_reiniciar(self, ctx: ActionContext) -> bool:
        """Reset: release the tool, return home, and clear detected objects."""
        self._queue_steps(ctx, self.robot_adapter.reset_tool())
        self._queue_steps(ctx, self.robot_adapter.move(ctx.home_pose))
        ctx.scene.clear_detected()
        return True

View File

@@ -0,0 +1,8 @@
"""Web interface module."""
from .api import create_api, start_api_server
__all__ = [
"create_api",
"start_api_server",
]

View File

@@ -2,31 +2,29 @@
from __future__ import annotations from __future__ import annotations
import os
import sys
import threading import threading
from typing import Any from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING
import uvicorn import uvicorn
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, Response from fastapi.responses import HTMLResponse, Response
# Handle both package and direct script execution from .models import CommandRequest, CommandResponse
# __package__ is None when run as script, '' when imported from a script from .templates import HTML_TEMPLATE
if not __package__: from ..core.state import SharedState
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
if _pkg_dir not in sys.path: if TYPE_CHECKING:
sys.path.insert(0, _pkg_dir) from ..core.scene import SceneState
from models import CommandRequest, CommandResponse
from state import SharedState # Type alias for vocabulary getter
from templates import HTML_TEMPLATE VocabularyGetter = Callable[[], Dict[str, List[str]]]
else:
from .models import CommandRequest, CommandResponse
from .state import SharedState
from .templates import HTML_TEMPLATE
def create_api(state: SharedState) -> FastAPI: def create_api(
state: SharedState,
scene: "SceneState",
vocabulary_getter: Optional[VocabularyGetter] = None,
) -> FastAPI:
"""Create FastAPI application with voice control endpoints.""" """Create FastAPI application with voice control endpoints."""
app = FastAPI( app = FastAPI(
title="Voice Control Debug API", title="Voice Control Debug API",
@@ -49,9 +47,14 @@ def create_api(state: SharedState) -> FastAPI:
@app.get("/api/objects") @app.get("/api/objects")
def get_objects() -> dict: def get_objects() -> dict:
"""Get detected and static objects.""" """Get all scene objects."""
try: try:
return state.get_objects() detected = [obj.to_dict() for obj in scene.query(source="detected")]
config_objs = [obj.to_dict() for obj in scene.query(source="config")]
return {
"detected": detected,
"static": config_objs,
}
except Exception as e: except Exception as e:
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@@ -67,8 +70,7 @@ def create_api(state: SharedState) -> FastAPI:
def clear_queue() -> dict: def clear_queue() -> dict:
"""Clear the command queue.""" """Clear the command queue."""
try: try:
with state._lock: state.clear_queue()
state.voice_state.queue.clear()
return {"ok": True} return {"ok": True}
except Exception as e: except Exception as e:
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@@ -137,6 +139,35 @@ def create_api(state: SharedState) -> FastAPI:
except Exception as e: except Exception as e:
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/debug/state")
def get_debug_state() -> dict:
"""Get full system state for debugging."""
try:
detected = [obj.to_dict() for obj in scene.query(source="detected")]
config_objs = [obj.to_dict() for obj in scene.query(source="config")]
return {
"pose": state.get_pose(),
"queue_size": state.queue_size(),
"pending_command": state.get_pending_command(),
"detected_objects": detected,
"static_objects": config_objs,
"scene_object_count": scene.count(),
"last_voice_input": state.get_last_voice_input(),
"last_parse_result": state.get_last_parse_result(),
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/debug/vocabulary")
def get_vocabulary() -> dict:
"""Get all recognized keywords."""
try:
if vocabulary_getter is None:
return {"error": "Vocabulary getter not configured"}
return vocabulary_getter()
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
return app return app
@@ -147,10 +178,23 @@ def run_uvicorn(app: FastAPI, host: str, port: int) -> None:
server.run() server.run()
def start_api_server(state: SharedState, config: Any) -> threading.Thread: def start_api_server(
"""Start the API server in a background thread.""" state: SharedState,
scene: "SceneState",
config: Any,
vocabulary_getter: Optional[VocabularyGetter] = None,
) -> threading.Thread:
"""Start the API server in a background thread.
Args:
state: Shared state container
scene: Scene state with object tracking
config: API configuration with host and port
vocabulary_getter: Optional function to get action keywords for debug endpoint
"""
import time as _time import time as _time
app = create_api(state)
app = create_api(state, scene, vocabulary_getter)
api_thread = threading.Thread( api_thread = threading.Thread(
target=run_uvicorn, target=run_uvicorn,
args=(app, config.host, config.port), args=(app, config.host, config.port),

View File

@@ -19,7 +19,7 @@ dependencies = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
llm = ["google-genai"] llm = ["dspy >= 2.5", "google-genai", "litellm"]
[project.scripts] [project.scripts]
dora-voice-control = "dora_voice_control.main:main" dora-voice-control = "dora_voice_control.main:main"

View File

@@ -0,0 +1 @@
"""Tests for dora_voice_control."""

View File

@@ -0,0 +1,624 @@
"""Tests for scene state with spatial relationships."""
import sys
from pathlib import Path
import pytest
# Add parent directory to path for direct import from core.scene
sys.path.insert(0, str(Path(__file__).parent.parent / "dora_voice_control" / "core"))
sys.path.insert(0, str(Path(__file__).parent.parent))
from scene import SceneState, SceneObject
class TestSceneObjectBasics:
    """Unit tests for the SceneObject dataclass."""

    def test_create_scene_object(self):
        """A freshly built SceneObject keeps its fields and defaults its source."""
        cube = SceneObject(
            id="cube_red_1",
            object_type="cube",
            color="red",
            position_mm=[100.0, 200.0, 50.0],
            height_mm=30.0,
        )
        assert cube.id == "cube_red_1"
        assert cube.object_type == "cube"
        assert cube.color == "red"
        assert cube.position_mm == [100.0, 200.0, 50.0]
        assert cube.height_mm == 30.0
        # Objects default to the "detected" source unless told otherwise.
        assert cube.source == "detected"

    def test_center_property(self):
        """`center` is an alias for `position_mm`."""
        item = SceneObject(
            id="obj1",
            object_type="cube",
            color="blue",
            position_mm=[10.0, 20.0, 30.0],
        )
        assert item.center == item.position_mm

    def test_to_dict(self):
        """`to_dict` serializes every field into a plain dict."""
        item = SceneObject(
            id="test_1",
            object_type="cylinder",
            color="yellow",
            size="big",
            position_mm=[1.0, 2.0, 3.0],
            height_mm=40.0,
            source="config",
        )
        dumped = item.to_dict()
        expected = {
            "id": "test_1",
            "object_type": "cylinder",
            "color": "yellow",
            "size": "big",
            "position_mm": [1.0, 2.0, 3.0],
            "height_mm": 40.0,
            "source": "config",
        }
        for key, value in expected.items():
            assert dumped[key] == value
class TestSceneStateBasics:
    """Unit tests for core SceneState CRUD operations."""

    @staticmethod
    def _cube(obj_id: str, color: str = "red", **extra) -> SceneObject:
        """Build a simple cube SceneObject for use as a fixture."""
        return SceneObject(id=obj_id, object_type="cube", color=color, **extra)

    def test_add_and_get(self):
        """Objects can be added and fetched back by id."""
        scene = SceneState()
        assert scene.add(self._cube("cube_red_1", position_mm=[100.0, 200.0, 50.0])) is True
        fetched = scene.get("cube_red_1")
        assert fetched is not None
        assert fetched.id == "cube_red_1"

    def test_add_duplicate_fails(self):
        """A second add with an already-used id is rejected."""
        scene = SceneState()
        assert scene.add(SceneObject(id="obj1", object_type="cube", color="red")) is True
        assert scene.add(SceneObject(id="obj1", object_type="cylinder", color="blue")) is False

    def test_delete(self):
        """Deleting an existing object removes it from the scene."""
        scene = SceneState()
        scene.add(self._cube("to_delete"))
        assert scene.get("to_delete") is not None
        assert scene.delete("to_delete") is True
        assert scene.get("to_delete") is None

    def test_delete_nonexistent(self):
        """Deleting an unknown id reports failure."""
        assert SceneState().delete("nonexistent") is False

    def test_update(self):
        """`update` rewrites the given fields in place."""
        scene = SceneState()
        scene.add(self._cube("to_update", position_mm=[0.0, 0.0, 0.0]))
        assert scene.update("to_update", color="blue", position_mm=[1.0, 2.0, 3.0])
        changed = scene.get("to_update")
        assert changed.color == "blue"
        assert changed.position_mm == [1.0, 2.0, 3.0]

    def test_get_all(self):
        """`get_all` returns every stored object."""
        scene = SceneState()
        scene.add(self._cube("a"))
        scene.add(self._cube("b", color="blue"))
        everything = scene.get_all()
        assert len(everything) == 2
        assert {item.id for item in everything} == {"a", "b"}

    def test_count(self):
        """`count` tracks the number of stored objects."""
        scene = SceneState()
        assert scene.count() == 0
        for expected, obj_id, color in ((1, "a", "red"), (2, "b", "blue")):
            scene.add(self._cube(obj_id, color=color))
            assert scene.count() == expected

    def test_clear(self):
        """`clear` empties the scene entirely."""
        scene = SceneState()
        scene.add(self._cube("a"))
        scene.add(self._cube("b", color="blue"))
        scene.clear()
        assert scene.count() == 0
class TestSceneStateQuery:
    """Unit tests for SceneState.query filtering."""

    @staticmethod
    def _populate(scene: SceneState, specs) -> None:
        """Add one SceneObject per keyword-spec dict to *scene*."""
        for spec in specs:
            scene.add(SceneObject(**spec))

    def test_query_by_type(self):
        """Filtering by object_type returns only matching objects."""
        scene = SceneState()
        self._populate(scene, [
            {"id": "cube1", "object_type": "cube", "color": "red"},
            {"id": "cyl1", "object_type": "cylinder", "color": "red"},
            {"id": "cube2", "object_type": "cube", "color": "blue"},
        ])
        cubes = scene.query(object_type="cube")
        assert len(cubes) == 2
        assert all(item.object_type == "cube" for item in cubes)

    def test_query_by_color(self):
        """Filtering by color returns only matching objects."""
        scene = SceneState()
        self._populate(scene, [
            {"id": "r1", "object_type": "cube", "color": "red"},
            {"id": "r2", "object_type": "cylinder", "color": "red"},
            {"id": "b1", "object_type": "cube", "color": "blue"},
        ])
        reds = scene.query(color="red")
        assert len(reds) == 2
        assert all(item.color == "red" for item in reds)

    def test_query_by_size(self):
        """Filtering by size returns only matching objects."""
        scene = SceneState()
        self._populate(scene, [
            {"id": "big1", "object_type": "cube", "color": "red", "size": "big"},
            {"id": "small1", "object_type": "cube", "color": "blue", "size": "small"},
        ])
        big = scene.query(size="big")
        assert len(big) == 1
        assert big[0].id == "big1"

    def test_query_by_source(self):
        """Filtering by source separates detected from config objects."""
        scene = SceneState()
        self._populate(scene, [
            {"id": "d1", "object_type": "cube", "color": "red", "source": "detected"},
            {"id": "c1", "object_type": "box", "color": "blue", "source": "config"},
        ])
        detected = scene.query(source="detected")
        assert len(detected) == 1
        assert detected[0].id == "d1"

    def test_query_multiple_criteria(self):
        """All criteria must match simultaneously."""
        scene = SceneState()
        self._populate(scene, [
            {"id": "a", "object_type": "cube", "color": "red", "size": "big"},
            {"id": "b", "object_type": "cube", "color": "red", "size": "small"},
            {"id": "c", "object_type": "cube", "color": "blue", "size": "big"},
        ])
        hits = scene.query(object_type="cube", color="red", size="big")
        assert len(hits) == 1
        assert hits[0].id == "a"

    def test_query_no_matches(self):
        """A query matching nothing yields an empty list."""
        scene = SceneState()
        scene.add(SceneObject(id="a", object_type="cube", color="red"))
        assert scene.query(color="purple") == []
class TestSceneStateRelationships:
    """Tests for the spatial (stacking) relationship API of SceneState."""

    def test_set_on_top_of(self):
        """Explicitly recording that one object sits on another succeeds."""
        state = SceneState()
        state.add(SceneObject(id="bottom", object_type="cube", color="red"))
        state.add(SceneObject(id="top", object_type="cube", color="blue"))
        assert state.set_on_top_of("top", "bottom") is True
        assert state.get("top").on_top_of == "bottom"

    def test_set_on_top_of_invalid(self):
        """Unknown IDs on either side of the relationship are rejected."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        # Top object does not exist.
        assert state.set_on_top_of("nonexistent", "a") is False
        # Bottom object does not exist.
        assert state.set_on_top_of("a", "nonexistent") is False

    def test_get_object_below(self):
        """The object directly underneath can be retrieved."""
        state = SceneState()
        state.add(SceneObject(id="bottom", object_type="cube", color="red"))
        state.add(SceneObject(id="top", object_type="cube", color="blue", on_top_of="bottom"))
        underneath = state.get_object_below("top")
        assert underneath is not None
        assert underneath.id == "bottom"

    def test_get_objects_above(self):
        """All objects resting on a base are returned."""
        state = SceneState()
        state.add(SceneObject(id="base", object_type="cube", color="red"))
        state.add(SceneObject(id="middle", object_type="cube", color="blue", on_top_of="base"))
        state.add(SceneObject(id="side", object_type="cube", color="yellow", on_top_of="base"))
        stacked = state.get_objects_above("base")
        assert len(stacked) == 2
        assert {obj.id for obj in stacked} == {"middle", "side"}

    def test_effective_height_no_stack(self):
        """An unstacked object's effective height is its own height."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red", height_mm=30.0))
        assert state.effective_height("a") == 30.0

    def test_effective_height_stacked(self):
        """A stacked object's effective height accumulates heights below it."""
        state = SceneState()
        state.add(SceneObject(id="bottom", object_type="cube", color="red", height_mm=30.0))
        state.add(SceneObject(id="top", object_type="cube", color="blue", height_mm=25.0, on_top_of="bottom"))
        assert state.effective_height("bottom") == 30.0
        # Top sits at 30 (bottom) + 25 (own height).
        assert state.effective_height("top") == 55.0

    def test_get_stack(self):
        """Querying any member of a stack returns the whole bottom-to-top chain."""
        state = SceneState()
        state.add(SceneObject(id="bottom", object_type="cube", color="red"))
        state.add(SceneObject(id="middle", object_type="cube", color="blue", on_top_of="bottom"))
        state.add(SceneObject(id="top", object_type="cube", color="yellow", on_top_of="middle"))
        chain = state.get_stack("middle")
        assert [obj.id for obj in chain] == ["bottom", "middle", "top"]

    def test_delete_clears_relationships(self):
        """Deleting an object resets relationships that pointed at it."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        state.add(SceneObject(id="b", object_type="cube", color="blue", on_top_of="a"))
        assert state.get("b").on_top_of == "a"
        state.delete("a")
        # The dangling reference must have been cleared.
        assert state.get("b").on_top_of is None
class TestSceneStateStackingInference:
    """Tests for geometric inference of stacking from object positions."""

    def test_infer_stacking_simple(self):
        """Two vertically aligned cubes are recognized as a stack."""
        state = SceneState()
        # Lower cube centered at z=30.
        state.add(SceneObject(
            id="bottom",
            object_type="cube",
            color="red",
            position_mm=[0.0, 0.0, 30.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))
        # Upper cube at z=60 (= 30 + 30), i.e. resting on the lower one.
        state.add(SceneObject(
            id="top",
            object_type="cube",
            color="blue",
            position_mm=[0.0, 0.0, 60.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))
        state.infer_relationships()
        assert state.get("top").on_top_of == "bottom"

    def test_infer_stacking_no_overlap(self):
        """Objects without horizontal overlap are never linked as a stack."""
        state = SceneState()
        state.add(SceneObject(
            id="a",
            object_type="cube",
            color="red",
            position_mm=[0.0, 0.0, 30.0],
            height_mm=30.0,
            bbox_mm=[-15.0, -15.0, 15.0, 15.0],
        ))
        # 100 mm away in x: the z values alone would suggest a stack.
        state.add(SceneObject(
            id="b",
            object_type="cube",
            color="blue",
            position_mm=[100.0, 0.0, 60.0],
            height_mm=30.0,
            bbox_mm=[85.0, -15.0, 115.0, 15.0],
        ))
        state.infer_relationships()
        # No horizontal overlap means no relationship.
        assert state.get("b").on_top_of is None

    def test_infer_stacking_via_replace_detected(self):
        """replace_detected() triggers stacking inference automatically."""
        state = SceneState()
        state.replace_detected([
            {
                "object_type": "cube",
                "color": "red",
                "position_mm": [0.0, 0.0, 30.0],
                "height_mm": 30.0,
                "bbox_mm": [-15.0, -15.0, 15.0, 15.0],
            },
            {
                "object_type": "cube",
                "color": "blue",
                # z = 30 + 30, so this cube rests on the red one.
                "position_mm": [0.0, 0.0, 60.0],
                "height_mm": 30.0,
                "bbox_mm": [-15.0, -15.0, 15.0, 15.0],
            },
        ])
        blue_objects = state.query(color="blue")
        assert len(blue_objects) == 1
        assert blue_objects[0].on_top_of is not None
class TestSceneStateDetectorIntegration:
    """Tests for ingesting detector output into the scene."""

    def test_replace_detected_basic(self):
        """Detected payloads become scene objects tagged source='detected'."""
        state = SceneState()
        state.replace_detected([
            {"object_type": "cube", "color": "red", "position_mm": [1.0, 2.0, 3.0]},
            {"object_type": "cylinder", "color": "blue"},
        ])
        assert state.count() == 2
        assert all(obj.source == "detected" for obj in state.get_all())

    def test_replace_detected_keeps_config(self):
        """Objects loaded from config survive a detector refresh."""
        state = SceneState()
        state.add(SceneObject(
            id="config_box_1",
            object_type="box",
            color="yellow",
            source="config",
        ))
        state.replace_detected([
            {"object_type": "cube", "color": "red"},
        ])
        # The config object is untouched...
        assert state.get("config_box_1") is not None
        assert len(state.query(source="config")) == 1
        # ...and the detection was added alongside it.
        assert len(state.query(source="detected")) == 1

    def test_replace_detected_replaces_old_detected(self):
        """A new detection batch discards the previous one."""
        state = SceneState()
        state.replace_detected([
            {"object_type": "cube", "color": "red"},
            {"object_type": "cube", "color": "blue"},
        ])
        assert len(state.query(source="detected")) == 2
        state.replace_detected([
            {"object_type": "cylinder", "color": "green"},
        ])
        remaining = state.query(source="detected")
        assert len(remaining) == 1
        assert remaining[0].object_type == "cylinder"

    def test_replace_detected_generates_ids(self):
        """Identical detections still receive distinct IDs."""
        state = SceneState()
        state.replace_detected([
            {"object_type": "cube", "color": "red"},
            {"object_type": "cube", "color": "red"},
        ])
        generated = [obj.id for obj in state.get_all()]
        # No duplicates allowed among generated IDs.
        assert len(set(generated)) == len(generated)

    def test_clear_detected(self):
        """clear_detected() removes detections but preserves config objects."""
        state = SceneState()
        state.add(SceneObject(id="config1", object_type="box", color="green", source="config"))
        state.replace_detected([
            {"object_type": "cube", "color": "red"},
        ])
        assert state.count() == 2
        state.clear_detected()
        assert state.count() == 1
        assert state.get("config1") is not None

    def test_to_list(self):
        """to_list() serializes every object into a plain dict."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        state.add(SceneObject(id="b", object_type="cube", color="blue"))
        serialized = state.to_list()
        assert len(serialized) == 2
        assert all(isinstance(entry, dict) for entry in serialized)
        assert {entry["id"] for entry in serialized} == {"a", "b"}
class TestSceneStateThreadSafety:
    """Tests that SceneState tolerates concurrent access."""

    def test_concurrent_add_and_query(self):
        """Simultaneous writers and a reader must complete without raising."""
        import threading

        state = SceneState()
        failures = []

        def writer():
            try:
                name = threading.current_thread().name
                for idx in range(100):
                    state.add(SceneObject(
                        id=f"obj_{name}_{idx}",
                        object_type="cube",
                        color="red",
                    ))
            except Exception as exc:
                failures.append(exc)

        def reader():
            try:
                for _ in range(100):
                    state.query(object_type="cube")
            except Exception as exc:
                failures.append(exc)

        workers = [
            threading.Thread(target=writer, name="adder1"),
            threading.Thread(target=writer, name="adder2"),
            threading.Thread(target=reader, name="querier"),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        assert not failures, f"Thread errors: {failures}"
class TestFindByCriteria:
    """Tests for find_by_criteria, used to match voice commands to objects."""

    def test_find_by_type_and_color(self):
        """Type plus color selects the matching object."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        state.add(SceneObject(id="b", object_type="cube", color="blue"))
        found = state.find_by_criteria("cube", "red", "no especificado")
        assert found is not None
        assert found.id == "a"

    def test_find_with_class_map(self):
        """Spanish terms are translated through the class map before matching."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        translations = {"cubo": "cube", "rojo": "red"}
        found = state.find_by_criteria("cubo", "rojo", "no especificado", translations)
        assert found is not None
        assert found.id == "a"

    def test_find_unspecified_returns_none(self):
        """An unspecified ('no especificado') object type cannot match anything."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        assert state.find_by_criteria("no especificado", "red", "no especificado") is None

    def test_find_no_match(self):
        """Criteria matching no object yield None."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red"))
        assert state.find_by_criteria("cylinder", "blue", "no especificado") is None

    def test_find_with_size(self):
        """Size disambiguates between otherwise identical objects."""
        state = SceneState()
        state.add(SceneObject(id="a", object_type="cube", color="red", size="big"))
        state.add(SceneObject(id="b", object_type="cube", color="red", size="small"))
        found = state.find_by_criteria("cube", "red", "small")
        assert found is not None
        assert found.id == "b"
class TestLoadFromConfig:
    """Tests for loading static objects from a TOML config file."""

    def test_load_nonexistent_file(self):
        """A missing config file loads zero objects without raising."""
        state = SceneState()
        assert state.load_from_config("/nonexistent/path.toml") == 0
        assert state.count() == 0

    def test_load_from_config_with_objects(self, tmp_path):
        """Static objects in the config become scene objects with source='config'."""
        config_path = tmp_path / "config.toml"
        config_path.write_text("""
[object_parameters]
normal_height = 100.0
[[static_objects]]
type = "box"
color = "blue"
position = [150.0, -200.0]
size = "big"
[[static_objects]]
type = "box"
color = "red"
position = [250.0, -200.0]
""")
        state = SceneState()
        assert state.load_from_config(str(config_path)) == 2
        assert state.count() == 2
        blue_boxes = state.query(color="blue")
        assert len(blue_boxes) == 1
        assert blue_boxes[0].object_type == "box"
        # z coordinate comes from [object_parameters].normal_height.
        assert blue_boxes[0].position_mm[2] == 100.0
        assert blue_boxes[0].source == "config"

    def test_load_empty_config(self, tmp_path):
        """An empty config file loads zero objects."""
        config_path = tmp_path / "empty.toml"
        config_path.write_text("")
        state = SceneState()
        assert state.load_from_config(str(config_path)) == 0

View File

@@ -22,6 +22,17 @@ except ModuleNotFoundError: # pragma: no cover
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt") DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
# Detection visualization constants
POINT_SEARCH_RADIUS = 3
DEFAULT_FONT_SCALE = 0.5
DETECTION_BOX_THICKNESS = 2
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[detector {timestamp}] {msg}", flush=True)
@dataclass @dataclass
class DetectionConfig: class DetectionConfig:
@@ -46,8 +57,8 @@ def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
parts = [p.strip() for p in raw.split(",")] parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2: if len(parts) >= 2:
return int(parts[0]), int(parts[1]) return int(parts[0]), int(parts[1])
except Exception: except Exception as e:
pass _log(f"Warning: failed to parse int pair '{raw}': {e}")
return default return default
@@ -56,8 +67,8 @@ def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, fl
parts = [p.strip() for p in raw.split(",")] parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2: if len(parts) >= 2:
return float(parts[0]), float(parts[1]) return float(parts[0]), float(parts[1])
except Exception: except Exception as e:
pass _log(f"Warning: failed to parse float pair '{raw}': {e}")
return default return default
@@ -66,8 +77,8 @@ def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int
parts = [p.strip() for p in raw.split(",")] parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 3: if len(parts) >= 3:
return int(parts[0]), int(parts[1]), int(parts[2]) return int(parts[0]), int(parts[1]), int(parts[2])
except Exception: except Exception as e:
pass _log(f"Warning: failed to parse color '{raw}': {e}")
return default return default
@@ -159,10 +170,9 @@ def _sample_point(
point_xyz = point_cloud[y, x, :3].astype(np.float64) point_xyz = point_cloud[y, x, :3].astype(np.float64)
if _valid_point(point_xyz, cfg): if _valid_point(point_xyz, cfg):
return point_xyz return point_xyz
radius = 3
samples = [] samples = []
for dy in range(-radius, radius + 1): for dy in range(-POINT_SEARCH_RADIUS, POINT_SEARCH_RADIUS + 1):
for dx in range(-radius, radius + 1): for dx in range(-POINT_SEARCH_RADIUS, POINT_SEARCH_RADIUS + 1):
xx = x + dx xx = x + dx
yy = y + dy yy = y + dy
if xx < 0 or yy < 0 or xx >= w or yy >= h: if xx < 0 or yy < 0 or xx >= w or yy >= h:
@@ -218,7 +228,8 @@ def _load_config_file(path: str) -> Dict[str, Any]:
try: try:
with open(path, "rb") as handle: with open(path, "rb") as handle:
return tomllib.load(handle) return tomllib.load(handle)
except Exception: except Exception as e:
_log(f"Warning: failed to load config file '{path}': {e}")
return {} return {}
@@ -351,11 +362,11 @@ def _draw_detections(
color = color_map.get(color_name, (128, 128, 128)) color = color_map.get(color_name, (128, 128, 128))
# Draw bounding box # Draw bounding box
cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2) cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, DETECTION_BOX_THICKNESS)
# Draw label background # Draw label background
label = f"{obj_type} {color_name} {size}" label = f"{obj_type} {color_name} {size}"
(tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, DEFAULT_FONT_SCALE, 1)
cv2.rectangle( cv2.rectangle(
annotated, annotated,
(bbox[0], bbox[1] - th - 8), (bbox[0], bbox[1] - th - 8),
@@ -368,7 +379,7 @@ def _draw_detections(
label, label,
(bbox[0] + 2, bbox[1] - 4), (bbox[0] + 2, bbox[1] - 4),
cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_SIMPLEX,
0.5, DEFAULT_FONT_SCALE,
(0, 0, 0), (0, 0, 0),
1, 1,
) )

View File

@@ -1,5 +1,5 @@
[project] [project]
name = "dora-detector" name = "dora-yolo-object-detector"
version = "0.1.0" version = "0.1.0"
license = { file = "MIT" } license = { file = "MIT" }
authors = [{ name = "Dora" }] authors = [{ name = "Dora" }]
@@ -16,4 +16,4 @@ dependencies = [
] ]
[project.scripts] [project.scripts]
dora-detector = "dora_detector.main:main" dora-yolo-object-detector = "dora_yolo_object_detector.main:main"