Add voice control, working but needs more work

This commit is contained in:
cristhian aguilera
2026-01-31 11:41:50 -03:00
parent 380c466170
commit b9798a2f46
21 changed files with 3101 additions and 0 deletions

22
config.toml Normal file
View File

@@ -0,0 +1,22 @@
[roi]
roi_top_left = [500, 230]
roi_bottom_right = [775, 510]
[bucket_positions]
blue_bucket_pos = [400, 90]
red_bucket_pos = [550, 90]
yellow_bucket_pos = [550, 630]
white_bucket_pos = [400, 630]
[dominant_colors]
blue = [255, 0, 0]
red = [0, 0, 255]
yellow = [0, 255, 255]
white = [255, 255, 255]
[object_parameters]
size_threshold = 4200
big_height = 125.9
small_height = 106.0
bottom_height = 68.0
normal_height = 220.0

View File

@@ -0,0 +1,133 @@
nodes:
- id: zed_camera_cpp
build: bash -lc "cmake -S dora_zed_cpp -B dora_zed_cpp/build && cmake --build dora_zed_cpp/build"
path: dora_zed_cpp/build/dora_zed_cpp
env:
ZED_RESOLUTION: "720"
ZED_FPS: "15"
ZED_DEPTH_MODE: "NEURAL"
ZED_DEPTH_MIN_MM: "10"
ZED_DEPTH_MAX_MM: "600"
ZED_DEPTH_FILL: "false"
ZED_FLIP: "ON"
ZED_WARMUP_FRAMES: "30"
inputs:
tick: dora/timer/millis/100
outputs:
- image_bgr
- camera_info
- point_cloud
- id: ulite6
build: uv pip install -e dora_ulite6
path: dora_ulite6/dora_ulite6/main.py
inputs:
tick: dora/timer/millis/10
command: voice/robot_cmd
outputs:
- tcp_pose
- status
env:
ROBOT_IP: "192.168.1.192"
DEFAULT_SPEED: "30"
DEFAULT_UNITS: "mm"
API_HOST: "0.0.0.0"
API_PORT: "9000"
VACUUM_ENABLED: "true"
# Initial position on startup: "home", "pose", or "none"
# Set to "none" - voice control handles initial positioning
INIT_MODE: "none"
- id: iobridge
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_iobridge
path: dora_iobridge/dora_iobridge/main.py
env:
VIRTUAL_ENV: ./.venv
VOICE_HOST: "0.0.0.0"
VOICE_PORT: "8765"
VOICE_IN_OUTPUT: "voice_in"
VOICE_OUT_INPUT: "voice_out"
SCENE_INPUT: "scene_update"
inputs:
voice_out: voice/voice_out
scene_update: voice/scene_update
tick: dora/timer/millis/100
outputs:
- voice_in
- id: detector
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_detector
path: dora_detector/dora_detector/main.py
env:
VIRTUAL_ENV: ./.venv
IMAGE_INPUT: "image_bgr"
POINT_CLOUD_INPUT: "point_cloud"
POSE_INPUT: "tcp_pose"
OBJECTS_OUTPUT: "objects"
IMAGE_OUTPUT: "image_annotated"
CALIBRATION_FILE: "calibration_ulite6_zed.npz"
DETECTOR_WEIGHTS: "trained_models/yolo8n.pt"
CONFIG_FILE: "config.toml"
ROI_TOP_LEFT: "500,230"
ROI_BOTTOM_RIGHT: "775,510"
SIZE_THRESHOLD: "4200"
DETECT_EVERY_N: "3"
MIN_DEPTH_MM: "10"
MAX_DEPTH_MM: "600"
inputs:
image_bgr: zed_camera_cpp/image_bgr
point_cloud: zed_camera_cpp/point_cloud
tcp_pose: ulite6/tcp_pose
tick: dora/timer/millis/100
outputs:
- objects
- image_annotated
- id: voice
build: |
uv venv -p 3.12 --seed --allow-existing
uv pip install -e dora_voice_control
path: dora_voice_control/dora_voice_control/main.py
env:
VIRTUAL_ENV: ./.venv
OBJECTS_INPUT: "objects"
POSE_INPUT: "tcp_pose"
STATUS_INPUT: "status"
COMMAND_OUTPUT: "robot_cmd"
CONFIG_FILE: "config.toml"
# Map Spanish command names to detector class names
CLASS_MAP: '{"cilindro": "cylinder", "cubo": "cube", "estrella": "star", "caja": "box", "amarillo": "yellow", "rojo": "red", "azul": "blue", "blanco": "white", "grande": "big", "pequeno": "small"}'
VOICE_IN_INPUT: "voice_in"
VOICE_OUT_OUTPUT: "voice_out"
SCENE_OUTPUT: "scene_update"
TCP_OFFSET_MM: "63.0"
APPROACH_OFFSET_MM: "50.0"
STEP_MM: "20.0"
DEFAULT_ROLL: "180.0"
DEFAULT_PITCH: "0.0"
DEFAULT_YAW: "0.0"
DRY_RUN: "false"
# Initial position (used on startup and reset command)
INIT_ON_START: "true"
INIT_X: "300.0"
INIT_Y: "0.0"
INIT_Z: "350.0"
INIT_ROLL: "180.0"
INIT_PITCH: "0.0"
INIT_YAW: "0.0"
IMAGE_INPUT: "image_annotated"
IMAGE_WIDTH: "1280"
IMAGE_HEIGHT: "720"
API_ENABLED: "true"
API_PORT: "8080"
inputs:
objects: detector/objects
tcp_pose: ulite6/tcp_pose
status: ulite6/status
voice_in: iobridge/voice_in
image_annotated: detector/image_annotated
tick: dora/timer/millis/100
outputs:
- robot_cmd
- voice_out
- scene_update

View File

@@ -0,0 +1 @@
"""Dora detector node package."""

View File

@@ -0,0 +1,513 @@
"""Dora node for YOLO detection and base-frame object localization."""
from __future__ import annotations
import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
from ultralytics import YOLO
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
@dataclass
class DetectionConfig:
    """Tunable parameters for YOLO detection, ROI filtering and color naming.

    Built once at startup by ``_build_config`` from ``config.toml`` values,
    each overridable via environment variables.
    """

    imgsz: int  # YOLO inference image size (pixels)
    conf: float  # YOLO confidence threshold
    iou: float  # YOLO NMS IoU threshold
    size_threshold: int  # bbox pixel area separating "big" from "small"
    roi_top_left: Tuple[int, int]  # ROI upper-left corner (x, y) in pixels
    roi_bottom_right: Tuple[int, int]  # ROI lower-right corner (x, y) in pixels
    use_roi: bool  # when True, detections outside the ROI are discarded
    detect_every_n: int  # run detection only on every N-th image frame
    min_depth_mm: float  # minimum accepted depth for point-cloud samples
    max_depth_mm: float  # maximum accepted depth for point-cloud samples
    color_blue: Tuple[int, int, int]  # reference BGR value for "blue"
    color_red: Tuple[int, int, int]  # reference BGR value for "red"
    color_yellow: Tuple[int, int, int]  # reference BGR value for "yellow"
    color_white: Tuple[int, int, int]  # reference BGR value for "white"
def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return int(parts[0]), int(parts[1])
except Exception:
pass
return default
def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return float(parts[0]), float(parts[1])
except Exception:
pass
return default
def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 3:
return int(parts[0]), int(parts[1]), int(parts[2])
except Exception:
pass
return default
def _rotation_matrix_xyz(
roll_deg: float, pitch_deg: float, yaw_deg: float
) -> np.ndarray:
roll = np.deg2rad(roll_deg)
pitch = np.deg2rad(pitch_deg)
yaw = np.deg2rad(yaw_deg)
cx, sx = np.cos(roll), np.sin(roll)
cy, sy = np.cos(pitch), np.sin(pitch)
cz, sz = np.cos(yaw), np.sin(yaw)
rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]])
rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]])
rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]])
return rot_z @ rot_y @ rot_x
def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray:
    """Build a 4x4 homogeneous transform from ``[x, y, z, roll, pitch, yaw]``.

    Translation is given in millimetres (converted to metres here); angles
    are in degrees and delegated to ``_rotation_matrix_xyz``.
    """
    x_mm, y_mm, z_mm, roll, pitch, yaw = tcp_pose_mm_deg
    transform = np.eye(4)
    transform[:3, :3] = _rotation_matrix_xyz(roll, pitch, yaw)
    transform[:3, 3] = np.array([x_mm, y_mm, z_mm], dtype=np.float64) / 1000.0
    return transform
def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
encoding = str(metadata.get("encoding", "bgr8")).lower()
width = metadata.get("width")
height = metadata.get("height")
if (width is None or height is None) and "shape" in metadata:
shape = metadata.get("shape")
if isinstance(shape, (list, tuple)) and len(shape) >= 2:
height = height if height is not None else int(shape[0])
width = width if width is not None else int(shape[1])
if width is None or height is None:
raise KeyError("width/height (or shape) missing from metadata")
if encoding == "bgr8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return frame.copy()
if encoding == "rgb8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}:
frame = storage.to_numpy().astype(np.uint8)
return cv2.imdecode(frame, cv2.IMREAD_COLOR)
if encoding == "yuv420":
yuv = storage.to_numpy().astype(np.uint8)
yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width))
return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR)
raise RuntimeError(f"Unsupported image encoding: {encoding}")
def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
dtype_str = str(metadata.get("dtype", "float32"))
if dtype_str != "float32":
raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}")
shape = metadata.get("shape")
if not isinstance(shape, (list, tuple)) or len(shape) < 3:
raise KeyError("point cloud shape missing from metadata")
height, width, channels = [int(v) for v in shape[:3]]
if channels < 3:
raise ValueError("point cloud requires at least 3 channels")
raw = storage.to_numpy().astype(np.uint8).tobytes()
values = np.frombuffer(raw, dtype=np.float32)
return values.reshape((height, width, channels))
def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool:
if not np.all(np.isfinite(point_xyz)):
return False
z = float(point_xyz[2])
if z <= 0:
return False
if z < cfg.min_depth_mm or z > cfg.max_depth_mm:
return False
return True
def _sample_point(
    point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig
) -> Optional[np.ndarray]:
    """Read the 3D point at pixel (x, y), with a median-of-neighbors fallback.

    Returns the point directly when it passes ``_valid_point``; otherwise
    the component-wise median of every valid point in the surrounding 7x7
    window, or ``None`` when the pixel is out of bounds or no valid
    neighbor exists.
    """
    rows, cols = point_cloud.shape[:2]
    if not (0 <= x < cols and 0 <= y < rows):
        return None
    center = point_cloud[y, x, :3].astype(np.float64)
    if _valid_point(center, cfg):
        return center
    radius = 3
    neighbors = []
    for yy in range(max(0, y - radius), min(rows, y + radius + 1)):
        for xx in range(max(0, x - radius), min(cols, x + radius + 1)):
            candidate = point_cloud[yy, xx, :3].astype(np.float64)
            if _valid_point(candidate, cfg):
                neighbors.append(candidate)
    if not neighbors:
        return None
    return np.median(np.stack(neighbors, axis=0), axis=0)
def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]:
x1, y1, x2, y2 = bbox
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(image.shape[1], x2)
y2 = min(image.shape[0], y2)
roi = image[y1:y2, x1:x2]
if roi.size == 0:
return (0, 0, 0)
color = np.median(roi, axis=(0, 1)).astype(int)
return int(color[0]), int(color[1]), int(color[2])
def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str:
colors = {
"blue": np.array(cfg.color_blue, dtype=np.float64),
"red": np.array(cfg.color_red, dtype=np.float64),
"yellow": np.array(cfg.color_yellow, dtype=np.float64),
"white": np.array(cfg.color_white, dtype=np.float64),
}
color_vec = np.array(color, dtype=np.float64)
best_name = "unknown"
best_dist = float("inf")
for name, value in colors.items():
dist = np.linalg.norm(color_vec - value)
if dist < best_dist:
best_name = name
best_dist = dist
return best_name
def _load_calibration(calibration_file: str) -> np.ndarray:
calib = np.load(calibration_file, allow_pickle=True)
t_cam2gripper = calib["T_cam2gripper"]
return t_cam2gripper
def _load_config_file(path: str) -> Dict[str, Any]:
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception:
return {}
def _build_config(config_path: str) -> DetectionConfig:
    """Assemble a ``DetectionConfig`` from the TOML file plus env overrides.

    Precedence per setting: environment variable > config.toml value >
    hard-coded default.
    """
    file_cfg = _load_config_file(config_path)
    roi_cfg = file_cfg.get("roi", {})
    colors_cfg = file_cfg.get("dominant_colors", {})
    obj_cfg = file_cfg.get("object_parameters", {})

    def pair_setting(env_name, key, fallback):
        # Env var wins; otherwise serialize the TOML value as "a,b" and parse.
        default_raw = ",".join(str(v) for v in roi_cfg.get(key, list(fallback)))
        return _parse_int_pair(os.getenv(env_name, default_raw), fallback)

    def color_setting(env_name, key, fallback):
        # Same precedence for the "b,g,r" reference colors.
        default_raw = ",".join(str(v) for v in colors_cfg.get(key, list(fallback)))
        return _parse_color(os.getenv(env_name, default_raw), fallback)

    return DetectionConfig(
        imgsz=int(os.getenv("YOLO_IMGSZ", "640")),
        conf=float(os.getenv("YOLO_CONF", "0.25")),
        iou=float(os.getenv("YOLO_IOU", "0.45")),
        size_threshold=int(
            os.getenv("SIZE_THRESHOLD", str(obj_cfg.get("size_threshold", 4200)))
        ),
        roi_top_left=pair_setting("ROI_TOP_LEFT", "roi_top_left", (500, 230)),
        roi_bottom_right=pair_setting("ROI_BOTTOM_RIGHT", "roi_bottom_right", (775, 510)),
        use_roi=os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes"),
        detect_every_n=int(os.getenv("DETECT_EVERY_N", "3")),
        min_depth_mm=float(os.getenv("MIN_DEPTH_MM", "10")),
        max_depth_mm=float(os.getenv("MAX_DEPTH_MM", "600")),
        color_blue=color_setting("COLOR_BLUE", "blue", (255, 0, 0)),
        color_red=color_setting("COLOR_RED", "red", (0, 0, 255)),
        color_yellow=color_setting("COLOR_YELLOW", "yellow", (0, 255, 255)),
        color_white=color_setting("COLOR_WHITE", "white", (255, 255, 255)),
    )
def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool:
if not cfg.use_roi:
return True
x1, y1, x2, y2 = bbox
rx1, ry1 = cfg.roi_top_left
rx2, ry2 = cfg.roi_bottom_right
return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2
def _draw_detections(
    frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig
) -> np.ndarray:
    """Draw bounding boxes and labels on frame.

    Args:
        frame: BGR image to annotate; not modified (a copy is returned).
        objects: detection dicts as assembled in ``main`` (keys used here:
            bbox, color, object_type, size, position_mm).
        cfg: detection config providing the ROI corners and ``use_roi`` flag.

    Returns:
        A copy of *frame* with the ROI outline, one rectangle per object, a
        "type color size" label, and the base-frame position in mm.
    """
    annotated = frame.copy()
    # Draw ROI rectangle (always visible): green when ROI filtering is
    # active, grey when disabled.
    cv2.rectangle(
        annotated,
        cfg.roi_top_left,
        cfg.roi_bottom_right,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )
    # Label the ROI
    cv2.putText(
        annotated,
        "ROI",
        (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 255, 0) if cfg.use_roi else (128, 128, 128),
        2,
    )
    # Color mapping for visualization (BGR box colors keyed by color name)
    color_map = {
        "blue": (255, 100, 0),
        "red": (0, 0, 255),
        "yellow": (0, 255, 255),
        "white": (200, 200, 200),
        "unknown": (128, 128, 128),
    }
    for obj in objects:
        bbox = obj.get("bbox", [0, 0, 0, 0])
        color_name = obj.get("color", "unknown")
        obj_type = obj.get("object_type", "?")
        size = obj.get("size", "?")
        pos = obj.get("position_mm", [0, 0, 0])
        color = color_map.get(color_name, (128, 128, 128))
        # Draw bounding box
        cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
        # Draw label background (filled rectangle sized to the text, above the box)
        label = f"{obj_type} {color_name} {size}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(
            annotated,
            (bbox[0], bbox[1] - th - 8),
            (bbox[0] + tw + 4, bbox[1]),
            color,
            -1,
        )
        cv2.putText(
            annotated,
            label,
            (bbox[0] + 2, bbox[1] - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
        )
        # Draw position (base-frame x/y/z in millimetres, below the box)
        pos_label = f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]"
        cv2.putText(
            annotated,
            pos_label,
            (bbox[0], bbox[3] + 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            color,
            1,
        )
    return annotated
def main() -> None:
    """Detector node entry point: YOLO inference plus base-frame localization.

    Event loop over Dora inputs: caches the latest TCP pose and point cloud,
    runs detection on every N-th image frame, converts each detection's
    camera-frame 3D point into the robot base frame via the hand-eye
    calibration, then publishes a JSON object list and an annotated image.
    """
    image_input = os.getenv("IMAGE_INPUT", "image_bgr")
    point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    objects_output = os.getenv("OBJECTS_OUTPUT", "objects")
    image_output = os.getenv("IMAGE_OUTPUT", "image_annotated")
    calibration_file = os.getenv("CALIBRATION_FILE", "calibration.npz")
    weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS)
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    cfg = _build_config(config_file)
    model = YOLO(weights)
    # Prefer a copy of the calibration under ./config/ when one exists.
    if calibration_file and not os.path.isabs(calibration_file):
        config_path = os.path.join("config", calibration_file)
        calibration_file = config_path if os.path.exists(config_path) else calibration_file
    t_cam2gripper = _load_calibration(calibration_file)
    node = Node()
    # Latest-known state from sibling nodes; detection waits for both.
    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None  # NOTE(review): written, never read
    latest_point_cloud: Optional[np.ndarray] = None
    latest_pc_at: Optional[float] = None  # NOTE(review): written, never read
    frame_count = 0
    for event in node:
        if event["type"] != "INPUT":
            continue
        now = time.monotonic()
        if event["id"] == pose_input:
            # Cache the robot TCP pose [x, y, z, roll, pitch, yaw].
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                latest_pose = tcp_pose[:6].tolist()
                latest_pose_at = now
            continue
        if event["id"] == point_cloud_input:
            latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {}))
            latest_pc_at = now
            continue
        if event["id"] != image_input:
            continue
        # Throttle: run YOLO only on every N-th image frame.
        frame_count += 1
        if frame_count % max(1, cfg.detect_every_n) != 0:
            continue
        if latest_pose is None or latest_point_cloud is None:
            continue
        frame = _decode_image(event["value"], event.get("metadata", {}))
        results = model.predict(
            frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False
        )[0]
        base_T_flange = _pose_to_matrix(latest_pose)
        objects: List[Dict[str, Any]] = []
        for r in results.boxes:
            bbox = [int(x) for x in r.xyxy[0]]
            if not _within_roi(bbox, cfg):
                continue
            # Sample the 3D point at the bbox center (with neighbor fallback).
            cx = int((bbox[0] + bbox[2]) / 2)
            cy = int((bbox[1] + bbox[3]) / 2)
            point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg)
            if point_cam_mm is None:
                continue
            # Homogeneous camera-frame point in metres, then chain the
            # hand-eye calibration and the current flange pose to express
            # the point in the robot base frame.
            point_cam_m = np.array(
                [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0],
                dtype=np.float64,
            )
            point_cam_m[:3] /= 1000.0
            point_base = base_T_flange @ t_cam2gripper @ point_cam_m
            point_base_mm = point_base[:3] * 1000.0
            dominant = _dominant_color(frame, bbox)
            color_name = _closest_color(dominant, cfg)
            # Coarse size class from the bbox pixel area.
            area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
            size_label = "big" if area >= cfg.size_threshold else "small"
            objects.append(
                {
                    "object_type": results.names[int(r.cls.item())],
                    "confidence": float(r.conf.item()),
                    "color": color_name,
                    "size": size_label,
                    "bbox": bbox,
                    "center_px": [cx, cy],
                    "position_mm": [
                        float(point_base_mm[0]),
                        float(point_base_mm[1]),
                        float(point_base_mm[2]),
                    ],
                    "timestamp_ns": time.time_ns(),
                }
            )
        payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()})
        node.send_output(
            objects_output,
            pa.array([payload]),
            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        # Send annotated image
        # NOTE(review): ravel().tolist() allocates millions of Python ints per
        # frame; pa.array(annotated.ravel()) would avoid that but changes the
        # Arrow dtype (int64 -> uint8) — confirm downstream consumers first.
        annotated = _draw_detections(frame, objects, cfg)
        h, w = annotated.shape[:2]
        node.send_output(
            image_output,
            pa.array(annotated.ravel().tolist()),
            metadata={
                "encoding": "bgr8",
                "width": w,
                "height": h,
                "timestamp_ns": time.time_ns(),
            },
        )

View File

@@ -0,0 +1,19 @@
[project]
name = "dora-detector"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for YOLO-based object detection with ZED point cloud"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"opencv-python >= 4.1.1",
"pyarrow >= 12.0.0",
"ultralytics >= 8.0.0",
]
[project.scripts]
dora-detector = "dora_detector.main:main"

178
dora_iobridge/README.md Normal file
View File

@@ -0,0 +1,178 @@
# Dora IOBridge Node
A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates.
## Inputs/Outputs
| Input | Type | Description |
|----------------|--------|---------------------------------------|
| `voice_out` | JSON | Response from voice control node |
| `scene_update` | JSON | Scene objects from voice control |
| Output | Type | Description |
|----------------|--------|---------------------------------------|
| `voice_in` | string | Voice commands forwarded to Dora |
## Environment Variables
```bash
VOICE_HOST=0.0.0.0 # Bind address
VOICE_PORT=8765 # Listen port
```
## Installation
```bash
cd dora_iobridge
pip install -e .
```
## Testing
### Test with WebSocket (wscat)
```bash
# Install wscat
npm install -g wscat
# Connect to the server
wscat -c ws://localhost:8765
```
### Test with websocat
```bash
# Install websocat
# Ubuntu: sudo apt install websocat
# macOS: brew install websocat
# Send a ping
echo '{"type": "ping"}' | websocat ws://localhost:8765
# Response: {"type": "pong"}
# Send a voice command
echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765
# Request scene refresh
echo '{"type": "scene_refresh"}' | websocat ws://localhost:8765
```
### Test with Python
```python
import asyncio
import websockets
import json
async def test_iobridge():
uri = "ws://localhost:8765"
async with websockets.connect(uri) as ws:
# Test ping
await ws.send(json.dumps({"type": "ping"}))
response = await ws.recv()
print(f"Ping response: {response}")
# Send command
await ws.send(json.dumps({
"type": "command",
"text": "agarra el cubo rojo"
}))
# Listen for responses
async for message in ws:
data = json.loads(message)
print(f"Received: {data}")
asyncio.run(test_iobridge())
```
### Test with curl (HTTP upgrade not supported directly)
Since WebSocket requires an upgrade handshake, use this shell script:
```bash
#!/bin/bash
# test_iobridge.sh
# Using websocat for interactive testing
websocat ws://localhost:8765 <<EOF
{"type": "ping"}
{"type": "command", "text": "sube"}
{"type": "scene_refresh"}
EOF
```
## WebSocket Message Types
### Client -> Server
**Command (voice input)**
```json
{"type": "command", "text": "agarra el cubo rojo"}
```
**Ping (health check)**
```json
{"type": "ping"}
```
Response: `{"type": "pong"}`
**Scene Refresh**
```json
{"type": "scene_refresh"}
```
### Server -> Client (Broadcasts)
**Command Response**
```json
{
"type": "response",
"text": "Ok, voy a tomar",
"status": "ok"
}
```
**Scene Update**
```json
{
"type": "scene_updated",
"objects": [
{
"object_type": "cubo",
"color": "rojo",
"size": "big",
"position_mm": [150.0, 200.0, 280.0],
"source": "detection"
}
]
}
```
## Dora Dataflow Configuration
```yaml
nodes:
- id: iobridge
build: pip install -e ./dora_iobridge
path: dora_iobridge
inputs:
voice_out: voice_control/voice_out
scene_update: voice_control/scene_update
outputs:
- voice_in
env:
VOICE_HOST: "0.0.0.0"
VOICE_PORT: "8765"
```
```bash
dora up
dora start dataflow.yml
```
## Dependencies
- dora-rs >= 0.3.9
- pyarrow >= 12.0.0
- websockets >= 12.0

View File

@@ -0,0 +1 @@
"""Dora IO bridge node package."""

View File

@@ -0,0 +1,145 @@
"""Dora node bridging WebSocket IO to Dora topics."""
from __future__ import annotations
import asyncio
import json
import os
import threading
import time
from typing import Any, Dict, Optional, Set
import pyarrow as pa
from dora import Node
from websockets.server import serve, WebSocketServerProtocol
class IoBridgeServer:
    """WebSocket server bridging browser clients and the Dora dataflow.

    Incoming JSON messages are routed by their ``type`` field; replies and
    broadcasts are JSON-encoded before sending. The ``command_handler`` and
    ``scene_refresh_handler`` attributes are pluggable callables set by
    ``main`` and may be either plain functions or coroutine functions.
    """

    def __init__(self, host: str, port: int):
        self.host = host
        self.port = port
        # Currently connected clients; the broadcast target set.
        self.clients: Set[WebSocketServerProtocol] = set()
        # Callable(text) invoked for "command" messages; sync or async.
        self.command_handler = None
        # Callable() returning the scene objects list; sync or async.
        self.scene_refresh_handler = None

    async def handler(self, websocket: WebSocketServerProtocol):
        """Per-connection loop: register the client, route messages, unregister."""
        self.clients.add(websocket)
        try:
            async for message in websocket:
                try:
                    data = json.loads(message)
                except json.JSONDecodeError:
                    await websocket.send(
                        json.dumps({"type": "error", "text": "Invalid JSON message"})
                    )
                    continue
                response = await self._route_message(data, websocket)
                if response:
                    await websocket.send(json.dumps(response))
        finally:
            self.clients.discard(websocket)

    @staticmethod
    async def _maybe_await(result):
        """Await *result* when a handler returned a coroutine; pass through otherwise."""
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def _route_message(
        self, data: Dict[str, Any], websocket: WebSocketServerProtocol
    ) -> Optional[Dict[str, Any]]:
        """Dispatch one parsed message; return a reply dict or ``None``.

        BUG FIX: handlers registered by ``main`` may be plain (sync)
        functions. The previous code unconditionally ``await``-ed the
        handler's return value, which raised ``TypeError`` for sync
        handlers (awaiting ``None``). Results are now awaited only when
        they are coroutines.
        """
        msg_type = data.get("type")
        if msg_type == "command":
            text = data.get("text", "")
            if self.command_handler:
                await self._maybe_await(self.command_handler(text))
                return None
            return {"type": "error", "text": "No command handler registered"}
        if msg_type == "ping":
            return {"type": "pong"}
        if msg_type == "scene_refresh":
            if self.scene_refresh_handler:
                objects = await self._maybe_await(self.scene_refresh_handler())
                return {"type": "scene_updated", "objects": objects}
            return {"type": "error", "text": "No scene handler registered"}
        return {"type": "error", "text": f"Unknown message type: {msg_type}"}

    async def broadcast(self, message: Dict[str, Any]):
        """Send *message* to every connected client, ignoring per-client errors."""
        if not self.clients:
            return
        payload = json.dumps(message)
        await asyncio.gather(
            *[client.send(payload) for client in self.clients], return_exceptions=True
        )

    async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol):
        """Send *message* to a single client."""
        await websocket.send(json.dumps(message))

    async def start(self):
        """Serve until cancelled (the awaited Future never completes)."""
        async with serve(self.handler, self.host, self.port):
            await asyncio.Future()
def main() -> None:
    """Run the WebSocket bridge alongside the Dora event loop.

    A daemon thread runs the asyncio WebSocket server; the main thread
    consumes Dora events and re-broadcasts ``voice_out`` / ``scene_update``
    payloads to every connected client.
    """
    host = os.getenv("VOICE_HOST", "0.0.0.0")
    port = int(os.getenv("VOICE_PORT", "8765"))
    input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in")
    response_input = os.getenv("VOICE_OUT_INPUT", "voice_out")
    scene_input = os.getenv("SCENE_INPUT", "scene_update")
    node = Node()
    server = IoBridgeServer(host, port)
    loop = asyncio.new_event_loop()

    def push_command(text: str) -> None:
        """Forward a client command string into the Dora dataflow."""
        node.send_output(
            input_topic,
            pa.array([text]),
            metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()},
        )

    async def command_handler(text: str) -> None:
        # BUG FIX: IoBridgeServer awaits this handler's return value, so it
        # must be a coroutine function. The previous sync version returned
        # None, and `await None` raised TypeError on every command.
        push_command(text)

    async def handle_scene_refresh():
        # No scene cache here; scene updates are broadcast separately when
        # the voice node publishes them on `scene_update`.
        return []

    server.command_handler = command_handler
    server.scene_refresh_handler = handle_scene_refresh

    def run_server():
        # The server owns its own event loop in a daemon thread so the Dora
        # event iteration below can block freely.
        asyncio.set_event_loop(loop)
        loop.run_until_complete(server.start())

    threading.Thread(target=run_server, daemon=True).start()
    for event in node:
        if event["type"] != "INPUT":
            continue
        if event["id"] == response_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                message = {
                    "type": "response",
                    "text": payload.get("text", ""),
                    "status": payload.get("status", "ok"),
                }
            except Exception:
                # Non-JSON payloads are forwarded verbatim as the text.
                message = {"type": "response", "text": raw, "status": "ok"}
            # Schedule the broadcast on the server's loop from this thread.
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)
            continue
        if event["id"] == scene_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                objects = payload.get("objects", [])
                message = {"type": "scene_updated", "objects": objects}
            except Exception:
                message = {"type": "scene_updated", "objects": []}
            asyncio.run_coroutine_threadsafe(server.broadcast(message), loop)

View File

@@ -0,0 +1,17 @@
[project]
name = "dora-iobridge"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node bridging WebSocket IO to Dora topics"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"pyarrow >= 12.0.0",
"websockets >= 12.0",
]
[project.scripts]
dora-iobridge = "dora_iobridge.main:main"

View File

@@ -781,6 +781,12 @@ def _status_snapshot(helper: ULite6Helper) -> Dict[str, Any]:
}
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[ulite6 {timestamp}] {msg}", flush=True)
def main() -> None:
node = Node()
@@ -791,7 +797,42 @@ def main() -> None:
api_port = int(os.getenv("API_PORT", "8080"))
vacuum_enabled = os.getenv("VACUUM_ENABLED", "false").lower() in ("true", "1", "yes")
# Initial position settings
init_mode = os.getenv("INIT_MODE", "none").lower() # "home", "pose", or "none"
init_x = float(os.getenv("INIT_X", "300.0"))
init_y = float(os.getenv("INIT_Y", "0.0"))
init_z = float(os.getenv("INIT_Z", "250.0"))
init_roll = float(os.getenv("INIT_ROLL", "180.0"))
init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
init_yaw = float(os.getenv("INIT_YAW", "0.0"))
init_speed = float(os.getenv("INIT_SPEED", "50.0"))
_log(f"Connecting to robot at {robot_ip}...")
helper = ULite6Helper(robot_ip)
_log("Robot connected")
# Move to initial position on startup
if init_mode == "home":
_log("Moving to home position...")
code = helper.go_home()
if code == 0:
_log("Home position reached")
else:
_log(f"Home failed with code {code}")
elif init_mode == "pose":
_log(f"Moving to initial pose: [{init_x}, {init_y}, {init_z}] roll={init_roll} pitch={init_pitch} yaw={init_yaw}")
code = helper.move_to_pose(
init_x, init_y, init_z,
init_roll, init_pitch, init_yaw,
speed=init_speed,
units="mm",
)
if code == 0:
_log("Initial pose reached")
else:
_log(f"Move to initial pose failed with code {code}")
else:
_log("Skipping initial position (INIT_MODE=none)")
# Create and start FastAPI server in background thread
app = create_api(helper, default_speed, default_units, vacuum_enabled)
@@ -895,6 +936,28 @@ def main() -> None:
code=code,
status=_status_snapshot(helper),
)
elif action in ("vacuum_on", "vacuum_off"):
if not vacuum_enabled:
_send_command_status(
node,
command_id=command_id,
action=action,
ok=False,
message="Vacuum gripper not enabled",
status=_status_snapshot(helper),
)
continue
vacuum_on = action == "vacuum_on"
code = helper.set_vacuum_gripper(vacuum_on)
_send_command_status(
node,
command_id=command_id,
action=action,
ok=code == 0,
message="Vacuum command executed",
code=code,
status=_status_snapshot(helper),
)
else:
_send_command_status(
node,

View File

@@ -0,0 +1,211 @@
# Dora Voice Control Node
A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface.
## Features
- Spanish voice command parsing (rule-based or Gemini LLM)
- Real-time web debug interface
- Command queue management
- Workspace bounds validation
- Object detection integration
## File Structure
```
dora_voice_control/
├── __init__.py
├── main.py # Main Dora node entry point
├── api.py # FastAPI web server
├── config.py # Configuration management
├── models.py # Pydantic request/response models
├── parser.py # Voice command parsing logic
├── state.py # Shared state management
└── templates.py # HTML template for web interface
```
## Web Debug Interface
Access the debug interface at `http://localhost:8080` (default).
Features:
- Real-time status monitoring (pose, objects, queue)
- Send manual voice commands
- Quick command buttons
- View parse results
- Command history
- Clear queue
## Inputs/Outputs
| Input | Type | Description |
|---------------|--------|------------------------------------------|
| `voice_in` | string | Text transcription of voice command |
| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] |
| `objects` | JSON | Detected objects from vision system |
| `status` | JSON | Command execution status from robot |
| Output | Type | Description |
|---------------|--------|------------------------------------------|
| `robot_cmd` | JSON | Robot command with action and payload |
| `voice_out` | JSON | Response confirmation to user |
| `scene_update`| JSON | Updated scene with all visible objects |
## Supported Commands (Spanish)
| Command | Action | Example |
|---------------|----------------|--------------------------------|
| `subir` | Move up | "sube" |
| `bajar` | Move down | "baja" |
| `tomar` | Grab object | "agarra el cubo rojo" |
| `soltar` | Release object | "suelta en la caja azul" |
| `ir` | Go to object | "ve al cilindro" |
| `reiniciar` | Reset | "reinicia" |
## Environment Variables
```bash
# Web API Server
API_ENABLED=true # Enable/disable web interface
API_HOST=0.0.0.0 # Bind address
API_PORT=8080 # Listen port
# TCP Parameters
TCP_OFFSET_MM=63.0 # Z-offset to object surface
APPROACH_OFFSET_MM=50.0 # Safe approach distance above object
STEP_MM=20.0 # Distance for up/down increments
# LLM Configuration (optional)
LLM_PROVIDER=rules # "rules" or "gemini"
GOOGLE_API_KEY=your_key # Required if using gemini
GEMINI_MODEL=gemini-2.0-flash
# Workspace Safety (optional)
WORKSPACE_MIN_X=-300
WORKSPACE_MAX_X=300
WORKSPACE_MIN_Y=-300
WORKSPACE_MAX_Y=300
WORKSPACE_MIN_Z=0
WORKSPACE_MAX_Z=500
# Misc
DRY_RUN=false # Skip sending robot commands
```
## Installation
```bash
cd dora_voice_control
pip install -e .
# With LLM support
pip install -e ".[llm]"
```
## Testing
### Web Interface
```bash
# Start the node (standalone for testing)
python -m dora_voice_control.main
# Open in browser
open http://localhost:8080
```
### API Endpoints
```bash
# Get status
curl http://localhost:8080/api/status
# Get objects
curl http://localhost:8080/api/objects
# Get queue
curl http://localhost:8080/api/queue
# Send command
curl -X POST http://localhost:8080/api/command \
-H "Content-Type: application/json" \
-d '{"text": "sube"}'
# Clear queue
curl -X POST http://localhost:8080/api/queue/clear
```
### Python Test
```python
from dora_voice_control.parser import rule_parse, normalize
# Test command parsing
text = "agarra el cubo rojo grande"
result = rule_parse(normalize(text))
print(result)
# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'}
```
## Dora Dataflow Configuration
```yaml
nodes:
- id: voice_control
build: pip install -e ./dora_voice_control
path: dora_voice_control
inputs:
voice_in: iobridge/voice_in
tcp_pose: robot/tcp_pose
objects: detector/objects
status: robot/status
outputs:
- robot_cmd
- voice_out
- scene_update
env:
API_ENABLED: "true"
API_PORT: "8080"
DRY_RUN: "false"
```
## Message Examples
### Input: voice_in
```
"sube"
"agarra el cubo rojo"
"suelta en la caja azul"
```
### Output: robot_cmd
```json
{
"id": "550e8400-e29b-41d4-a716-446655440000",
"action": "move_to_pose",
"payload": {
"x": 150.0,
"y": 200.0,
"z": 280.0,
"roll": 180.0,
"pitch": 0.0,
"yaw": 0.0
}
}
```
### Output: voice_out
```json
{"text": "Ok, voy a subir", "status": "ok"}
{"text": "No entendi el comando", "status": "error"}
```
## Dependencies
- dora-rs >= 0.3.9
- numpy < 2.0.0
- pyarrow >= 12.0.0
- fastapi >= 0.109.0
- uvicorn >= 0.27.0
- pydantic >= 2.0.0
- google-genai (optional, for Gemini mode)

View File

@@ -0,0 +1 @@
"""Dora voice control node package."""

View File

@@ -0,0 +1,162 @@
"""FastAPI application for the voice control web interface."""
from __future__ import annotations
import os
import sys
import threading
from typing import Any
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, Response
# Handle both package and direct script execution
# __package__ is None when run as script, '' when imported from a script
if not __package__:
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
if _pkg_dir not in sys.path:
sys.path.insert(0, _pkg_dir)
from models import CommandRequest, CommandResponse
from state import SharedState
from templates import HTML_TEMPLATE
else:
from .models import CommandRequest, CommandResponse
from .state import SharedState
from .templates import HTML_TEMPLATE
def create_api(state: SharedState) -> FastAPI:
    """Create FastAPI application with voice control endpoints.

    All endpoints read through the thread-safe ``SharedState`` accessors;
    handler failures are surfaced as HTTP 500 so the debug UI keeps working.
    """
    app = FastAPI(
        title="Voice Control Debug API",
        description="Debug interface for the voice control node",
        version="0.1.0",
    )
    @app.get("/", response_class=HTMLResponse)
    def index() -> str:
        """Serve the web interface (single self-contained HTML page)."""
        return HTML_TEMPLATE
    @app.get("/api/status")
    def get_status() -> dict:
        """Get current status (pose, queue size, last voice input, ...)."""
        try:
            return state.get_status()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/objects")
    def get_objects() -> dict:
        """Get detected and static objects."""
        try:
            return state.get_objects()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/queue")
    def get_queue() -> list:
        """Get the command queue."""
        try:
            return state.get_queue()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.post("/api/queue/clear")
    def clear_queue() -> dict:
        """Clear the command queue."""
        try:
            # NOTE(review): reaches into SharedState internals (_lock /
            # voice_state); a SharedState.clear_queue() accessor would keep
            # the locking discipline in one place.
            with state._lock:
                state.voice_state.queue.clear()
            return {"ok": True}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/history")
    def get_history() -> list:
        """Get command history (most recent entries)."""
        try:
            return state.get_history()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/errors")
    def get_errors() -> list:
        """Get error log."""
        try:
            return state.get_errors()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.post("/api/command", response_model=CommandResponse)
    def send_command(request: CommandRequest) -> CommandResponse:
        """Send a voice command through the registered command callback."""
        try:
            callback = state.get_command_callback()
            if callback is None:
                return CommandResponse(ok=False, text="No command handler available", status="error")
            result = callback(request.text)
            return CommandResponse(
                ok=result.get("status") == "ok",
                text=result.get("text", ""),
                status=result.get("status", "error"),
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/image")
    def get_image() -> Response:
        """Get the latest camera image as JPEG."""
        try:
            image_data = state.get_image()
            if image_data is None:
                # No frame yet: respond 204 No Content with an empty body.
                return Response(
                    content=b"",
                    media_type="image/jpeg",
                    status_code=204,
                )
            return Response(
                content=image_data,
                media_type="image/jpeg",
                headers={"Cache-Control": "no-cache, no-store, must-revalidate"},
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    @app.get("/api/image/info")
    def get_image_info() -> dict:
        """Get image metadata (presence and age in milliseconds)."""
        try:
            return {
                "has_image": state.get_image() is not None,
                "age_ms": state.get_image_age_ms(),
            }
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
    return app
def run_uvicorn(app: FastAPI, host: str, port: int) -> None:
    """Blocking uvicorn runner; intended as a background-thread target."""
    settings = uvicorn.Config(app, host=host, port=port, log_level="warning")
    uvicorn.Server(settings).run()
def start_api_server(state: SharedState, config: Any) -> threading.Thread:
    """Launch the web API in a daemon thread and return that thread."""
    import time as _time

    api_thread = threading.Thread(
        target=run_uvicorn,
        args=(create_api(state), config.host, config.port),
        daemon=True,
    )
    api_thread.start()
    timestamp = _time.strftime("%H:%M:%S")
    print(f"[voice_control {timestamp}] Web interface at http://{config.host}:{config.port}", flush=True)
    return api_thread

View File

@@ -0,0 +1,95 @@
"""Configuration for the voice control node."""
from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
@dataclass
class VoiceConfig:
    """Configuration for voice control."""

    # NOTE(review): host/port are filled with ""/0 by load_voice_config and
    # appear unused by the voice logic — confirm before relying on them.
    host: str
    port: int
    # Vertical tool-center-point offset added above an object's Z (mm).
    tcp_offset_mm: float
    # Extra clearance above the grasp pose for the approach move (mm).
    approach_offset_mm: float
    # Z increment used by the "subir"/"bajar" (up/down) commands (mm).
    step_mm: float
    # Default tool orientation used for every queued pose (180/0/0 defaults).
    default_roll: float
    default_pitch: float
    default_yaw: float
    # When True, commands are logged but never sent to the robot.
    dry_run: bool
    # Optional per-axis workspace limits (mm); None disables that bound.
    workspace_min: Tuple[Optional[float], Optional[float], Optional[float]]
    workspace_max: Tuple[Optional[float], Optional[float], Optional[float]]
    # Maps parsed Spanish tokens to detector class names (CLASS_MAP JSON).
    class_map: Dict[str, str]
@dataclass
class ApiConfig:
    """Configuration for the web API server."""

    # Bind address for the debug web server (API_HOST, default "0.0.0.0").
    host: str
    # TCP port for the debug web server (API_PORT, default 8080).
    port: int
    # Whether to start the server at all (API_ENABLED).
    enabled: bool
def _parse_float_env(name: str) -> Optional[float]:
"""Parse an optional float from environment variable."""
raw = os.getenv(name)
if raw is None or raw == "":
return None
try:
return float(raw)
except ValueError:
return None
def _parse_class_map(raw: str) -> Dict[str, str]:
"""Parse JSON class mapping from string."""
import json
if not raw:
return {}
try:
data = json.loads(raw)
if isinstance(data, dict):
return {str(k): str(v) for k, v in data.items()}
except Exception:
pass
return {}
def load_voice_config() -> VoiceConfig:
    """Load voice configuration from environment variables."""
    env = os.getenv
    truthy = ("true", "1", "yes")
    min_bounds = (
        _parse_float_env("WORKSPACE_MIN_X"),
        _parse_float_env("WORKSPACE_MIN_Y"),
        _parse_float_env("WORKSPACE_MIN_Z"),
    )
    max_bounds = (
        _parse_float_env("WORKSPACE_MAX_X"),
        _parse_float_env("WORKSPACE_MAX_Y"),
        _parse_float_env("WORKSPACE_MAX_Z"),
    )
    return VoiceConfig(
        host="",
        port=0,
        tcp_offset_mm=float(env("TCP_OFFSET_MM", "63.0")),
        approach_offset_mm=float(env("APPROACH_OFFSET_MM", "50.0")),
        step_mm=float(env("STEP_MM", "20.0")),
        default_roll=float(env("DEFAULT_ROLL", "180.0")),
        default_pitch=float(env("DEFAULT_PITCH", "0.0")),
        default_yaw=float(env("DEFAULT_YAW", "0.0")),
        dry_run=env("DRY_RUN", "false").lower() in truthy,
        workspace_min=min_bounds,
        workspace_max=max_bounds,
        class_map=_parse_class_map(env("CLASS_MAP", "")),
    )
def load_api_config() -> ApiConfig:
    """Load API server configuration from environment variables."""
    truthy = ("true", "1", "yes")
    return ApiConfig(
        host=os.getenv("API_HOST", "0.0.0.0"),
        port=int(os.getenv("API_PORT", "8080")),
        enabled=os.getenv("API_ENABLED", "true").lower() in truthy,
    )

View File

@@ -0,0 +1,501 @@
"""Dora node for voice control with safe robot commands."""
from __future__ import annotations
import json
import os
import sys
import time
import uuid
from collections import deque
from typing import Any, Deque, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib
# Handle both package and direct script execution
# __package__ is None when run as script, '' when imported from a script
_RUNNING_AS_SCRIPT = not __package__
if _RUNNING_AS_SCRIPT:
# Running as script - use absolute imports
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
if _pkg_dir not in sys.path:
sys.path.insert(0, _pkg_dir)
from config import VoiceConfig, load_api_config, load_voice_config
from parser import normalize, parse_command
from state import RobotStep, SharedState
from api import start_api_server
else:
# Running as package - use relative imports
from .config import VoiceConfig, load_api_config, load_voice_config
from .parser import normalize, parse_command
from .state import RobotStep, SharedState
from .api import start_api_server
def _within_bounds(
point_mm: np.ndarray,
min_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
max_xyz: Tuple[Optional[float], Optional[float], Optional[float]],
) -> bool:
"""Check if point is within workspace bounds."""
x, y, z = point_mm.tolist()
min_x, min_y, min_z = min_xyz
max_x, max_y, max_z = max_xyz
if min_x is not None and x < min_x:
return False
if max_x is not None and x > max_x:
return False
if min_y is not None and y < min_y:
return False
if max_y is not None and y > max_y:
return False
if min_z is not None and z < min_z:
return False
if max_z is not None and z > max_z:
return False
return True
def _translate_target(token: str, mapping: Dict[str, str]) -> str:
"""Translate object name using class map."""
if token in mapping:
return mapping[token]
return token
def _load_config_file(path: str) -> Dict[str, Any]:
"""Load TOML configuration file."""
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception:
return {}
def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]:
    """Build static bucket "objects" from [bucket_positions] in the config.

    Each configured bucket becomes a big box of its color at the config's
    ``normal_height`` Z; entries with missing/short positions are skipped.
    """
    cfg = _load_config_file(config_path)
    buckets = cfg.get("bucket_positions", {})
    obj_cfg = cfg.get("object_parameters", {})
    base_z = float(obj_cfg.get("normal_height", 220.0))
    bucket_keys = (
        ("blue_bucket_pos", "blue"),
        ("red_bucket_pos", "red"),
        ("yellow_bucket_pos", "yellow"),
        ("white_bucket_pos", "white"),
    )
    objects: List[Dict[str, Any]] = []
    for key, color in bucket_keys:
        pos = buckets.get(key)
        if isinstance(pos, list) and len(pos) >= 2:
            objects.append(
                {
                    "object_type": "box",
                    "color": color,
                    "size": "big",
                    "position_mm": [float(pos[0]), float(pos[1]), base_z],
                    "source": "config",
                }
            )
    return objects
def _send_dora_command(
    node: Node, output_name: str, action: str, payload: Dict[str, Any]
) -> str:
    """Publish one robot command as JSON on *output_name*; return its UUID."""
    command_id = str(uuid.uuid4())
    body = json.dumps({"id": command_id, "action": action, "payload": payload})
    node.send_output(
        output_name,
        pa.array([body]),
        metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
    )
    return command_id
def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]:
"""Parse status payload from robot."""
if len(value) == 0:
return None
raw = value[0].as_py()
if not raw:
return None
try:
return json.loads(raw)
except Exception:
return None
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[voice_control {timestamp}] {msg}", flush=True)
def main() -> None:
    """Main entry point for the voice control node.

    Event-loop responsibilities:
    - receive voice transcripts, robot pose/status, detections and camera
      frames from the Dora dataflow;
    - translate parsed intents into queued ``RobotStep`` commands;
    - dispatch queued steps one at a time, waiting for a matching status
      acknowledgement before sending the next.
    """
    _log("Starting voice control node...")
    # Load configuration
    cfg = load_voice_config()
    api_cfg = load_api_config()
    # Environment variables for I/O topics
    objects_input = os.getenv("OBJECTS_INPUT", "objects")
    voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in")
    voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out")
    scene_output = os.getenv("SCENE_OUTPUT", "scene_update")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    status_input = os.getenv("STATUS_INPUT", "status")
    command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd")
    image_input = os.getenv("IMAGE_INPUT", "image_annotated")
    llm_provider = os.getenv("LLM_PROVIDER", "rules").lower()
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    # Image dimensions used to reshape incoming raw frames.
    # NOTE(review): these are fixed from the environment, not "detected from
    # the first frame" as previously commented — a mismatch with the camera
    # output will make the reshape below fail.
    image_width = int(os.getenv("IMAGE_WIDTH", "1280"))
    image_height = int(os.getenv("IMAGE_HEIGHT", "720"))
    # Initial/home position for reset command
    init_x = float(os.getenv("INIT_X", "300.0"))
    init_y = float(os.getenv("INIT_Y", "0.0"))
    init_z = float(os.getenv("INIT_Z", "250.0"))
    init_roll = float(os.getenv("INIT_ROLL", "180.0"))
    init_pitch = float(os.getenv("INIT_PITCH", "0.0"))
    init_yaw = float(os.getenv("INIT_YAW", "0.0"))
    _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm")
    _log(f"Initial position: [{init_x}, {init_y}, {init_z}]")
    _log(f"LLM provider: {llm_provider}")
    _log(f"Dry run: {cfg.dry_run}")
    # Initialize shared state
    shared_state = SharedState()
    state = shared_state.voice_state
    state.static_objects = _load_bucket_objects(config_file)
    pending_intents: Deque[Dict[str, Any]] = deque()
    _log(f"Loaded {len(state.static_objects)} static objects from config")
    # Queue initial position movement on startup (same as reiniciar)
    init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes")
    send_init_scene_reset = init_on_start  # Flag to send scene reset after node starts
    if init_on_start:
        _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
        # Clear detected objects
        state.latest_objects = []
        state.latest_objects_at = None
        # Queue vacuum off and move to home
        state.queue.append(RobotStep(action="vacuum_off", payload={}))
        state.queue.append(
            RobotStep(
                action="move_to_pose",
                payload={
                    "x": init_x,
                    "y": init_y,
                    "z": init_z,
                    "roll": init_roll,
                    "pitch": init_pitch,
                    "yaw": init_yaw,
                },
            )
        )
    def command_handler(transcript: str) -> Dict[str, str]:
        """Handle voice command and return response.

        Parses the transcript, records it for the debug UI, and appends the
        resulting intent to ``pending_intents`` for the event loop to act on.
        Called from the Dora loop AND from the web API thread.
        """
        _log(f"Voice input received: \"{transcript}\"")
        llm_result = parse_command(transcript, llm_provider)
        _log(f"Parse result: {llm_result}")
        # Update debug state
        shared_state.update_voice_input(transcript, llm_result, time.monotonic())
        if llm_result.get("resultado") != "ok":
            _log("Command not understood")
            return {"text": "No entendi el comando", "status": "error"}
        action = llm_result.get("accion", "error")
        obj = llm_result.get("objeto", "no especificado")
        color = llm_result.get("color", "no especificado")
        size = llm_result.get("tamano", "no especificado")
        _log(f"Intent: action={action}, object={obj}, color={color}, size={size}")
        pending_intents.append(
            {"action": action, "obj": obj, "color": color, "size": size}
        )
        # Add to history
        shared_state.add_to_history({
            "timestamp": time.time(),
            "input": transcript,
            "action": action,
            "object": obj,
            "color": color,
            "size": size,
        })
        return {"text": f"Ok, voy a {action}", "status": "ok"}
    # Set command callback for web interface
    shared_state.set_command_callback(command_handler)
    # Start web API server if enabled
    if api_cfg.enabled:
        start_api_server(shared_state, api_cfg)
    # Create Dora node
    node = Node()
    _log("Dora node created, waiting for events...")
    first_event = True
    for event in node:
        # Send scene reset on first event (startup)
        if first_event and send_init_scene_reset:
            first_event = False
            scene_payload = json.dumps(
                {"objects": list(state.static_objects), "reset": True}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            _log("Sent initial scene reset notification")
        if event["type"] != "INPUT":
            continue
        # Handle voice input
        if event["id"] == voice_in_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if not raw:
                continue
            response = command_handler(raw)
            node.send_output(
                voice_out_output,
                pa.array([json.dumps(response)]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
            continue
        # Handle pose updates
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                state.latest_pose = tcp_pose[:6].tolist()
                state.latest_pose_at = time.monotonic()
            continue
        # Handle object detection updates
        if event["id"] == objects_input:
            raw = event["value"][0].as_py() if len(event["value"]) else ""
            if raw:
                try:
                    payload = json.loads(raw)
                    objects = payload.get("objects", [])
                except Exception:
                    objects = []
                state.latest_objects = objects
                state.latest_objects_at = time.monotonic()
            continue
        # Handle camera image
        if event["id"] == image_input:
            try:
                # Get raw image data
                img_data = event["value"].to_numpy()
                # Reshape to image (assuming BGR format)
                img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8)
                # Encode to JPEG
                _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
                shared_state.update_image(jpeg_data.tobytes(), time.monotonic())
            except Exception as e:
                # Log error but don't crash
                # NOTE(review): 'e' is swallowed silently — consider logging
                # it so reshape/encoding failures become visible.
                pass
            continue
        # Handle robot status updates
        if event["id"] == status_input:
            payload = _parse_status_payload(event["value"])
            if payload and state.pending_command:
                # Only a status ack matching the in-flight command id
                # unblocks the queue dispatcher below.
                if payload.get("command_id") == state.pending_command.get("id"):
                    _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})")
                    state.pending_command = None
            continue
        # Process pending intents
        # Reached only for inputs not handled above (e.g. the timer tick),
        # since every handler above ends with `continue`.
        if pending_intents:
            intent = pending_intents.popleft()
            action = intent["action"]
            obj = intent["obj"]
            color = intent["color"]
            size = intent["size"]
            _log(f"Processing intent: {action} {obj} {color} {size}")
            latest_pose = state.latest_pose
            objects = list(state.latest_objects) + list(state.static_objects)
            _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static")
            if action in ("subir", "bajar") and latest_pose:
                delta = cfg.step_mm if action == "subir" else -cfg.step_mm
                target = np.array(latest_pose[:3], dtype=np.float64)
                target[2] += delta
                if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                    step = RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": float(target[0]),
                            "y": float(target[1]),
                            "z": float(target[2]),
                            "roll": cfg.default_roll,
                            "pitch": cfg.default_pitch,
                            "yaw": cfg.default_yaw,
                        },
                    )
                    state.queue.append(step)
                    _log(f"Queued: move Z to {target[2]:.1f}mm (delta={delta:+.1f})")
                else:
                    _log(f"Target {target.tolist()} out of bounds, skipping")
            elif action in ("ir", "tomar", "soltar"):
                target_obj = None
                if obj != "no especificado":
                    target_name = _translate_target(obj, cfg.class_map)
                    target_color = _translate_target(color, cfg.class_map)
                    _log(f"Looking for: type={target_name}, color={target_color}")
                    # Log available objects for debugging
                    for o in objects:
                        _log(f"  -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}")
                    # First object matching type (and color/size when given) wins.
                    for o in objects:
                        if o.get("object_type") == target_name:
                            if color == "no especificado" or o.get("color") == target_color:
                                if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map):
                                    target_obj = o
                                    break
                if target_obj:
                    _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}")
                    pos = np.array(target_obj["position_mm"], dtype=np.float64)
                    # Approach hovers approach_offset_mm above the grasp pose.
                    approach = pos.copy()
                    approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm
                    target = pos.copy()
                    target[2] += cfg.tcp_offset_mm
                    if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(approach[0]),
                                    "y": float(approach[1]),
                                    "z": float(approach[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: approach pose at Z={approach[2]:.1f}mm")
                    if _within_bounds(target, cfg.workspace_min, cfg.workspace_max):
                        state.queue.append(
                            RobotStep(
                                action="move_to_pose",
                                payload={
                                    "x": float(target[0]),
                                    "y": float(target[1]),
                                    "z": float(target[2]),
                                    "roll": cfg.default_roll,
                                    "pitch": cfg.default_pitch,
                                    "yaw": cfg.default_yaw,
                                },
                            )
                        )
                        _log(f"Queued: target pose at Z={target[2]:.1f}mm")
                    if action == "tomar":
                        state.queue.append(RobotStep(action="vacuum_on", payload={}))
                        _log("Queued: vacuum_on")
                    elif action == "soltar":
                        state.queue.append(RobotStep(action="vacuum_off", payload={}))
                        _log("Queued: vacuum_off")
                else:
                    _log(f"Target object not found: {obj} {color}")
                    # NOTE(review): this continue also skips the queue-size
                    # log and the dispatch step below for this cycle.
                    continue
            elif action == "reiniciar":
                _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]")
                # Turn off vacuum first
                state.queue.append(RobotStep(action="vacuum_off", payload={}))
                # Clear current detected objects (will be refreshed by detector)
                state.latest_objects = []
                state.latest_objects_at = None
                _log("Cleared detected objects - waiting for fresh detection")
                # Move to initial position
                state.queue.append(
                    RobotStep(
                        action="move_to_pose",
                        payload={
                            "x": init_x,
                            "y": init_y,
                            "z": init_z,
                            "roll": init_roll,
                            "pitch": init_pitch,
                            "yaw": init_yaw,
                        },
                    )
                )
                _log(f"Queued: vacuum_off + move to home")
                # Send scene update to notify clients that scene was reset
                scene_payload = json.dumps(
                    {"objects": list(state.static_objects), "reset": True}
                )
                node.send_output(
                    scene_output,
                    pa.array([scene_payload]),
                    metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
                )
                _log("Sent scene reset notification")
            _log(f"Queue size: {len(state.queue)}")
        # Emit scene updates when objects change
        # NOTE(review): unreachable — the objects_input handler above ends
        # with `continue`, so this branch never runs for detection events.
        if event["id"] == objects_input:
            scene_payload = json.dumps(
                {"objects": list(state.latest_objects) + list(state.static_objects)}
            )
            node.send_output(
                scene_output,
                pa.array([scene_payload]),
                metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
            )
        # Send queued robot steps one at a time
        if state.pending_command is None and state.queue:
            step = state.queue.popleft()
            if cfg.dry_run:
                _log(f"[DRY RUN] Would send: {step.action} {step.payload}")
                state.pending_command = None
                continue
            cmd_id = _send_dora_command(node, command_output, step.action, step.payload)
            state.pending_command = {"id": cmd_id, "action": step.action}
            _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) remaining={len(state.queue)}")
            # Update debug state
            shared_state.update_robot_command(
                {"id": cmd_id, "action": step.action, "payload": step.payload},
                time.monotonic(),
            )
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,38 @@
"""Pydantic models for the voice control API."""
from __future__ import annotations
from typing import Optional
from pydantic import BaseModel
class CommandRequest(BaseModel):
    """Request to send a voice command."""

    # Raw transcript text, e.g. "agarra el cubo rojo".
    text: str
class CommandResponse(BaseModel):
    """Response from a voice command."""

    # True when the command was understood and queued (status == "ok").
    ok: bool
    # Human-readable feedback for the user.
    text: str
    # "ok" or "error", mirroring the parser result.
    status: str
class MoveRequest(BaseModel):
    """Request to move to a position."""

    # Target position — presumably millimetres, matching the robot command
    # payloads elsewhere in this package; confirm against the robot node.
    x: float
    y: float
    z: float
    # Tool orientation; defaults match the package-wide 180/0/0 pose.
    roll: Optional[float] = 180.0
    pitch: Optional[float] = 0.0
    yaw: Optional[float] = 0.0
class VacuumRequest(BaseModel):
    """Request to control the vacuum."""

    # True to switch the vacuum on, False to switch it off.
    on: bool

View File

@@ -0,0 +1,118 @@
"""Voice command parsing logic."""
from __future__ import annotations
import json
import os
import unicodedata
from typing import Dict
def normalize(text: str) -> str:
    """Lowercase, trim, and strip accents (NFKD + drop combining marks)."""
    lowered = unicodedata.normalize("NFKD", text.lower().strip())
    return "".join(c for c in lowered if not unicodedata.combining(c))
def rule_parse(transcript: str) -> Dict[str, str]:
    """Parse a (Spanish) voice command with keyword substring matching.

    Returns {"resultado": "error"} when no action keyword is present,
    otherwise the ok-dict with accion/objeto/color/tamano fields, each
    defaulting to "no especificado".
    """
    text = normalize(transcript)

    # Ordered so that e.g. "reiniciar" wins before its substring "ir".
    action_table = (
        ("reiniciar", ("reiniciar", "reinicia", "reset")),
        ("subir", ("sube", "subir", "arriba")),
        ("bajar", ("baja", "bajar", "abajo")),
        ("soltar", ("soltar", "deja", "dejar")),
        ("tomar", ("tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar")),
        ("ir", ("ir", "ve", "mover", "muevete", "acercar")),
    )
    action = "error"
    for label, keywords in action_table:
        if any(word in text for word in keywords):
            action = label
            break
    if action == "error":
        return {"resultado": "error"}

    def first_match(table, default="no especificado"):
        # Return the first label whose keywords appear in the transcript.
        for label, keywords in table:
            if any(word in text for word in keywords):
                return label
        return default

    color = first_match((
        ("rojo", ("rojo",)),
        ("azul", ("azul",)),
        ("amarillo", ("amarillo",)),
        ("blanco", ("blanco",)),
    ))
    obj = first_match((
        ("estrella", ("estrella",)),
        ("cilindro", ("cilindro",)),
        ("cubo", ("cubo",)),
        ("caja", ("caja",)),
    ))
    size = first_match((
        ("grande", ("grande",)),
        ("pequeno", ("pequeno", "pequeño", "chico")),
    ))
    return {
        "resultado": "ok",
        "accion": action,
        "objeto": obj,
        "color": color,
        "tamano": size,
    }
def build_gemini_prompt(transcript: str) -> str:
    """Build prompt for Gemini LLM parsing.

    The prompt is deliberately in Spanish: it asks the model to map a
    child's spoken command onto the fixed action/object/color/size
    vocabulary and reply as JSON with keys 'resultado', 'accion', 'objeto',
    'color' and 'tamano' — the same shape rule_parse produces.
    """
    return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar
un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color
o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase,
responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones
posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo',
'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella,
cilindro, cubo y caja; cualquier otro objeto es error.
Comando: "{transcript}"
Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de
los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos
'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto.
"""
def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]:
    """Parse a voice command with the selected provider.

    Any provider other than "gemini" uses the rule-based parser. The
    Gemini path degrades to rules when the SDK is missing, the API key is
    unset, or the request fails; an unparseable model reply is an error.
    """
    if llm_provider != "gemini":
        return rule_parse(transcript)
    try:
        from google import genai
        from google.genai import types
    except Exception:
        # google-genai is an optional dependency.
        return rule_parse(transcript)
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        return rule_parse(transcript)
    try:
        client = genai.Client(api_key=api_key)
        prompt = build_gemini_prompt(transcript)
        reply = client.models.generate_content(
            model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"),
            contents=prompt,
            config=types.GenerateContentConfig(temperature=0.5),
        )
        # The model may wrap its JSON in a fenced code block; strip fences.
        raw = str(reply.text).replace("```json", "").replace("```", "")
        return json.loads(raw)
    except json.JSONDecodeError:
        return {"resultado": "error"}
    except Exception:
        return rule_parse(transcript)

View File

@@ -0,0 +1,158 @@
"""Shared state management for voice control node."""
from __future__ import annotations
import threading
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Deque, Dict, List, Optional
@dataclass
class RobotStep:
    """A single step in the robot command queue."""

    # Command verb for the robot node, e.g. "move_to_pose", "vacuum_on".
    action: str
    # JSON-serializable arguments for the action (pose fields, etc.).
    payload: Dict[str, Any]
@dataclass
class VoiceState:
    """Runtime state for voice control."""

    # Last robot TCP pose [x, y, z, roll, pitch, yaw] and its arrival time
    # (time.monotonic() seconds).
    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None
    # Most recent detector output and its arrival time.
    latest_objects: List[Dict[str, Any]] = field(default_factory=list)
    latest_objects_at: Optional[float] = None
    # Fixed bucket "objects" loaded from the TOML config at startup.
    static_objects: List[Dict[str, Any]] = field(default_factory=list)
    # Command awaiting a status ack ({"id", "action"}), if any.
    pending_command: Optional[Dict[str, Any]] = None
    # FIFO of RobotSteps still to be sent to the robot.
    queue: Deque[RobotStep] = field(default_factory=deque)
@dataclass
class DebugState:
    """Debug information for the web interface."""

    # Last transcript received, when it arrived, and how it parsed.
    last_voice_input: Optional[str] = None
    last_voice_input_at: Optional[float] = None
    last_parse_result: Optional[Dict[str, Any]] = None
    # Last command dispatched to the robot and when.
    last_robot_command: Optional[Dict[str, Any]] = None
    last_robot_command_at: Optional[float] = None
    # Rolling logs (capped by SharedState's mutators).
    command_history: List[Dict[str, Any]] = field(default_factory=list)
    error_log: List[Dict[str, Any]] = field(default_factory=list)
    # Latest camera frame as encoded JPEG bytes, plus its timestamp.
    latest_image: Optional[bytes] = None
    latest_image_at: Optional[float] = None
class SharedState:
    """Thread-safe shared state container.

    A single lock guards ``voice_state``, ``debug_state`` and the command
    callback; all accessors return copies or immutable data.
    NOTE(review): the main event loop also mutates ``voice_state`` directly
    without taking this lock, so web-API readers may observe partially
    updated state — confirm this is acceptable for a debug-only interface.
    """
    def __init__(self) -> None:
        self._lock = threading.Lock()
        self.voice_state = VoiceState()
        self.debug_state = DebugState()
        self._command_callback: Optional[Any] = None
    def set_command_callback(self, callback: Any) -> None:
        """Set callback for sending commands from web interface."""
        with self._lock:
            self._command_callback = callback
    def get_command_callback(self) -> Optional[Any]:
        """Get the command callback."""
        with self._lock:
            return self._command_callback
    def get_status(self) -> Dict[str, Any]:
        """Get current status for web interface."""
        with self._lock:
            vs = self.voice_state
            ds = self.debug_state
            return {
                "has_pose": vs.latest_pose is not None,
                "pose": vs.latest_pose,
                "pose_age_ms": _age_ms(vs.latest_pose_at),
                "object_count": len(vs.latest_objects),
                "static_object_count": len(vs.static_objects),
                "queue_size": len(vs.queue),
                "has_pending_command": vs.pending_command is not None,
                "pending_command": vs.pending_command,
                "last_voice_input": ds.last_voice_input,
                "last_voice_input_age_ms": _age_ms(ds.last_voice_input_at),
                "last_parse_result": ds.last_parse_result,
            }
    def get_objects(self) -> Dict[str, Any]:
        """Get detected and static objects (shallow copies)."""
        with self._lock:
            return {
                "detected": list(self.voice_state.latest_objects),
                "static": list(self.voice_state.static_objects),
            }
    def get_queue(self) -> List[Dict[str, Any]]:
        """Get the command queue as plain dicts."""
        with self._lock:
            return [{"action": s.action, "payload": s.payload} for s in self.voice_state.queue]
    def get_history(self) -> List[Dict[str, Any]]:
        """Get command history (only the 50 most recent entries)."""
        with self._lock:
            return list(self.debug_state.command_history[-50:])
    def get_errors(self) -> List[Dict[str, Any]]:
        """Get error log (only the 50 most recent entries)."""
        with self._lock:
            return list(self.debug_state.error_log[-50:])
    def add_to_history(self, entry: Dict[str, Any]) -> None:
        """Add entry to command history, capped at 100 entries."""
        with self._lock:
            self.debug_state.command_history.append(entry)
            if len(self.debug_state.command_history) > 100:
                self.debug_state.command_history = self.debug_state.command_history[-100:]
    def add_error(self, error: Dict[str, Any]) -> None:
        """Add entry to error log, capped at 100 entries."""
        with self._lock:
            self.debug_state.error_log.append(error)
            if len(self.debug_state.error_log) > 100:
                self.debug_state.error_log = self.debug_state.error_log[-100:]
    def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
        """Update last voice input info (timestamp is time.monotonic())."""
        with self._lock:
            self.debug_state.last_voice_input = text
            self.debug_state.last_voice_input_at = timestamp
            self.debug_state.last_parse_result = parse_result
    def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
        """Update last robot command info."""
        with self._lock:
            self.debug_state.last_robot_command = command
            self.debug_state.last_robot_command_at = timestamp
    def update_image(self, image_bytes: bytes, timestamp: float) -> None:
        """Update latest camera image (JPEG-encoded bytes)."""
        with self._lock:
            self.debug_state.latest_image = image_bytes
            self.debug_state.latest_image_at = timestamp
    def get_image(self) -> Optional[bytes]:
        """Get latest camera image, or None before the first frame."""
        with self._lock:
            return self.debug_state.latest_image
    def get_image_age_ms(self) -> Optional[int]:
        """Get age of latest image in milliseconds."""
        with self._lock:
            return _age_ms(self.debug_state.latest_image_at)
def _age_ms(timestamp: Optional[float]) -> Optional[int]:
"""Calculate age in milliseconds from monotonic timestamp."""
import time
if timestamp is None:
return None
return int((time.monotonic() - timestamp) * 1000)

View File

@@ -0,0 +1,700 @@
"""HTML templates for the voice control web interface."""
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Control Debug</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Segoe UI', system-ui, sans-serif;
background: #1a1a2e;
color: #eee;
min-height: 100vh;
padding: 20px;
}
.header {
text-align: center;
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px solid #333;
}
.header h1 { color: #00d4ff; font-size: 1.5em; }
.header .status {
margin-top: 8px;
font-size: 0.9em;
}
.status-dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 50%;
margin-right: 6px;
}
.status-dot.ok { background: #00ff88; }
.status-dot.warn { background: #ffaa00; }
.status-dot.error { background: #ff4444; }
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
gap: 15px;
max-width: 1400px;
margin: 0 auto;
}
.card {
background: #16213e;
border-radius: 8px;
padding: 15px;
border: 1px solid #0f3460;
}
.card h2 {
color: #00d4ff;
font-size: 1em;
margin-bottom: 12px;
padding-bottom: 8px;
border-bottom: 1px solid #0f3460;
}
/* Command Input */
.command-form {
display: flex;
gap: 10px;
margin-bottom: 15px;
}
.command-form input {
flex: 1;
padding: 10px 12px;
border: 1px solid #0f3460;
border-radius: 6px;
background: #1a1a2e;
color: #fff;
font-size: 14px;
}
.command-form input:focus {
outline: none;
border-color: #00d4ff;
}
.btn {
padding: 10px 20px;
border: none;
border-radius: 6px;
cursor: pointer;
font-weight: 500;
transition: all 0.2s;
}
.btn-primary {
background: #00d4ff;
color: #000;
}
.btn-primary:hover { background: #00b8e0; }
.btn-primary:disabled { background: #555; color: #888; cursor: not-allowed; }
.btn-secondary {
background: #333;
color: #fff;
}
.btn-secondary:hover { background: #444; }
.btn-danger {
background: #ff4444;
color: #fff;
}
.btn-danger:hover { background: #cc3333; }
.btn-success {
background: #00ff88;
color: #000;
}
.btn-success:hover { background: #00cc6a; }
/* Quick Commands */
.quick-commands {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.quick-btn {
padding: 8px 14px;
background: #0f3460;
border: 1px solid #1a4a7a;
border-radius: 20px;
color: #00d4ff;
cursor: pointer;
font-size: 13px;
transition: all 0.2s;
}
.quick-btn:hover {
background: #1a4a7a;
border-color: #00d4ff;
}
/* Status Grid */
.status-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 10px;
}
.status-item {
background: #1a1a2e;
padding: 10px;
border-radius: 6px;
}
.status-item .label {
font-size: 11px;
color: #888;
text-transform: uppercase;
margin-bottom: 4px;
}
.status-item .value {
font-size: 14px;
font-weight: 500;
}
.status-item .value.ok { color: #00ff88; }
.status-item .value.warn { color: #ffaa00; }
.status-item .value.error { color: #ff4444; }
/* Pose Display */
.pose-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 8px;
}
.pose-item {
background: #1a1a2e;
padding: 10px;
border-radius: 6px;
text-align: center;
}
.pose-item .label {
font-size: 11px;
color: #888;
margin-bottom: 4px;
}
.pose-item .value {
font-size: 16px;
font-weight: 600;
color: #00d4ff;
font-family: monospace;
}
/* Objects List */
.objects-list {
max-height: 300px;
overflow-y: auto;
}
.object-item {
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 10px;
background: #1a1a2e;
border-radius: 6px;
margin-bottom: 6px;
font-size: 13px;
}
.object-item .type { color: #00d4ff; font-weight: 500; }
.object-item .color-badge {
padding: 2px 8px;
border-radius: 10px;
font-size: 11px;
}
.color-badge.red { background: #ff4444; color: #fff; }
.color-badge.blue { background: #4488ff; color: #fff; }
.color-badge.yellow { background: #ffcc00; color: #000; }
.color-badge.white { background: #fff; color: #000; }
.object-item .pos {
font-family: monospace;
font-size: 11px;
color: #888;
}
/* Queue Display */
.queue-list {
max-height: 150px;
overflow-y: auto;
}
.queue-item {
display: flex;
align-items: center;
gap: 10px;
padding: 8px 10px;
background: #1a1a2e;
border-radius: 6px;
margin-bottom: 6px;
font-size: 13px;
}
.queue-item .index {
background: #0f3460;
color: #00d4ff;
padding: 2px 8px;
border-radius: 4px;
font-size: 11px;
}
.queue-item .action { color: #00ff88; font-weight: 500; }
.queue-item.pending { border-left: 3px solid #ffaa00; }
/* Log Display */
.log {
max-height: 250px;
overflow-y: auto;
font-family: monospace;
font-size: 12px;
}
.log-entry {
padding: 6px 10px;
border-bottom: 1px solid #0f3460;
}
.log-entry:last-child { border-bottom: none; }
.log-entry .time {
color: #666;
margin-right: 10px;
}
.log-entry.error { color: #ff4444; }
.log-entry.success { color: #00ff88; }
.log-entry.info { color: #00d4ff; }
/* Parse Result */
.parse-result {
background: #1a1a2e;
padding: 12px;
border-radius: 6px;
font-family: monospace;
font-size: 13px;
}
.parse-result .field {
display: flex;
justify-content: space-between;
padding: 4px 0;
border-bottom: 1px solid #0f3460;
}
.parse-result .field:last-child { border-bottom: none; }
.parse-result .key { color: #888; }
.parse-result .val { color: #00d4ff; }
/* Empty State */
.empty {
text-align: center;
color: #666;
padding: 20px;
font-style: italic;
}
/* Camera View */
.camera-container {
position: relative;
background: #000;
border-radius: 6px;
overflow: hidden;
min-height: 240px;
}
.camera-container img {
width: 100%;
height: auto;
display: block;
}
.camera-overlay {
position: absolute;
top: 10px;
right: 10px;
background: rgba(0,0,0,0.6);
padding: 4px 8px;
border-radius: 4px;
font-size: 11px;
}
.camera-overlay.ok { color: #00ff88; }
.camera-overlay.stale { color: #ffaa00; }
.camera-overlay.error { color: #ff4444; }
.no-image {
display: flex;
align-items: center;
justify-content: center;
height: 240px;
color: #666;
font-style: italic;
}
</style>
</head>
<body>
<div class="header">
<h1>Voice Control Debug Interface</h1>
<div class="status">
<span class="status-dot" id="status-dot"></span>
<span id="status-text">Connecting...</span>
</div>
</div>
<div class="grid">
<!-- Camera View -->
<div class="card" style="grid-column: span 2;">
<h2>Camera View <span id="camera-status" style="font-weight:normal;font-size:11px;color:#888;"></span></h2>
<div class="camera-container" id="camera-container">
<div class="no-image" id="no-image">No camera image available</div>
<img id="camera-img" style="display:none;" alt="Camera feed">
<div class="camera-overlay" id="camera-overlay"></div>
</div>
</div>
<!-- Command Input -->
<div class="card">
<h2>Send Command</h2>
<form class="command-form" id="command-form">
<input type="text" id="command-input" placeholder="Enter voice command (e.g., 'sube', 'agarra el cubo rojo')" autocomplete="off">
<button type="submit" class="btn btn-primary" id="btn-send">Send</button>
</form>
<div class="quick-commands">
<button class="quick-btn" onclick="sendQuick('sube')">Sube</button>
<button class="quick-btn" onclick="sendQuick('baja')">Baja</button>
<button class="quick-btn" onclick="sendQuick('agarra el cubo rojo')">Cubo Rojo</button>
<button class="quick-btn" onclick="sendQuick('agarra el cubo azul')">Cubo Azul</button>
<button class="quick-btn" onclick="sendQuick('suelta')">Soltar</button>
<button class="quick-btn" onclick="sendQuick('reinicia')">Reiniciar</button>
</div>
</div>
<!-- Status -->
<div class="card">
<h2>System Status</h2>
<div class="status-grid">
<div class="status-item">
<div class="label">Pose Available</div>
<div class="value" id="st-pose">--</div>
</div>
<div class="status-item">
<div class="label">Pose Age</div>
<div class="value" id="st-pose-age">--</div>
</div>
<div class="status-item">
<div class="label">Objects Detected</div>
<div class="value" id="st-objects">--</div>
</div>
<div class="status-item">
<div class="label">Static Objects</div>
<div class="value" id="st-static">--</div>
</div>
<div class="status-item">
<div class="label">Queue Size</div>
<div class="value" id="st-queue">--</div>
</div>
<div class="status-item">
<div class="label">Pending Command</div>
<div class="value" id="st-pending">--</div>
</div>
</div>
</div>
<!-- TCP Pose -->
<div class="card">
<h2>TCP Pose</h2>
<div class="pose-grid">
<div class="pose-item">
<div class="label">X (mm)</div>
<div class="value" id="pose-x">--</div>
</div>
<div class="pose-item">
<div class="label">Y (mm)</div>
<div class="value" id="pose-y">--</div>
</div>
<div class="pose-item">
<div class="label">Z (mm)</div>
<div class="value" id="pose-z">--</div>
</div>
<div class="pose-item">
<div class="label">Roll</div>
<div class="value" id="pose-roll">--</div>
</div>
<div class="pose-item">
<div class="label">Pitch</div>
<div class="value" id="pose-pitch">--</div>
</div>
<div class="pose-item">
<div class="label">Yaw</div>
<div class="value" id="pose-yaw">--</div>
</div>
</div>
</div>
<!-- Last Parse Result -->
<div class="card">
<h2>Last Parse Result</h2>
<div class="parse-result" id="parse-result">
<div class="empty">No command parsed yet</div>
</div>
<div style="margin-top: 10px; font-size: 12px; color: #888;">
<span>Last input: </span><span id="last-input">--</span>
</div>
</div>
<!-- Detected Objects -->
<div class="card">
<h2>Detected Objects</h2>
<div class="objects-list" id="objects-list">
<div class="empty">No objects detected</div>
</div>
</div>
<!-- Command Queue -->
<div class="card">
<h2>Command Queue</h2>
<div class="queue-list" id="queue-list">
<div class="empty">Queue is empty</div>
</div>
<div style="margin-top: 10px;">
<button class="btn btn-danger btn-sm" onclick="clearQueue()">Clear Queue</button>
</div>
</div>
<!-- Log -->
<div class="card" style="grid-column: span 2;">
<h2>Activity Log</h2>
<div class="log" id="log"></div>
</div>
</div>
<script>
const $ = id => document.getElementById(id);
async function fetchJson(url, opts = {}) {
try {
const res = await fetch(url, opts);
return await res.json();
} catch (e) {
return { error: e.message };
}
}
function log(msg, type = 'info') {
const logEl = $('log');
const time = new Date().toLocaleTimeString();
const entry = document.createElement('div');
entry.className = 'log-entry ' + type;
entry.innerHTML = '<span class="time">' + time + '</span>' + msg;
logEl.insertBefore(entry, logEl.firstChild);
if (logEl.children.length > 100) logEl.removeChild(logEl.lastChild);
}
async function updateStatus() {
const data = await fetchJson('/api/status');
if (data.error) {
$('status-dot').className = 'status-dot error';
$('status-text').textContent = 'Error: ' + data.error;
return;
}
$('status-dot').className = 'status-dot ok';
$('status-text').textContent = 'Connected';
$('st-pose').textContent = data.has_pose ? 'Yes' : 'No';
$('st-pose').className = 'value ' + (data.has_pose ? 'ok' : 'warn');
$('st-pose-age').textContent = data.pose_age_ms !== null ? data.pose_age_ms + 'ms' : '--';
$('st-pose-age').className = 'value ' + (data.pose_age_ms < 1000 ? 'ok' : 'warn');
$('st-objects').textContent = data.object_count;
$('st-static').textContent = data.static_object_count;
$('st-queue').textContent = data.queue_size;
$('st-queue').className = 'value ' + (data.queue_size > 0 ? 'warn' : 'ok');
$('st-pending').textContent = data.has_pending_command ? 'Yes' : 'No';
$('st-pending').className = 'value ' + (data.has_pending_command ? 'warn' : 'ok');
// Update pose
if (data.pose) {
$('pose-x').textContent = data.pose[0].toFixed(1);
$('pose-y').textContent = data.pose[1].toFixed(1);
$('pose-z').textContent = data.pose[2].toFixed(1);
$('pose-roll').textContent = data.pose[3].toFixed(1);
$('pose-pitch').textContent = data.pose[4].toFixed(1);
$('pose-yaw').textContent = data.pose[5].toFixed(1);
}
// Update last input
$('last-input').textContent = data.last_voice_input || '--';
// Update parse result
if (data.last_parse_result) {
let html = '';
for (const [k, v] of Object.entries(data.last_parse_result)) {
html += '<div class="field"><span class="key">' + k + '</span><span class="val">' + v + '</span></div>';
}
$('parse-result').innerHTML = html;
}
}
async function updateObjects() {
const data = await fetchJson('/api/objects');
if (data.error) return;
const list = $('objects-list');
const detected = data.detected || [];
const staticObjs = data.static || [];
if (detected.length === 0 && staticObjs.length === 0) {
list.innerHTML = '<div class="empty">No objects detected</div>';
return;
}
let html = '';
// Detected objects
if (detected.length > 0) {
html += '<div style="font-size:11px;color:#00d4ff;margin-bottom:6px;">Detected (' + detected.length + ')</div>';
html += detected.map(obj => {
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
const colorClass = obj.color || 'white';
const conf = obj.confidence ? (obj.confidence * 100).toFixed(0) + '%' : '';
const size = obj.size || '';
return '<div class="object-item">' +
'<span class="type">' + (obj.object_type || '?') + '</span>' +
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
'<span style="color:#888;font-size:10px;">' + size + '</span>' +
'<span style="color:#00ff88;font-size:10px;">' + conf + '</span>' +
'<span class="pos">[' + pos + ']</span>' +
'</div>';
}).join('');
}
// Static objects
if (staticObjs.length > 0) {
html += '<div style="font-size:11px;color:#888;margin:8px 0 6px 0;">Static (' + staticObjs.length + ')</div>';
html += staticObjs.map(obj => {
const pos = obj.position_mm ? obj.position_mm.map(v => v.toFixed(0)).join(', ') : '--';
const colorClass = obj.color || 'white';
return '<div class="object-item" style="opacity:0.7;">' +
'<span class="type">' + (obj.object_type || '?') + '</span>' +
'<span class="color-badge ' + colorClass + '">' + (obj.color || '?') + '</span>' +
'<span class="pos">[' + pos + ']</span>' +
'</div>';
}).join('');
}
list.innerHTML = html;
}
async function updateQueue() {
const data = await fetchJson('/api/queue');
if (data.error) return;
const list = $('queue-list');
if (!data.length) {
list.innerHTML = '<div class="empty">Queue is empty</div>';
return;
}
list.innerHTML = data.map((item, i) => {
const payload = JSON.stringify(item.payload || {});
return '<div class="queue-item">' +
'<span class="index">' + (i + 1) + '</span>' +
'<span class="action">' + item.action + '</span>' +
'<span style="color:#888;font-size:11px">' + payload + '</span>' +
'</div>';
}).join('');
}
async function sendCommand(text) {
if (!text.trim()) return;
$('btn-send').disabled = true;
log('Sending: "' + text + '"', 'info');
const res = await fetchJson('/api/command', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text: text })
});
$('btn-send').disabled = false;
if (res.ok) {
log('Response: ' + res.text, 'success');
} else {
log('Error: ' + (res.text || res.detail || 'Unknown error'), 'error');
}
$('command-input').value = '';
updateStatus();
updateQueue();
}
function sendQuick(text) {
$('command-input').value = text;
sendCommand(text);
}
async function clearQueue() {
log('Clearing queue...', 'info');
const res = await fetchJson('/api/queue/clear', { method: 'POST' });
if (res.ok) {
log('Queue cleared', 'success');
} else {
log('Failed to clear queue', 'error');
}
updateQueue();
}
// Form submit
$('command-form').addEventListener('submit', e => {
e.preventDefault();
sendCommand($('command-input').value);
});
// Camera update
let cameraErrorCount = 0;
async function updateCamera() {
const info = await fetchJson('/api/image/info');
const overlay = $('camera-overlay');
const img = $('camera-img');
const noImage = $('no-image');
const status = $('camera-status');
if (info.error || !info.has_image) {
cameraErrorCount++;
if (cameraErrorCount > 3) {
img.style.display = 'none';
noImage.style.display = 'flex';
overlay.textContent = '';
status.textContent = '(no feed)';
}
return;
}
cameraErrorCount = 0;
noImage.style.display = 'none';
img.style.display = 'block';
// Update image with cache-busting
const newSrc = '/api/image?t=' + Date.now();
if (img.src !== newSrc) {
img.src = newSrc;
}
// Update overlay
const ageMs = info.age_ms || 0;
if (ageMs < 500) {
overlay.textContent = 'LIVE';
overlay.className = 'camera-overlay ok';
} else if (ageMs < 2000) {
overlay.textContent = ageMs + 'ms';
overlay.className = 'camera-overlay stale';
} else {
overlay.textContent = 'STALE ' + (ageMs/1000).toFixed(1) + 's';
overlay.className = 'camera-overlay error';
}
status.textContent = '';
}
// Auto-refresh
setInterval(updateStatus, 500);
setInterval(updateObjects, 1000);
setInterval(updateQueue, 500);
setInterval(updateCamera, 100);
// Initial load
updateStatus();
updateObjects();
updateQueue();
updateCamera();
log('Interface loaded', 'info');
</script>
</body>
</html>
"""

View File

@@ -0,0 +1,25 @@
[project]
name = "dora-voice-control"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for voice command control via WebSocket"
requires-python = ">=3.8"
dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 12.0.0",
"websockets >= 12.0",
"fastapi >= 0.109.0",
"uvicorn >= 0.27.0",
"pydantic >= 2.0.0",
"opencv-python >= 4.8.0",
]
[project.optional-dependencies]
llm = ["google-genai"]
[project.scripts]
dora-voice-control = "dora_voice_control.main:main"

BIN
trained_models/yolo8n.pt Normal file

Binary file not shown.