diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..56de483 --- /dev/null +++ b/config.toml @@ -0,0 +1,22 @@ +[roi] +roi_top_left = [500, 230] +roi_bottom_right = [775, 510] + +[bucket_positions] +blue_bucket_pos = [400, 90] +red_bucket_pos = [550, 90] +yellow_bucket_pos = [550, 630] +white_bucket_pos = [400, 630] + +[dominant_colors] +blue = [255, 0, 0] +red = [0, 0, 255] +yellow = [0, 255, 255] +white = [255, 255, 255] + +[object_parameters] +size_threshold = 4200 +big_height = 125.9 +small_height = 106.0 +bottom_height = 68.0 +normal_height = 220.0 diff --git a/dataflow_voice_control_ulite6_zed.yml b/dataflow_voice_control_ulite6_zed.yml new file mode 100644 index 0000000..a5e89f6 --- /dev/null +++ b/dataflow_voice_control_ulite6_zed.yml @@ -0,0 +1,133 @@ +nodes: + - id: zed_camera_cpp + build: bash -lc "cmake -S dora_zed_cpp -B dora_zed_cpp/build && cmake --build dora_zed_cpp/build" + path: dora_zed_cpp/build/dora_zed_cpp + env: + ZED_RESOLUTION: "720" + ZED_FPS: "15" + ZED_DEPTH_MODE: "NEURAL" + ZED_DEPTH_MIN_MM: "10" + ZED_DEPTH_MAX_MM: "600" + ZED_DEPTH_FILL: "false" + ZED_FLIP: "ON" + ZED_WARMUP_FRAMES: "30" + inputs: + tick: dora/timer/millis/100 + outputs: + - image_bgr + - camera_info + - point_cloud + - id: ulite6 + build: uv pip install -e dora_ulite6 + path: dora_ulite6/dora_ulite6/main.py + inputs: + tick: dora/timer/millis/10 + command: voice/robot_cmd + outputs: + - tcp_pose + - status + env: + ROBOT_IP: "192.168.1.192" + DEFAULT_SPEED: "30" + DEFAULT_UNITS: "mm" + API_HOST: "0.0.0.0" + API_PORT: "9000" + VACUUM_ENABLED: "true" + # Initial position on startup: "home", "pose", or "none" + # Set to "none" - voice control handles initial positioning + INIT_MODE: "none" + - id: iobridge + build: | + uv venv -p 3.12 --seed --allow-existing + uv pip install -e dora_iobridge + path: dora_iobridge/dora_iobridge/main.py + env: + VIRTUAL_ENV: ./.venv + VOICE_HOST: "0.0.0.0" + VOICE_PORT: "8765" + VOICE_IN_OUTPUT: "voice_in" + 
VOICE_OUT_INPUT: "voice_out" + SCENE_INPUT: "scene_update" + inputs: + voice_out: voice/voice_out + scene_update: voice/scene_update + tick: dora/timer/millis/100 + outputs: + - voice_in + - id: detector + build: | + uv venv -p 3.12 --seed --allow-existing + uv pip install -e dora_detector + path: dora_detector/dora_detector/main.py + env: + VIRTUAL_ENV: ./.venv + IMAGE_INPUT: "image_bgr" + POINT_CLOUD_INPUT: "point_cloud" + POSE_INPUT: "tcp_pose" + OBJECTS_OUTPUT: "objects" + IMAGE_OUTPUT: "image_annotated" + CALIBRATION_FILE: "calibration_ulite6_zed.npz" + DETECTOR_WEIGHTS: "trained_models/yolo8n.pt" + CONFIG_FILE: "config.toml" + ROI_TOP_LEFT: "500,230" + ROI_BOTTOM_RIGHT: "775,510" + SIZE_THRESHOLD: "4200" + DETECT_EVERY_N: "3" + MIN_DEPTH_MM: "10" + MAX_DEPTH_MM: "600" + inputs: + image_bgr: zed_camera_cpp/image_bgr + point_cloud: zed_camera_cpp/point_cloud + tcp_pose: ulite6/tcp_pose + tick: dora/timer/millis/100 + outputs: + - objects + - image_annotated + - id: voice + build: | + uv venv -p 3.12 --seed --allow-existing + uv pip install -e dora_voice_control + path: dora_voice_control/dora_voice_control/main.py + env: + VIRTUAL_ENV: ./.venv + OBJECTS_INPUT: "objects" + POSE_INPUT: "tcp_pose" + STATUS_INPUT: "status" + COMMAND_OUTPUT: "robot_cmd" + CONFIG_FILE: "config.toml" + # Map Spanish command names to detector class names + CLASS_MAP: '{"cilindro": "cylinder", "cubo": "cube", "estrella": "star", "caja": "box", "amarillo": "yellow", "rojo": "red", "azul": "blue", "blanco": "white", "grande": "big", "pequeno": "small"}' + VOICE_IN_INPUT: "voice_in" + VOICE_OUT_OUTPUT: "voice_out" + SCENE_OUTPUT: "scene_update" + TCP_OFFSET_MM: "63.0" + APPROACH_OFFSET_MM: "50.0" + STEP_MM: "20.0" + DEFAULT_ROLL: "180.0" + DEFAULT_PITCH: "0.0" + DEFAULT_YAW: "0.0" + DRY_RUN: "false" + # Initial position (used on startup and reset command) + INIT_ON_START: "true" + INIT_X: "300.0" + INIT_Y: "0.0" + INIT_Z: "350.0" + INIT_ROLL: "180.0" + INIT_PITCH: "0.0" + INIT_YAW: "0.0" + 
IMAGE_INPUT: "image_annotated" + IMAGE_WIDTH: "1280" + IMAGE_HEIGHT: "720" + API_ENABLED: "true" + API_PORT: "8080" + inputs: + objects: detector/objects + tcp_pose: ulite6/tcp_pose + status: ulite6/status + voice_in: iobridge/voice_in + image_annotated: detector/image_annotated + tick: dora/timer/millis/100 + outputs: + - robot_cmd + - voice_out + - scene_update diff --git a/dora_detector/dora_detector/__init__.py b/dora_detector/dora_detector/__init__.py new file mode 100644 index 0000000..c107eaf --- /dev/null +++ b/dora_detector/dora_detector/__init__.py @@ -0,0 +1 @@ +"""Dora detector node package.""" diff --git a/dora_detector/dora_detector/main.py b/dora_detector/dora_detector/main.py new file mode 100644 index 0000000..717fb8d --- /dev/null +++ b/dora_detector/dora_detector/main.py @@ -0,0 +1,513 @@ +"""Dora node for YOLO detection and base-frame object localization.""" + +from __future__ import annotations + +import json +import os +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import cv2 +import numpy as np +import pyarrow as pa +from dora import Node +from ultralytics import YOLO + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover + import tomli as tomllib + + +DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt") + + +@dataclass +class DetectionConfig: + imgsz: int + conf: float + iou: float + size_threshold: int + roi_top_left: Tuple[int, int] + roi_bottom_right: Tuple[int, int] + use_roi: bool + detect_every_n: int + min_depth_mm: float + max_depth_mm: float + color_blue: Tuple[int, int, int] + color_red: Tuple[int, int, int] + color_yellow: Tuple[int, int, int] + color_white: Tuple[int, int, int] + + +def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]: + try: + parts = [p.strip() for p in raw.split(",")] + if len(parts) >= 2: + return int(parts[0]), int(parts[1]) + except Exception: + pass + return default + + +def 
_parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]: + try: + parts = [p.strip() for p in raw.split(",")] + if len(parts) >= 2: + return float(parts[0]), float(parts[1]) + except Exception: + pass + return default + + +def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]: + try: + parts = [p.strip() for p in raw.split(",")] + if len(parts) >= 3: + return int(parts[0]), int(parts[1]), int(parts[2]) + except Exception: + pass + return default + + +def _rotation_matrix_xyz( + roll_deg: float, pitch_deg: float, yaw_deg: float +) -> np.ndarray: + roll = np.deg2rad(roll_deg) + pitch = np.deg2rad(pitch_deg) + yaw = np.deg2rad(yaw_deg) + cx, sx = np.cos(roll), np.sin(roll) + cy, sy = np.cos(pitch), np.sin(pitch) + cz, sz = np.cos(yaw), np.sin(yaw) + rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]]) + rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]]) + rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]]) + return rot_z @ rot_y @ rot_x + + +def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray: + x, y, z, roll, pitch, yaw = tcp_pose_mm_deg + rot = _rotation_matrix_xyz(roll, pitch, yaw) + mat = np.eye(4) + mat[:3, :3] = rot + mat[:3, 3] = np.array([x, y, z], dtype=np.float64) / 1000.0 + return mat + + +def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray: + encoding = str(metadata.get("encoding", "bgr8")).lower() + width = metadata.get("width") + height = metadata.get("height") + if (width is None or height is None) and "shape" in metadata: + shape = metadata.get("shape") + if isinstance(shape, (list, tuple)) and len(shape) >= 2: + height = height if height is not None else int(shape[0]) + width = width if width is not None else int(shape[1]) + if width is None or height is None: + raise KeyError("width/height (or shape) missing from metadata") + + if encoding == "bgr8": + frame = storage.to_numpy().astype(np.uint8).reshape((height, 
width, 3)) + return frame.copy() + if encoding == "rgb8": + frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3)) + return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}: + frame = storage.to_numpy().astype(np.uint8) + return cv2.imdecode(frame, cv2.IMREAD_COLOR) + if encoding == "yuv420": + yuv = storage.to_numpy().astype(np.uint8) + yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width)) + return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR) + + raise RuntimeError(f"Unsupported image encoding: {encoding}") + + +def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray: + dtype_str = str(metadata.get("dtype", "float32")) + if dtype_str != "float32": + raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}") + shape = metadata.get("shape") + if not isinstance(shape, (list, tuple)) or len(shape) < 3: + raise KeyError("point cloud shape missing from metadata") + height, width, channels = [int(v) for v in shape[:3]] + if channels < 3: + raise ValueError("point cloud requires at least 3 channels") + raw = storage.to_numpy().astype(np.uint8).tobytes() + values = np.frombuffer(raw, dtype=np.float32) + return values.reshape((height, width, channels)) + + +def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool: + if not np.all(np.isfinite(point_xyz)): + return False + z = float(point_xyz[2]) + if z <= 0: + return False + if z < cfg.min_depth_mm or z > cfg.max_depth_mm: + return False + return True + + +def _sample_point( + point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig +) -> Optional[np.ndarray]: + h, w, _ = point_cloud.shape + if x < 0 or y < 0 or x >= w or y >= h: + return None + point_xyz = point_cloud[y, x, :3].astype(np.float64) + if _valid_point(point_xyz, cfg): + return point_xyz + radius = 3 + samples = [] + for dy in range(-radius, radius + 1): + for dx in range(-radius, radius + 1): + xx = x + dx + yy = y + dy + if xx < 
0 or yy < 0 or xx >= w or yy >= h: + continue + p = point_cloud[yy, xx, :3].astype(np.float64) + if _valid_point(p, cfg): + samples.append(p) + if not samples: + return None + return np.median(np.stack(samples, axis=0), axis=0) + + +def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]: + x1, y1, x2, y2 = bbox + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(image.shape[1], x2) + y2 = min(image.shape[0], y2) + roi = image[y1:y2, x1:x2] + if roi.size == 0: + return (0, 0, 0) + color = np.median(roi, axis=(0, 1)).astype(int) + return int(color[0]), int(color[1]), int(color[2]) + + +def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str: + colors = { + "blue": np.array(cfg.color_blue, dtype=np.float64), + "red": np.array(cfg.color_red, dtype=np.float64), + "yellow": np.array(cfg.color_yellow, dtype=np.float64), + "white": np.array(cfg.color_white, dtype=np.float64), + } + color_vec = np.array(color, dtype=np.float64) + best_name = "unknown" + best_dist = float("inf") + for name, value in colors.items(): + dist = np.linalg.norm(color_vec - value) + if dist < best_dist: + best_name = name + best_dist = dist + return best_name + + +def _load_calibration(calibration_file: str) -> np.ndarray: + calib = np.load(calibration_file, allow_pickle=True) + t_cam2gripper = calib["T_cam2gripper"] + return t_cam2gripper + + +def _load_config_file(path: str) -> Dict[str, Any]: + if not path or not os.path.exists(path): + return {} + try: + with open(path, "rb") as handle: + return tomllib.load(handle) + except Exception: + return {} + + +def _build_config(config_path: str) -> DetectionConfig: + cfg_data = _load_config_file(config_path) + roi_cfg = cfg_data.get("roi", {}) + colors_cfg = cfg_data.get("dominant_colors", {}) + obj_cfg = cfg_data.get("object_parameters", {}) + + imgsz = int(os.getenv("YOLO_IMGSZ", "640")) + conf = float(os.getenv("YOLO_CONF", "0.25")) + iou = float(os.getenv("YOLO_IOU", "0.45")) + size_threshold = int( + 
os.getenv("SIZE_THRESHOLD", str(obj_cfg.get("size_threshold", 4200))) + ) + roi_top_left = _parse_int_pair( + os.getenv( + "ROI_TOP_LEFT", + ",".join([str(v) for v in roi_cfg.get("roi_top_left", [500, 230])]), + ), + (500, 230), + ) + roi_bottom_right = _parse_int_pair( + os.getenv( + "ROI_BOTTOM_RIGHT", + ",".join([str(v) for v in roi_cfg.get("roi_bottom_right", [775, 510])]), + ), + (775, 510), + ) + use_roi = os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes") + detect_every_n = int(os.getenv("DETECT_EVERY_N", "3")) + min_depth_mm = float(os.getenv("MIN_DEPTH_MM", "10")) + max_depth_mm = float(os.getenv("MAX_DEPTH_MM", "600")) + color_blue = _parse_color( + os.getenv( + "COLOR_BLUE", + ",".join([str(v) for v in colors_cfg.get("blue", [255, 0, 0])]), + ), + (255, 0, 0), + ) + color_red = _parse_color( + os.getenv( + "COLOR_RED", + ",".join([str(v) for v in colors_cfg.get("red", [0, 0, 255])]), + ), + (0, 0, 255), + ) + color_yellow = _parse_color( + os.getenv( + "COLOR_YELLOW", + ",".join([str(v) for v in colors_cfg.get("yellow", [0, 255, 255])]), + ), + (0, 255, 255), + ) + color_white = _parse_color( + os.getenv( + "COLOR_WHITE", + ",".join([str(v) for v in colors_cfg.get("white", [255, 255, 255])]), + ), + (255, 255, 255), + ) + return DetectionConfig( + imgsz=imgsz, + conf=conf, + iou=iou, + size_threshold=size_threshold, + roi_top_left=roi_top_left, + roi_bottom_right=roi_bottom_right, + use_roi=use_roi, + detect_every_n=detect_every_n, + min_depth_mm=min_depth_mm, + max_depth_mm=max_depth_mm, + color_blue=color_blue, + color_red=color_red, + color_yellow=color_yellow, + color_white=color_white, + ) + + +def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool: + if not cfg.use_roi: + return True + x1, y1, x2, y2 = bbox + rx1, ry1 = cfg.roi_top_left + rx2, ry2 = cfg.roi_bottom_right + return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2 + + +def _draw_detections( + frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig +) 
-> np.ndarray: + """Draw bounding boxes and labels on frame.""" + annotated = frame.copy() + + # Draw ROI rectangle (always visible) + cv2.rectangle( + annotated, + cfg.roi_top_left, + cfg.roi_bottom_right, + (0, 255, 0) if cfg.use_roi else (128, 128, 128), + 2, + ) + # Label the ROI + cv2.putText( + annotated, + "ROI", + (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 0) if cfg.use_roi else (128, 128, 128), + 2, + ) + + # Color mapping for visualization + color_map = { + "blue": (255, 100, 0), + "red": (0, 0, 255), + "yellow": (0, 255, 255), + "white": (200, 200, 200), + "unknown": (128, 128, 128), + } + + for obj in objects: + bbox = obj.get("bbox", [0, 0, 0, 0]) + color_name = obj.get("color", "unknown") + obj_type = obj.get("object_type", "?") + size = obj.get("size", "?") + pos = obj.get("position_mm", [0, 0, 0]) + + color = color_map.get(color_name, (128, 128, 128)) + + # Draw bounding box + cv2.rectangle(annotated, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2) + + # Draw label background + label = f"{obj_type} {color_name} {size}" + (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + cv2.rectangle( + annotated, + (bbox[0], bbox[1] - th - 8), + (bbox[0] + tw + 4, bbox[1]), + color, + -1, + ) + cv2.putText( + annotated, + label, + (bbox[0] + 2, bbox[1] - 4), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 0), + 1, + ) + + # Draw position + pos_label = f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]" + cv2.putText( + annotated, + pos_label, + (bbox[0], bbox[3] + 15), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + color, + 1, + ) + + return annotated + + +def main() -> None: + image_input = os.getenv("IMAGE_INPUT", "image_bgr") + point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud") + pose_input = os.getenv("POSE_INPUT", "tcp_pose") + objects_output = os.getenv("OBJECTS_OUTPUT", "objects") + image_output = os.getenv("IMAGE_OUTPUT", "image_annotated") + calibration_file = 
os.getenv("CALIBRATION_FILE", "calibration.npz") + weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS) + config_file = os.getenv("CONFIG_FILE", "config.toml") + + cfg = _build_config(config_file) + model = YOLO(weights) + + if calibration_file and not os.path.isabs(calibration_file): + config_path = os.path.join("config", calibration_file) + calibration_file = config_path if os.path.exists(config_path) else calibration_file + t_cam2gripper = _load_calibration(calibration_file) + + node = Node() + latest_pose: Optional[List[float]] = None + latest_pose_at: Optional[float] = None + latest_point_cloud: Optional[np.ndarray] = None + latest_pc_at: Optional[float] = None + frame_count = 0 + + for event in node: + if event["type"] != "INPUT": + continue + + now = time.monotonic() + if event["id"] == pose_input: + tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1) + if tcp_pose.size >= 6: + latest_pose = tcp_pose[:6].tolist() + latest_pose_at = now + continue + + if event["id"] == point_cloud_input: + latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {})) + latest_pc_at = now + continue + + if event["id"] != image_input: + continue + + frame_count += 1 + if frame_count % max(1, cfg.detect_every_n) != 0: + continue + + if latest_pose is None or latest_point_cloud is None: + continue + + frame = _decode_image(event["value"], event.get("metadata", {})) + + results = model.predict( + frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False + )[0] + + base_T_flange = _pose_to_matrix(latest_pose) + objects: List[Dict[str, Any]] = [] + for r in results.boxes: + bbox = [int(x) for x in r.xyxy[0]] + if not _within_roi(bbox, cfg): + continue + cx = int((bbox[0] + bbox[2]) / 2) + cy = int((bbox[1] + bbox[3]) / 2) + point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg) + if point_cam_mm is None: + continue + point_cam_m = np.array( + [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0], + dtype=np.float64, + ) + 
point_cam_m[:3] /= 1000.0 + point_base = base_T_flange @ t_cam2gripper @ point_cam_m + point_base_mm = point_base[:3] * 1000.0 + + dominant = _dominant_color(frame, bbox) + color_name = _closest_color(dominant, cfg) + area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) + size_label = "big" if area >= cfg.size_threshold else "small" + + objects.append( + { + "object_type": results.names[int(r.cls.item())], + "confidence": float(r.conf.item()), + "color": color_name, + "size": size_label, + "bbox": bbox, + "center_px": [cx, cy], + "position_mm": [ + float(point_base_mm[0]), + float(point_base_mm[1]), + float(point_base_mm[2]), + ], + "timestamp_ns": time.time_ns(), + } + ) + + payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()}) + node.send_output( + objects_output, + pa.array([payload]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + + # Send annotated image + annotated = _draw_detections(frame, objects, cfg) + h, w = annotated.shape[:2] + node.send_output( + image_output, + pa.array(annotated.ravel().tolist()), + metadata={ + "encoding": "bgr8", + "width": w, + "height": h, + "timestamp_ns": time.time_ns(), + }, + ) + + +if __name__ == "__main__": + main() diff --git a/dora_detector/pyproject.toml b/dora_detector/pyproject.toml new file mode 100644 index 0000000..15c1ce3 --- /dev/null +++ b/dora_detector/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "dora-detector" +version = "0.1.0" +license = { file = "MIT" } +authors = [{ name = "Dora" }] +description = "Dora node for YOLO-based object detection with ZED point cloud" + +requires-python = ">=3.8" + +dependencies = [ + "dora-rs >= 0.3.9", + "numpy < 2.0.0", + "opencv-python >= 4.1.1", + "pyarrow >= 12.0.0", + "ultralytics >= 8.0.0", +] + +[project.scripts] +dora-detector = "dora_detector.main:main" diff --git a/dora_iobridge/README.md b/dora_iobridge/README.md new file mode 100644 index 0000000..54d2c0e --- /dev/null +++ b/dora_iobridge/README.md @@ -0,0 
+1,178 @@ +# Dora IOBridge Node + +A WebSocket server that bridges web clients with the Dora dataflow for real-time voice commands and scene updates. + +## Inputs/Outputs + +| Input | Type | Description | +|----------------|--------|---------------------------------------| +| `voice_out` | JSON | Response from voice control node | +| `scene_update` | JSON | Scene objects from voice control | + +| Output | Type | Description | +|----------------|--------|---------------------------------------| +| `voice_in` | string | Voice commands forwarded to Dora | + +## Environment Variables + +```bash +VOICE_HOST=0.0.0.0 # Bind address +VOICE_PORT=8765 # Listen port +``` + +## Installation + +```bash +cd dora_iobridge +pip install -e . +``` + +## Testing + +### Test with WebSocket (wscat) + +```bash +# Install wscat +npm install -g wscat + +# Connect to the server +wscat -c ws://localhost:8765 +``` + +### Test with curl (websocat) + +```bash +# Install websocat +# Ubuntu: sudo apt install websocat +# macOS: brew install websocat + +# Send a ping +echo '{"type": "ping"}' | websocat ws://localhost:8765 +# Response: {"type": "pong"} + +# Send a voice command +echo '{"type": "command", "text": "sube"}' | websocat ws://localhost:8765 + +# Request scene refresh +echo '{"type": "scene_refresh"}' | websocat ws://localhost:8765 +``` + +### Test with Python + +```python +import asyncio +import websockets +import json + +async def test_iobridge(): + uri = "ws://localhost:8765" + async with websockets.connect(uri) as ws: + # Test ping + await ws.send(json.dumps({"type": "ping"})) + response = await ws.recv() + print(f"Ping response: {response}") + + # Send command + await ws.send(json.dumps({ + "type": "command", + "text": "agarra el cubo rojo" + })) + + # Listen for responses + async for message in ws: + data = json.loads(message) + print(f"Received: {data}") + +asyncio.run(test_iobridge()) +``` + +### Test with curl (HTTP upgrade not supported directly) + +Since WebSocket requires an 
upgrade handshake, use this shell script: + +```bash +#!/bin/bash +# test_iobridge.sh + +# Using websocat for interactive testing +websocat ws://localhost:8765 < Server + +**Command (voice input)** +```json +{"type": "command", "text": "agarra el cubo rojo"} +``` + +**Ping (health check)** +```json +{"type": "ping"} +``` +Response: `{"type": "pong"}` + +**Scene Refresh** +```json +{"type": "scene_refresh"} +``` + +### Server -> Client (Broadcasts) + +**Command Response** +```json +{ + "type": "response", + "text": "Ok, voy a tomar", + "status": "ok" +} +``` + +**Scene Update** +```json +{ + "type": "scene_updated", + "objects": [ + { + "object_type": "cubo", + "color": "rojo", + "size": "big", + "position_mm": [150.0, 200.0, 280.0], + "source": "detection" + } + ] +} +``` + +## Dora Dataflow Configuration + +```yaml +nodes: + - id: iobridge + build: pip install -e ./dora_iobridge + path: dora_iobridge + inputs: + voice_out: voice_control/voice_out + scene_update: voice_control/scene_update + outputs: + - voice_in + env: + VOICE_HOST: "0.0.0.0" + VOICE_PORT: "8765" +``` + +```bash +dora up +dora start dataflow.yml +``` + +## Dependencies + +- dora-rs >= 0.3.9 +- pyarrow >= 12.0.0 +- websockets >= 12.0 diff --git a/dora_iobridge/dora_iobridge/__init__.py b/dora_iobridge/dora_iobridge/__init__.py new file mode 100644 index 0000000..af84e3e --- /dev/null +++ b/dora_iobridge/dora_iobridge/__init__.py @@ -0,0 +1 @@ +"""Dora IO bridge node package.""" diff --git a/dora_iobridge/dora_iobridge/main.py b/dora_iobridge/dora_iobridge/main.py new file mode 100644 index 0000000..a074e0c --- /dev/null +++ b/dora_iobridge/dora_iobridge/main.py @@ -0,0 +1,145 @@ +"""Dora node bridging WebSocket IO to Dora topics.""" + +from __future__ import annotations + +import asyncio +import json +import os +import threading +import time +from typing import Any, Dict, Optional, Set + +import pyarrow as pa +from dora import Node +from websockets.server import serve, WebSocketServerProtocol + + 
class IoBridgeServer:
    """WebSocket server that routes client messages to registered handlers.

    ``command_handler`` and ``scene_refresh_handler`` may each be a plain
    callable or a coroutine function; results are awaited only when they are
    coroutines. This matters because ``main()`` registers a *synchronous*
    command handler: the previous ``await self.command_handler(text)`` raised
    ``TypeError: object NoneType can't be used in 'await' expression`` on
    every client command, killing the connection.
    """

    def __init__(self, host: str, port: int):
        self.host = host
        self.port = port
        # Currently connected clients, used for broadcast fan-out.
        self.clients: Set[WebSocketServerProtocol] = set()
        self.command_handler = None
        self.scene_refresh_handler = None

    @staticmethod
    async def _maybe_await(result):
        """Await *result* when it is a coroutine; otherwise return it unchanged."""
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def handler(self, websocket: WebSocketServerProtocol):
        """Per-connection loop: decode JSON messages, dispatch, reply when needed."""
        self.clients.add(websocket)
        try:
            async for message in websocket:
                try:
                    data = json.loads(message)
                except json.JSONDecodeError:
                    await websocket.send(
                        json.dumps({"type": "error", "text": "Invalid JSON message"})
                    )
                    continue
                response = await self._route_message(data, websocket)
                if response:
                    await websocket.send(json.dumps(response))
        finally:
            self.clients.discard(websocket)

    async def _route_message(
        self, data: Dict[str, Any], websocket: WebSocketServerProtocol
    ) -> Optional[Dict[str, Any]]:
        """Dispatch one decoded client message; return the reply dict (or None)."""
        msg_type = data.get("type")
        if msg_type == "command":
            text = data.get("text", "")
            if self.command_handler:
                # FIX: the registered handler may be synchronous; only await
                # the result when it is actually a coroutine.
                await self._maybe_await(self.command_handler(text))
                return None
            return {"type": "error", "text": "No command handler registered"}
        if msg_type == "ping":
            return {"type": "pong"}
        if msg_type == "scene_refresh":
            if self.scene_refresh_handler:
                objects = await self._maybe_await(self.scene_refresh_handler())
                return {"type": "scene_updated", "objects": objects}
            return {"type": "error", "text": "No scene handler registered"}
        return {"type": "error", "text": f"Unknown message type: {msg_type}"}

    async def broadcast(self, message: Dict[str, Any]):
        """Send *message* (JSON-encoded) to every connected client.

        ``return_exceptions=True`` keeps one dead client from aborting the
        whole fan-out.
        """
        if not self.clients:
            return
        payload = json.dumps(message)
        await asyncio.gather(
            *[client.send(payload) for client in self.clients], return_exceptions=True
        )

    async def send(self, message: Dict[str, Any], websocket: WebSocketServerProtocol):
        """Send *message* (JSON-encoded) to a single client."""
        await websocket.send(json.dumps(message))

    async def start(self):
        """Serve WebSocket connections on (host, port) until cancelled."""
        async with serve(self.handler, self.host, self.port):
            await asyncio.Future()
input_topic = os.getenv("VOICE_IN_OUTPUT", "voice_in") + response_input = os.getenv("VOICE_OUT_INPUT", "voice_out") + scene_input = os.getenv("SCENE_INPUT", "scene_update") + + node = Node() + server = IoBridgeServer(host, port) + loop = asyncio.new_event_loop() + + def push_command(text: str) -> None: + node.send_output( + input_topic, + pa.array([text]), + metadata={"encoding": "utf-8", "timestamp_ns": time.time_ns()}, + ) + + async def handle_scene_refresh(): + return [] + + def command_handler(text: str): + push_command(text) + return None + + server.command_handler = command_handler + server.scene_refresh_handler = handle_scene_refresh + + def run_server(): + asyncio.set_event_loop(loop) + loop.run_until_complete(server.start()) + + threading.Thread(target=run_server, daemon=True).start() + + for event in node: + if event["type"] != "INPUT": + continue + + if event["id"] == response_input: + raw = event["value"][0].as_py() if len(event["value"]) else "" + if not raw: + continue + try: + payload = json.loads(raw) + message = { + "type": "response", + "text": payload.get("text", ""), + "status": payload.get("status", "ok"), + } + except Exception: + message = {"type": "response", "text": raw, "status": "ok"} + asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) + continue + + if event["id"] == scene_input: + raw = event["value"][0].as_py() if len(event["value"]) else "" + if not raw: + continue + try: + payload = json.loads(raw) + objects = payload.get("objects", []) + message = {"type": "scene_updated", "objects": objects} + except Exception: + message = {"type": "scene_updated", "objects": []} + asyncio.run_coroutine_threadsafe(server.broadcast(message), loop) + + +if __name__ == "__main__": + main() diff --git a/dora_iobridge/pyproject.toml b/dora_iobridge/pyproject.toml new file mode 100644 index 0000000..d528a66 --- /dev/null +++ b/dora_iobridge/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "dora-iobridge" +version = "0.1.0" +license = { 
file = "MIT" } +authors = [{ name = "Dora" }] +description = "Dora node bridging WebSocket IO to Dora topics" + +requires-python = ">=3.8" + +dependencies = [ + "dora-rs >= 0.3.9", + "pyarrow >= 12.0.0", + "websockets >= 12.0", +] + +[project.scripts] +dora-iobridge = "dora_iobridge.main:main" diff --git a/dora_ulite6/dora_ulite6/main.py b/dora_ulite6/dora_ulite6/main.py index 6615165..8c8c99a 100644 --- a/dora_ulite6/dora_ulite6/main.py +++ b/dora_ulite6/dora_ulite6/main.py @@ -781,6 +781,12 @@ def _status_snapshot(helper: ULite6Helper) -> Dict[str, Any]: } +def _log(msg: str) -> None: + """Print a timestamped log message.""" + timestamp = time.strftime("%H:%M:%S") + print(f"[ulite6 {timestamp}] {msg}", flush=True) + + def main() -> None: node = Node() @@ -791,7 +797,42 @@ def main() -> None: api_port = int(os.getenv("API_PORT", "8080")) vacuum_enabled = os.getenv("VACUUM_ENABLED", "false").lower() in ("true", "1", "yes") + # Initial position settings + init_mode = os.getenv("INIT_MODE", "none").lower() # "home", "pose", or "none" + init_x = float(os.getenv("INIT_X", "300.0")) + init_y = float(os.getenv("INIT_Y", "0.0")) + init_z = float(os.getenv("INIT_Z", "250.0")) + init_roll = float(os.getenv("INIT_ROLL", "180.0")) + init_pitch = float(os.getenv("INIT_PITCH", "0.0")) + init_yaw = float(os.getenv("INIT_YAW", "0.0")) + init_speed = float(os.getenv("INIT_SPEED", "50.0")) + + _log(f"Connecting to robot at {robot_ip}...") helper = ULite6Helper(robot_ip) + _log("Robot connected") + + # Move to initial position on startup + if init_mode == "home": + _log("Moving to home position...") + code = helper.go_home() + if code == 0: + _log("Home position reached") + else: + _log(f"Home failed with code {code}") + elif init_mode == "pose": + _log(f"Moving to initial pose: [{init_x}, {init_y}, {init_z}] roll={init_roll} pitch={init_pitch} yaw={init_yaw}") + code = helper.move_to_pose( + init_x, init_y, init_z, + init_roll, init_pitch, init_yaw, + speed=init_speed, + 
units="mm", + ) + if code == 0: + _log("Initial pose reached") + else: + _log(f"Move to initial pose failed with code {code}") + else: + _log("Skipping initial position (INIT_MODE=none)") # Create and start FastAPI server in background thread app = create_api(helper, default_speed, default_units, vacuum_enabled) @@ -895,6 +936,28 @@ def main() -> None: code=code, status=_status_snapshot(helper), ) + elif action in ("vacuum_on", "vacuum_off"): + if not vacuum_enabled: + _send_command_status( + node, + command_id=command_id, + action=action, + ok=False, + message="Vacuum gripper not enabled", + status=_status_snapshot(helper), + ) + continue + vacuum_on = action == "vacuum_on" + code = helper.set_vacuum_gripper(vacuum_on) + _send_command_status( + node, + command_id=command_id, + action=action, + ok=code == 0, + message="Vacuum command executed", + code=code, + status=_status_snapshot(helper), + ) else: _send_command_status( node, diff --git a/dora_voice_control/README.md b/dora_voice_control/README.md new file mode 100644 index 0000000..15af54f --- /dev/null +++ b/dora_voice_control/README.md @@ -0,0 +1,211 @@ +# Dora Voice Control Node + +A Dora node that processes Spanish voice commands from children and translates them into robot actions (movement, grasping, releasing objects). Includes a web debug interface. 
+ +## Features + +- Spanish voice command parsing (rule-based or Gemini LLM) +- Real-time web debug interface +- Command queue management +- Workspace bounds validation +- Object detection integration + +## File Structure + +``` +dora_voice_control/ +├── __init__.py +├── main.py # Main Dora node entry point +├── api.py # FastAPI web server +├── config.py # Configuration management +├── models.py # Pydantic request/response models +├── parser.py # Voice command parsing logic +├── state.py # Shared state management +└── templates.py # HTML template for web interface +``` + +## Web Debug Interface + +Access the debug interface at `http://localhost:8080` (default). + +Features: +- Real-time status monitoring (pose, objects, queue) +- Send manual voice commands +- Quick command buttons +- View parse results +- Command history +- Clear queue + +## Inputs/Outputs + +| Input | Type | Description | +|---------------|--------|------------------------------------------| +| `voice_in` | string | Text transcription of voice command | +| `tcp_pose` | array | Current robot pose [x, y, z, roll, pitch, yaw] | +| `objects` | JSON | Detected objects from vision system | +| `status` | JSON | Command execution status from robot | + +| Output | Type | Description | +|---------------|--------|------------------------------------------| +| `robot_cmd` | JSON | Robot command with action and payload | +| `voice_out` | JSON | Response confirmation to user | +| `scene_update`| JSON | Updated scene with all visible objects | + +## Supported Commands (Spanish) + +| Command | Action | Example | +|---------------|----------------|--------------------------------| +| `subir` | Move up | "sube" | +| `bajar` | Move down | "baja" | +| `tomar` | Grab object | "agarra el cubo rojo" | +| `soltar` | Release object | "suelta en la caja azul" | +| `ir` | Go to object | "ve al cilindro" | +| `reiniciar` | Reset | "reinicia" | + +## Environment Variables + +```bash +# Web API Server +API_ENABLED=true # 
Enable/disable web interface +API_HOST=0.0.0.0 # Bind address +API_PORT=8080 # Listen port + +# TCP Parameters +TCP_OFFSET_MM=63.0 # Z-offset to object surface +APPROACH_OFFSET_MM=50.0 # Safe approach distance above object +STEP_MM=20.0 # Distance for up/down increments + +# LLM Configuration (optional) +LLM_PROVIDER=rules # "rules" or "gemini" +GOOGLE_API_KEY=your_key # Required if using gemini +GEMINI_MODEL=gemini-2.0-flash + +# Workspace Safety (optional) +WORKSPACE_MIN_X=-300 +WORKSPACE_MAX_X=300 +WORKSPACE_MIN_Y=-300 +WORKSPACE_MAX_Y=300 +WORKSPACE_MIN_Z=0 +WORKSPACE_MAX_Z=500 + +# Misc +DRY_RUN=false # Skip sending robot commands +``` + +## Installation + +```bash +cd dora_voice_control +pip install -e . + +# With LLM support +pip install -e ".[llm]" +``` + +## Testing + +### Web Interface + +```bash +# Start the node (standalone for testing) +python -m dora_voice_control.main + +# Open in browser +open http://localhost:8080 +``` + +### API Endpoints + +```bash +# Get status +curl http://localhost:8080/api/status + +# Get objects +curl http://localhost:8080/api/objects + +# Get queue +curl http://localhost:8080/api/queue + +# Send command +curl -X POST http://localhost:8080/api/command \ + -H "Content-Type: application/json" \ + -d '{"text": "sube"}' + +# Clear queue +curl -X POST http://localhost:8080/api/queue/clear +``` + +### Python Test + +```python +from dora_voice_control.parser import rule_parse, normalize + +# Test command parsing +text = "agarra el cubo rojo grande" +result = rule_parse(normalize(text)) +print(result) +# {'resultado': 'ok', 'accion': 'tomar', 'objeto': 'cubo', 'color': 'rojo', 'tamano': 'grande'} +``` + +## Dora Dataflow Configuration + +```yaml +nodes: + - id: voice_control + build: pip install -e ./dora_voice_control + path: dora_voice_control + inputs: + voice_in: iobridge/voice_in + tcp_pose: robot/tcp_pose + objects: detector/objects + status: robot/status + outputs: + - robot_cmd + - voice_out + - scene_update + env: + 
API_ENABLED: "true" + API_PORT: "8080" + DRY_RUN: "false" +``` + +## Message Examples + +### Input: voice_in +``` +"sube" +"agarra el cubo rojo" +"suelta en la caja azul" +``` + +### Output: robot_cmd +```json +{ + "id": "550e8400-e29b-41d4-a716-446655440000", + "action": "move_to_pose", + "payload": { + "x": 150.0, + "y": 200.0, + "z": 280.0, + "roll": 180.0, + "pitch": 0.0, + "yaw": 0.0 + } +} +``` + +### Output: voice_out +```json +{"text": "Ok, voy a subir", "status": "ok"} +{"text": "No entendi el comando", "status": "error"} +``` + +## Dependencies + +- dora-rs >= 0.3.9 +- numpy < 2.0.0 +- pyarrow >= 12.0.0 +- fastapi >= 0.109.0 +- uvicorn >= 0.27.0 +- pydantic >= 2.0.0 +- google-genai (optional, for Gemini mode) diff --git a/dora_voice_control/dora_voice_control/__init__.py b/dora_voice_control/dora_voice_control/__init__.py new file mode 100644 index 0000000..fa251a9 --- /dev/null +++ b/dora_voice_control/dora_voice_control/__init__.py @@ -0,0 +1 @@ +"""Dora voice control node package.""" diff --git a/dora_voice_control/dora_voice_control/api.py b/dora_voice_control/dora_voice_control/api.py new file mode 100644 index 0000000..c2d1dc4 --- /dev/null +++ b/dora_voice_control/dora_voice_control/api.py @@ -0,0 +1,162 @@ +"""FastAPI application for the voice control web interface.""" + +from __future__ import annotations + +import os +import sys +import threading +from typing import Any + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import HTMLResponse, Response + +# Handle both package and direct script execution +# __package__ is None when run as script, '' when imported from a script +if not __package__: + _pkg_dir = os.path.dirname(os.path.abspath(__file__)) + if _pkg_dir not in sys.path: + sys.path.insert(0, _pkg_dir) + from models import CommandRequest, CommandResponse + from state import SharedState + from templates import HTML_TEMPLATE +else: + from .models import CommandRequest, CommandResponse + from .state import 
SharedState + from .templates import HTML_TEMPLATE + + +def create_api(state: SharedState) -> FastAPI: + """Create FastAPI application with voice control endpoints.""" + app = FastAPI( + title="Voice Control Debug API", + description="Debug interface for the voice control node", + version="0.1.0", + ) + + @app.get("/", response_class=HTMLResponse) + def index() -> str: + """Serve the web interface.""" + return HTML_TEMPLATE + + @app.get("/api/status") + def get_status() -> dict: + """Get current status.""" + try: + return state.get_status() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/objects") + def get_objects() -> dict: + """Get detected and static objects.""" + try: + return state.get_objects() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/queue") + def get_queue() -> list: + """Get the command queue.""" + try: + return state.get_queue() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/queue/clear") + def clear_queue() -> dict: + """Clear the command queue.""" + try: + with state._lock: + state.voice_state.queue.clear() + return {"ok": True} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/history") + def get_history() -> list: + """Get command history.""" + try: + return state.get_history() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/errors") + def get_errors() -> list: + """Get error log.""" + try: + return state.get_errors() + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/command", response_model=CommandResponse) + def send_command(request: CommandRequest) -> CommandResponse: + """Send a voice command.""" + try: + callback = state.get_command_callback() + if callback is None: + return CommandResponse(ok=False, text="No command handler available", 
status="error") + + result = callback(request.text) + return CommandResponse( + ok=result.get("status") == "ok", + text=result.get("text", ""), + status=result.get("status", "error"), + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/image") + def get_image() -> Response: + """Get the latest camera image as JPEG.""" + try: + image_data = state.get_image() + if image_data is None: + # Return a 1x1 transparent pixel if no image + return Response( + content=b"", + media_type="image/jpeg", + status_code=204, + ) + return Response( + content=image_data, + media_type="image/jpeg", + headers={"Cache-Control": "no-cache, no-store, must-revalidate"}, + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/image/info") + def get_image_info() -> dict: + """Get image metadata.""" + try: + return { + "has_image": state.get_image() is not None, + "age_ms": state.get_image_age_ms(), + } + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + return app + + +def run_uvicorn(app: FastAPI, host: str, port: int) -> None: + """Run uvicorn server (for use in background thread).""" + config = uvicorn.Config(app, host=host, port=port, log_level="warning") + server = uvicorn.Server(config) + server.run() + + +def start_api_server(state: SharedState, config: Any) -> threading.Thread: + """Start the API server in a background thread.""" + import time as _time + app = create_api(state) + api_thread = threading.Thread( + target=run_uvicorn, + args=(app, config.host, config.port), + daemon=True, + ) + api_thread.start() + timestamp = _time.strftime("%H:%M:%S") + print(f"[voice_control {timestamp}] Web interface at http://{config.host}:{config.port}", flush=True) + return api_thread diff --git a/dora_voice_control/dora_voice_control/config.py b/dora_voice_control/dora_voice_control/config.py new file mode 100644 index 0000000..68101fb --- /dev/null +++ 
b/dora_voice_control/dora_voice_control/config.py @@ -0,0 +1,95 @@ +"""Configuration for the voice control node.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Dict, Optional, Tuple + + +@dataclass +class VoiceConfig: + """Configuration for voice control.""" + + host: str + port: int + tcp_offset_mm: float + approach_offset_mm: float + step_mm: float + default_roll: float + default_pitch: float + default_yaw: float + dry_run: bool + workspace_min: Tuple[Optional[float], Optional[float], Optional[float]] + workspace_max: Tuple[Optional[float], Optional[float], Optional[float]] + class_map: Dict[str, str] + + +@dataclass +class ApiConfig: + """Configuration for the web API server.""" + + host: str + port: int + enabled: bool + + +def _parse_float_env(name: str) -> Optional[float]: + """Parse an optional float from environment variable.""" + raw = os.getenv(name) + if raw is None or raw == "": + return None + try: + return float(raw) + except ValueError: + return None + + +def _parse_class_map(raw: str) -> Dict[str, str]: + """Parse JSON class mapping from string.""" + import json + + if not raw: + return {} + try: + data = json.loads(raw) + if isinstance(data, dict): + return {str(k): str(v) for k, v in data.items()} + except Exception: + pass + return {} + + +def load_voice_config() -> VoiceConfig: + """Load voice configuration from environment variables.""" + return VoiceConfig( + host="", + port=0, + tcp_offset_mm=float(os.getenv("TCP_OFFSET_MM", "63.0")), + approach_offset_mm=float(os.getenv("APPROACH_OFFSET_MM", "50.0")), + step_mm=float(os.getenv("STEP_MM", "20.0")), + default_roll=float(os.getenv("DEFAULT_ROLL", "180.0")), + default_pitch=float(os.getenv("DEFAULT_PITCH", "0.0")), + default_yaw=float(os.getenv("DEFAULT_YAW", "0.0")), + dry_run=os.getenv("DRY_RUN", "false").lower() in ("true", "1", "yes"), + workspace_min=( + _parse_float_env("WORKSPACE_MIN_X"), + _parse_float_env("WORKSPACE_MIN_Y"), 
+ _parse_float_env("WORKSPACE_MIN_Z"), + ), + workspace_max=( + _parse_float_env("WORKSPACE_MAX_X"), + _parse_float_env("WORKSPACE_MAX_Y"), + _parse_float_env("WORKSPACE_MAX_Z"), + ), + class_map=_parse_class_map(os.getenv("CLASS_MAP", "")), + ) + + +def load_api_config() -> ApiConfig: + """Load API server configuration from environment variables.""" + return ApiConfig( + host=os.getenv("API_HOST", "0.0.0.0"), + port=int(os.getenv("API_PORT", "8080")), + enabled=os.getenv("API_ENABLED", "true").lower() in ("true", "1", "yes"), + ) diff --git a/dora_voice_control/dora_voice_control/main.py b/dora_voice_control/dora_voice_control/main.py new file mode 100644 index 0000000..277ba86 --- /dev/null +++ b/dora_voice_control/dora_voice_control/main.py @@ -0,0 +1,501 @@ +"""Dora node for voice control with safe robot commands.""" + +from __future__ import annotations + +import json +import os +import sys +import time +import uuid +from collections import deque +from typing import Any, Deque, Dict, List, Optional, Tuple + +import cv2 +import numpy as np +import pyarrow as pa +from dora import Node + +try: + import tomllib +except ModuleNotFoundError: + import tomli as tomllib + +# Handle both package and direct script execution +# __package__ is None when run as script, '' when imported from a script +_RUNNING_AS_SCRIPT = not __package__ + +if _RUNNING_AS_SCRIPT: + # Running as script - use absolute imports + _pkg_dir = os.path.dirname(os.path.abspath(__file__)) + if _pkg_dir not in sys.path: + sys.path.insert(0, _pkg_dir) + from config import VoiceConfig, load_api_config, load_voice_config + from parser import normalize, parse_command + from state import RobotStep, SharedState + from api import start_api_server +else: + # Running as package - use relative imports + from .config import VoiceConfig, load_api_config, load_voice_config + from .parser import normalize, parse_command + from .state import RobotStep, SharedState + from .api import start_api_server + + +def 
_within_bounds( + point_mm: np.ndarray, + min_xyz: Tuple[Optional[float], Optional[float], Optional[float]], + max_xyz: Tuple[Optional[float], Optional[float], Optional[float]], +) -> bool: + """Check if point is within workspace bounds.""" + x, y, z = point_mm.tolist() + min_x, min_y, min_z = min_xyz + max_x, max_y, max_z = max_xyz + if min_x is not None and x < min_x: + return False + if max_x is not None and x > max_x: + return False + if min_y is not None and y < min_y: + return False + if max_y is not None and y > max_y: + return False + if min_z is not None and z < min_z: + return False + if max_z is not None and z > max_z: + return False + return True + + +def _translate_target(token: str, mapping: Dict[str, str]) -> str: + """Translate object name using class map.""" + if token in mapping: + return mapping[token] + return token + + +def _load_config_file(path: str) -> Dict[str, Any]: + """Load TOML configuration file.""" + if not path or not os.path.exists(path): + return {} + try: + with open(path, "rb") as handle: + return tomllib.load(handle) + except Exception: + return {} + + +def _load_bucket_objects(config_path: str) -> List[Dict[str, Any]]: + """Load bucket positions from config file.""" + cfg = _load_config_file(config_path) + buckets = cfg.get("bucket_positions", {}) + obj_cfg = cfg.get("object_parameters", {}) + base_z = float(obj_cfg.get("normal_height", 220.0)) + out = [] + for key, color in [ + ("blue_bucket_pos", "blue"), + ("red_bucket_pos", "red"), + ("yellow_bucket_pos", "yellow"), + ("white_bucket_pos", "white"), + ]: + pos = buckets.get(key) + if not isinstance(pos, list) or len(pos) < 2: + continue + out.append( + { + "object_type": "box", + "color": color, + "size": "big", + "position_mm": [float(pos[0]), float(pos[1]), base_z], + "source": "config", + } + ) + return out + + +def _send_dora_command( + node: Node, output_name: str, action: str, payload: Dict[str, Any] +) -> str: + """Send a robot command via Dora.""" + command_id = 
str(uuid.uuid4()) + message = {"id": command_id, "action": action, "payload": payload} + node.send_output( + output_name, + pa.array([json.dumps(message)]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + return command_id + + +def _parse_status_payload(value: pa.Array) -> Optional[Dict[str, Any]]: + """Parse status payload from robot.""" + if len(value) == 0: + return None + raw = value[0].as_py() + if not raw: + return None + try: + return json.loads(raw) + except Exception: + return None + + +def _log(msg: str) -> None: + """Print a timestamped log message.""" + timestamp = time.strftime("%H:%M:%S") + print(f"[voice_control {timestamp}] {msg}", flush=True) + + +def main() -> None: + """Main entry point for the voice control node.""" + _log("Starting voice control node...") + + # Load configuration + cfg = load_voice_config() + api_cfg = load_api_config() + + # Environment variables for I/O topics + objects_input = os.getenv("OBJECTS_INPUT", "objects") + voice_in_input = os.getenv("VOICE_IN_INPUT", "voice_in") + voice_out_output = os.getenv("VOICE_OUT_OUTPUT", "voice_out") + scene_output = os.getenv("SCENE_OUTPUT", "scene_update") + pose_input = os.getenv("POSE_INPUT", "tcp_pose") + status_input = os.getenv("STATUS_INPUT", "status") + command_output = os.getenv("COMMAND_OUTPUT", "robot_cmd") + image_input = os.getenv("IMAGE_INPUT", "image_annotated") + llm_provider = os.getenv("LLM_PROVIDER", "rules").lower() + config_file = os.getenv("CONFIG_FILE", "config.toml") + + # Image dimensions (will be detected from first frame) + image_width = int(os.getenv("IMAGE_WIDTH", "1280")) + image_height = int(os.getenv("IMAGE_HEIGHT", "720")) + + # Initial/home position for reset command + init_x = float(os.getenv("INIT_X", "300.0")) + init_y = float(os.getenv("INIT_Y", "0.0")) + init_z = float(os.getenv("INIT_Z", "250.0")) + init_roll = float(os.getenv("INIT_ROLL", "180.0")) + init_pitch = float(os.getenv("INIT_PITCH", "0.0")) + init_yaw = 
float(os.getenv("INIT_YAW", "0.0")) + + _log(f"Config: tcp_offset={cfg.tcp_offset_mm}mm, approach_offset={cfg.approach_offset_mm}mm, step={cfg.step_mm}mm") + _log(f"Initial position: [{init_x}, {init_y}, {init_z}]") + _log(f"LLM provider: {llm_provider}") + _log(f"Dry run: {cfg.dry_run}") + + # Initialize shared state + shared_state = SharedState() + state = shared_state.voice_state + state.static_objects = _load_bucket_objects(config_file) + pending_intents: Deque[Dict[str, Any]] = deque() + + _log(f"Loaded {len(state.static_objects)} static objects from config") + + # Queue initial position movement on startup (same as reiniciar) + init_on_start = os.getenv("INIT_ON_START", "true").lower() in ("true", "1", "yes") + send_init_scene_reset = init_on_start # Flag to send scene reset after node starts + if init_on_start: + _log(f"Startup: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]") + # Clear detected objects + state.latest_objects = [] + state.latest_objects_at = None + # Queue vacuum off and move to home + state.queue.append(RobotStep(action="vacuum_off", payload={})) + state.queue.append( + RobotStep( + action="move_to_pose", + payload={ + "x": init_x, + "y": init_y, + "z": init_z, + "roll": init_roll, + "pitch": init_pitch, + "yaw": init_yaw, + }, + ) + ) + + def command_handler(transcript: str) -> Dict[str, str]: + """Handle voice command and return response.""" + _log(f"Voice input received: \"{transcript}\"") + llm_result = parse_command(transcript, llm_provider) + _log(f"Parse result: {llm_result}") + + # Update debug state + shared_state.update_voice_input(transcript, llm_result, time.monotonic()) + + if llm_result.get("resultado") != "ok": + _log("Command not understood") + return {"text": "No entendi el comando", "status": "error"} + + action = llm_result.get("accion", "error") + obj = llm_result.get("objeto", "no especificado") + color = llm_result.get("color", "no especificado") + size = llm_result.get("tamano", "no especificado") + 
+ _log(f"Intent: action={action}, object={obj}, color={color}, size={size}") + + pending_intents.append( + {"action": action, "obj": obj, "color": color, "size": size} + ) + + # Add to history + shared_state.add_to_history({ + "timestamp": time.time(), + "input": transcript, + "action": action, + "object": obj, + "color": color, + "size": size, + }) + + return {"text": f"Ok, voy a {action}", "status": "ok"} + + # Set command callback for web interface + shared_state.set_command_callback(command_handler) + + # Start web API server if enabled + if api_cfg.enabled: + start_api_server(shared_state, api_cfg) + + # Create Dora node + node = Node() + _log("Dora node created, waiting for events...") + + first_event = True + for event in node: + # Send scene reset on first event (startup) + if first_event and send_init_scene_reset: + first_event = False + scene_payload = json.dumps( + {"objects": list(state.static_objects), "reset": True} + ) + node.send_output( + scene_output, + pa.array([scene_payload]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + _log("Sent initial scene reset notification") + if event["type"] != "INPUT": + continue + + # Handle voice input + if event["id"] == voice_in_input: + raw = event["value"][0].as_py() if len(event["value"]) else "" + if not raw: + continue + response = command_handler(raw) + node.send_output( + voice_out_output, + pa.array([json.dumps(response)]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + continue + + # Handle pose updates + if event["id"] == pose_input: + tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1) + if tcp_pose.size >= 6: + state.latest_pose = tcp_pose[:6].tolist() + state.latest_pose_at = time.monotonic() + continue + + # Handle object detection updates + if event["id"] == objects_input: + raw = event["value"][0].as_py() if len(event["value"]) else "" + if raw: + try: + payload = json.loads(raw) + objects = payload.get("objects", []) + except 
Exception: + objects = [] + state.latest_objects = objects + state.latest_objects_at = time.monotonic() + continue + + # Handle camera image + if event["id"] == image_input: + try: + # Get raw image data + img_data = event["value"].to_numpy() + # Reshape to image (assuming BGR format) + img = img_data.reshape((image_height, image_width, 3)).astype(np.uint8) + # Encode to JPEG + _, jpeg_data = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80]) + shared_state.update_image(jpeg_data.tobytes(), time.monotonic()) + except Exception as e: + # Log error but don't crash + pass + continue + + # Handle robot status updates + if event["id"] == status_input: + payload = _parse_status_payload(event["value"]) + if payload and state.pending_command: + if payload.get("command_id") == state.pending_command.get("id"): + _log(f"Command completed: {state.pending_command.get('action')} (status={payload.get('status', 'ok')})") + state.pending_command = None + continue + + # Process pending intents + if pending_intents: + intent = pending_intents.popleft() + action = intent["action"] + obj = intent["obj"] + color = intent["color"] + size = intent["size"] + + _log(f"Processing intent: {action} {obj} {color} {size}") + + latest_pose = state.latest_pose + objects = list(state.latest_objects) + list(state.static_objects) + _log(f"Available objects: {len(state.latest_objects)} detected + {len(state.static_objects)} static") + + if action in ("subir", "bajar") and latest_pose: + delta = cfg.step_mm if action == "subir" else -cfg.step_mm + target = np.array(latest_pose[:3], dtype=np.float64) + target[2] += delta + if _within_bounds(target, cfg.workspace_min, cfg.workspace_max): + step = RobotStep( + action="move_to_pose", + payload={ + "x": float(target[0]), + "y": float(target[1]), + "z": float(target[2]), + "roll": cfg.default_roll, + "pitch": cfg.default_pitch, + "yaw": cfg.default_yaw, + }, + ) + state.queue.append(step) + _log(f"Queued: move Z to {target[2]:.1f}mm 
(delta={delta:+.1f})") + else: + _log(f"Target {target.tolist()} out of bounds, skipping") + + elif action in ("ir", "tomar", "soltar"): + target_obj = None + if obj != "no especificado": + target_name = _translate_target(obj, cfg.class_map) + target_color = _translate_target(color, cfg.class_map) + _log(f"Looking for: type={target_name}, color={target_color}") + # Log available objects for debugging + for o in objects: + _log(f" -> Available: {o.get('object_type')} {o.get('color')} {o.get('size')} at {o.get('position_mm')}") + for o in objects: + if o.get("object_type") == target_name: + if color == "no especificado" or o.get("color") == target_color: + if size == "no especificado" or o.get("size") == _translate_target(size, cfg.class_map): + target_obj = o + break + if target_obj: + _log(f"Found target: {target_obj.get('object_type')} {target_obj.get('color')} at {target_obj.get('position_mm')}") + pos = np.array(target_obj["position_mm"], dtype=np.float64) + approach = pos.copy() + approach[2] += cfg.tcp_offset_mm + cfg.approach_offset_mm + target = pos.copy() + target[2] += cfg.tcp_offset_mm + if _within_bounds(approach, cfg.workspace_min, cfg.workspace_max): + state.queue.append( + RobotStep( + action="move_to_pose", + payload={ + "x": float(approach[0]), + "y": float(approach[1]), + "z": float(approach[2]), + "roll": cfg.default_roll, + "pitch": cfg.default_pitch, + "yaw": cfg.default_yaw, + }, + ) + ) + _log(f"Queued: approach pose at Z={approach[2]:.1f}mm") + if _within_bounds(target, cfg.workspace_min, cfg.workspace_max): + state.queue.append( + RobotStep( + action="move_to_pose", + payload={ + "x": float(target[0]), + "y": float(target[1]), + "z": float(target[2]), + "roll": cfg.default_roll, + "pitch": cfg.default_pitch, + "yaw": cfg.default_yaw, + }, + ) + ) + _log(f"Queued: target pose at Z={target[2]:.1f}mm") + if action == "tomar": + state.queue.append(RobotStep(action="vacuum_on", payload={})) + _log("Queued: vacuum_on") + elif action == "soltar": + 
state.queue.append(RobotStep(action="vacuum_off", payload={})) + _log("Queued: vacuum_off") + else: + _log(f"Target object not found: {obj} {color}") + continue + + elif action == "reiniciar": + _log(f"Reiniciar: resetting scene and moving to home [{init_x}, {init_y}, {init_z}]") + # Turn off vacuum first + state.queue.append(RobotStep(action="vacuum_off", payload={})) + # Clear current detected objects (will be refreshed by detector) + state.latest_objects = [] + state.latest_objects_at = None + _log("Cleared detected objects - waiting for fresh detection") + # Move to initial position + state.queue.append( + RobotStep( + action="move_to_pose", + payload={ + "x": init_x, + "y": init_y, + "z": init_z, + "roll": init_roll, + "pitch": init_pitch, + "yaw": init_yaw, + }, + ) + ) + _log(f"Queued: vacuum_off + move to home") + # Send scene update to notify clients that scene was reset + scene_payload = json.dumps( + {"objects": list(state.static_objects), "reset": True} + ) + node.send_output( + scene_output, + pa.array([scene_payload]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + _log("Sent scene reset notification") + + _log(f"Queue size: {len(state.queue)}") + + # Emit scene updates when objects change + if event["id"] == objects_input: + scene_payload = json.dumps( + {"objects": list(state.latest_objects) + list(state.static_objects)} + ) + node.send_output( + scene_output, + pa.array([scene_payload]), + metadata={"encoding": "json", "timestamp_ns": time.time_ns()}, + ) + + # Send queued robot steps one at a time + if state.pending_command is None and state.queue: + step = state.queue.popleft() + if cfg.dry_run: + _log(f"[DRY RUN] Would send: {step.action} {step.payload}") + state.pending_command = None + continue + cmd_id = _send_dora_command(node, command_output, step.action, step.payload) + state.pending_command = {"id": cmd_id, "action": step.action} + _log(f"Sent command: {step.action} (id={cmd_id[:8]}...) 
remaining={len(state.queue)}") + + # Update debug state + shared_state.update_robot_command( + {"id": cmd_id, "action": step.action, "payload": step.payload}, + time.monotonic(), + ) + + +if __name__ == "__main__": + main() diff --git a/dora_voice_control/dora_voice_control/models.py b/dora_voice_control/dora_voice_control/models.py new file mode 100644 index 0000000..2a16bce --- /dev/null +++ b/dora_voice_control/dora_voice_control/models.py @@ -0,0 +1,38 @@ +"""Pydantic models for the voice control API.""" + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel + + +class CommandRequest(BaseModel): + """Request to send a voice command.""" + + text: str + + +class CommandResponse(BaseModel): + """Response from a voice command.""" + + ok: bool + text: str + status: str + + +class MoveRequest(BaseModel): + """Request to move to a position.""" + + x: float + y: float + z: float + roll: Optional[float] = 180.0 + pitch: Optional[float] = 0.0 + yaw: Optional[float] = 0.0 + + +class VacuumRequest(BaseModel): + """Request to control the vacuum.""" + + on: bool diff --git a/dora_voice_control/dora_voice_control/parser.py b/dora_voice_control/dora_voice_control/parser.py new file mode 100644 index 0000000..231a710 --- /dev/null +++ b/dora_voice_control/dora_voice_control/parser.py @@ -0,0 +1,118 @@ +"""Voice command parsing logic.""" + +from __future__ import annotations + +import json +import os +import unicodedata +from typing import Dict + + +def normalize(text: str) -> str: + """Normalize text: lowercase, remove accents.""" + text = text.lower().strip() + text = unicodedata.normalize("NFKD", text) + text = "".join([c for c in text if not unicodedata.combining(c)]) + return text + + +def rule_parse(transcript: str) -> Dict[str, str]: + """Parse voice command using rule-based approach.""" + text = normalize(transcript) + + action = "error" + if any(w in text for w in ["reiniciar", "reinicia", "reset"]): + action = "reiniciar" 
+ elif any(w in text for w in ["sube", "subir", "arriba"]): + action = "subir" + elif any(w in text for w in ["baja", "bajar", "abajo"]): + action = "bajar" + elif any(w in text for w in ["soltar", "deja", "dejar"]): + action = "soltar" + elif any(w in text for w in ["tomar", "toma", "agarra", "agarrar", "coger", "chupar", "succionar"]): + action = "tomar" + elif any(w in text for w in ["ir", "ve", "mover", "muevete", "acercar"]): + action = "ir" + + color = "no especificado" + if "rojo" in text: + color = "rojo" + elif "azul" in text: + color = "azul" + elif "amarillo" in text: + color = "amarillo" + elif "blanco" in text: + color = "blanco" + + obj = "no especificado" + if "estrella" in text: + obj = "estrella" + elif "cilindro" in text: + obj = "cilindro" + elif "cubo" in text: + obj = "cubo" + elif "caja" in text: + obj = "caja" + + size = "no especificado" + if "grande" in text: + size = "grande" + elif "pequeno" in text or "pequeño" in text or "chico" in text: + size = "pequeno" + + if action == "error": + return {"resultado": "error"} + return { + "resultado": "ok", + "accion": action, + "objeto": obj, + "color": color, + "tamano": size, + } + + +def build_gemini_prompt(transcript: str) -> str: + """Build prompt for Gemini LLM parsing.""" + return f"""Interpreta el siguiente comando de voz de un niño, convertido a texto, para controlar + un robot (manito). Asegúrate de responder con 'accion', 'objeto', 'color' y 'tamano'. Si el color + o el tamaño no están especificados, responde con 'no especificado'. Si no entiendes la frase, + responde con 'resultado: error'. En caso contrario, responde con 'resultado: ok'. Las acciones + posibles son 'bajar', 'subir', 'soltar', 'tomar', 'ir', 'reiniciar'. Los colores posibles son 'rojo', + 'blanco','azul' y 'amarillo'. Los tamaños posibles son 'grande', 'pequeno'. Los posible objetos son estrella, + cilindro, cubo y caja; cualquier otro objeto es error. 
+ Comando: "{transcript}" + Nota: Los comandos pueden incluir variaciones en la expresión y errores comunes en el lenguaje de + los niños. Normaliza la respuesta a las categorías establecidas. La salida es un json con los campos + 'resultado', 'accion', 'objeto', 'color' y 'tamano'. Adicionalmente los ninos pueden decir tomar,chupar, succionar o similar para tomar un objeto. + """ + + +def parse_command(transcript: str, llm_provider: str = "rules") -> Dict[str, str]: + """Parse voice command using specified provider.""" + if llm_provider == "gemini": + try: + from google import genai + from google.genai import types + except Exception: + return rule_parse(transcript) + + api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + return rule_parse(transcript) + + try: + client = genai.Client(api_key=api_key) + prompt = build_gemini_prompt(transcript) + reply = client.models.generate_content( + model=os.getenv("GEMINI_MODEL", "gemini-2.0-flash"), + contents=prompt, + config=types.GenerateContentConfig(temperature=0.5), + ) + raw = str(reply.text).replace("```json", "").replace("```", "") + return json.loads(raw) + except json.JSONDecodeError: + return {"resultado": "error"} + except Exception: + return rule_parse(transcript) + else: + return rule_parse(transcript) diff --git a/dora_voice_control/dora_voice_control/state.py b/dora_voice_control/dora_voice_control/state.py new file mode 100644 index 0000000..a68d1d6 --- /dev/null +++ b/dora_voice_control/dora_voice_control/state.py @@ -0,0 +1,158 @@ +"""Shared state management for voice control node.""" + +from __future__ import annotations + +import threading +from collections import deque +from dataclasses import dataclass, field +from typing import Any, Deque, Dict, List, Optional + + +@dataclass +class RobotStep: + """A single step in the robot command queue.""" + + action: str + payload: Dict[str, Any] + + +@dataclass +class VoiceState: + """Runtime state for voice control.""" + + latest_pose: Optional[List[float]] 
@dataclass
class VoiceState:
    """Runtime state for voice control."""

    latest_pose: Optional[List[float]] = None
    latest_pose_at: Optional[float] = None
    latest_objects: List[Dict[str, Any]] = field(default_factory=list)
    latest_objects_at: Optional[float] = None
    static_objects: List[Dict[str, Any]] = field(default_factory=list)
    pending_command: Optional[Dict[str, Any]] = None
    queue: Deque[RobotStep] = field(default_factory=deque)


@dataclass
class DebugState:
    """Debug information exposed by the web interface."""

    last_voice_input: Optional[str] = None
    last_voice_input_at: Optional[float] = None
    last_parse_result: Optional[Dict[str, Any]] = None
    last_robot_command: Optional[Dict[str, Any]] = None
    last_robot_command_at: Optional[float] = None
    command_history: List[Dict[str, Any]] = field(default_factory=list)
    error_log: List[Dict[str, Any]] = field(default_factory=list)
    latest_image: Optional[bytes] = None
    latest_image_at: Optional[float] = None


class SharedState:
    """Thread-safe container around :class:`VoiceState` and :class:`DebugState`.

    Every public method acquires the internal lock, so one instance can be
    shared between the dora event loop and the web-interface threads.
    """

    # Retain at most this many entries in the history / error logs.
    _LOG_CAP = 100
    # Only the most recent slice of each log is served to the UI.
    _VIEW_CAP = 50

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self.voice_state = VoiceState()
        self.debug_state = DebugState()
        self._command_callback: Optional[Any] = None

    def set_command_callback(self, callback: Any) -> None:
        """Register the callable the web interface uses to emit commands."""
        with self._lock:
            self._command_callback = callback

    def get_command_callback(self) -> Optional[Any]:
        """Return the registered command callable, or None."""
        with self._lock:
            return self._command_callback

    def get_status(self) -> Dict[str, Any]:
        """Return a snapshot of the runtime state for the web interface."""
        with self._lock:
            voice = self.voice_state
            debug = self.debug_state
            snapshot: Dict[str, Any] = {
                "has_pose": voice.latest_pose is not None,
                "pose": voice.latest_pose,
                "pose_age_ms": _age_ms(voice.latest_pose_at),
                "object_count": len(voice.latest_objects),
                "static_object_count": len(voice.static_objects),
                "queue_size": len(voice.queue),
                "has_pending_command": voice.pending_command is not None,
                "pending_command": voice.pending_command,
                "last_voice_input": debug.last_voice_input,
                "last_voice_input_age_ms": _age_ms(debug.last_voice_input_at),
                "last_parse_result": debug.last_parse_result,
            }
            return snapshot

    def get_objects(self) -> Dict[str, Any]:
        """Return shallow copies of the detected and static object lists."""
        with self._lock:
            voice = self.voice_state
            return {
                "detected": list(voice.latest_objects),
                "static": list(voice.static_objects),
            }

    def get_queue(self) -> List[Dict[str, Any]]:
        """Return the pending robot steps as plain dicts."""
        with self._lock:
            return [
                {"action": step.action, "payload": step.payload}
                for step in self.voice_state.queue
            ]

    def get_history(self) -> List[Dict[str, Any]]:
        """Return the most recent command-history entries (newest last)."""
        with self._lock:
            # Slicing already copies, so callers never alias internal state.
            return self.debug_state.command_history[-self._VIEW_CAP:]

    def get_errors(self) -> List[Dict[str, Any]]:
        """Return the most recent error-log entries (newest last)."""
        with self._lock:
            return self.debug_state.error_log[-self._VIEW_CAP:]

    def add_to_history(self, entry: Dict[str, Any]) -> None:
        """Append *entry* to the command history, trimming to the cap."""
        with self._lock:
            log = self.debug_state.command_history
            log.append(entry)
            if len(log) > self._LOG_CAP:
                del log[:-self._LOG_CAP]

    def add_error(self, error: Dict[str, Any]) -> None:
        """Append *error* to the error log, trimming to the cap."""
        with self._lock:
            log = self.debug_state.error_log
            log.append(error)
            if len(log) > self._LOG_CAP:
                del log[:-self._LOG_CAP]

    def update_voice_input(self, text: str, parse_result: Dict[str, Any], timestamp: float) -> None:
        """Record the latest voice input and its parse result."""
        with self._lock:
            debug = self.debug_state
            debug.last_voice_input = text
            debug.last_voice_input_at = timestamp
            debug.last_parse_result = parse_result

    def update_robot_command(self, command: Dict[str, Any], timestamp: float) -> None:
        """Record the most recently issued robot command."""
        with self._lock:
            debug = self.debug_state
            debug.last_robot_command = command
            debug.last_robot_command_at = timestamp

    def update_image(self, image_bytes: bytes, timestamp: float) -> None:
        """Store the latest camera frame (already encoded) and its timestamp."""
        with self._lock:
            debug = self.debug_state
            debug.latest_image = image_bytes
            debug.latest_image_at = timestamp

    def get_image(self) -> Optional[bytes]:
        """Return the latest stored camera frame, or None."""
        with self._lock:
            return self.debug_state.latest_image

    def get_image_age_ms(self) -> Optional[int]:
        """Return the age of the latest frame in milliseconds, or None."""
        with self._lock:
            return _age_ms(self.debug_state.latest_image_at)


def _age_ms(timestamp: Optional[float]) -> Optional[int]:
    """Milliseconds elapsed since *timestamp* (a time.monotonic() value), or None."""
    import time  # local import mirrors the module's lazy-import style

    return None if timestamp is None else int((time.monotonic() - timestamp) * 1000)
+

Voice Control Debug Interface

+
+ + Connecting... +
+
+ +
+ +
+

Camera View

+
+
No camera image available
+ +
+
+
+ + +
+

Send Command

+
+ + +
+
+ + + + + + +
+
+ + +
+

System Status

+
+
+
Pose Available
+
--
+
+
+
Pose Age
+
--
+
+
+
Objects Detected
+
--
+
+
+
Static Objects
+
--
+
+
+
Queue Size
+
--
+
+
+
Pending Command
+
--
+
+
+
+ + +
+

TCP Pose

+
+
+
X (mm)
+
--
+
+
+
Y (mm)
+
--
+
+
+
Z (mm)
+
--
+
+
+
Roll
+
--
+
+
+
Pitch
+
--
+
+
+
Yaw
+
--
+
+
+
+ + +
+

Last Parse Result

+
+
No command parsed yet
+
+
+ Last input: -- +
+
+ + +
+

Detected Objects

+
+
No objects detected
+
+
+ + +
+

Command Queue

+
+
Queue is empty
+
+
+ +
+
+ + +
+

Activity Log

+
+
+
+ + + + +""" diff --git a/dora_voice_control/pyproject.toml b/dora_voice_control/pyproject.toml new file mode 100644 index 0000000..92e6b1c --- /dev/null +++ b/dora_voice_control/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "dora-voice-control" +version = "0.1.0" +license = { file = "MIT" } +authors = [{ name = "Dora" }] +description = "Dora node for voice command control via WebSocket" + +requires-python = ">=3.8" + +dependencies = [ + "dora-rs >= 0.3.9", + "numpy < 2.0.0", + "pyarrow >= 12.0.0", + "websockets >= 12.0", + "fastapi >= 0.109.0", + "uvicorn >= 0.27.0", + "pydantic >= 2.0.0", + "opencv-python >= 4.8.0", +] + +[project.optional-dependencies] +llm = ["google-genai"] + +[project.scripts] +dora-voice-control = "dora_voice_control.main:main" diff --git a/trained_models/yolo8n.pt b/trained_models/yolo8n.pt new file mode 100644 index 0000000..1da2e5a Binary files /dev/null and b/trained_models/yolo8n.pt differ