Refactor voice control core and robot behavior

This commit is contained in:
cristhian aguilera
2026-02-02 12:29:59 -03:00
parent b9798a2f46
commit 695d309816
36 changed files with 3436 additions and 1065 deletions

View File

@@ -0,0 +1 @@
"""Dora detector node package."""

View File

@@ -0,0 +1,524 @@
"""Dora node for YOLO detection and base-frame object localization."""
from __future__ import annotations
import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
from ultralytics import YOLO
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib
# Default YOLO weights path, resolved relative to the current working directory.
# NOTE(review): ultralytics' standard nano weight file is named "yolov8n.pt" —
# confirm "yolo8n.pt" matches the actual file shipped in trained_models/.
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
# Detection visualization constants
POINT_SEARCH_RADIUS = 3  # pixel radius searched around a bbox center for a valid depth point
DEFAULT_FONT_SCALE = 0.5  # font scale for detection label text
DETECTION_BOX_THICKNESS = 2  # line thickness for detection bounding boxes
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[detector {timestamp}] {msg}", flush=True)
@dataclass
class DetectionConfig:
    """Runtime configuration for detection, ROI filtering, depth sampling, and colors."""

    imgsz: int  # YOLO inference image size
    conf: float  # YOLO confidence threshold
    iou: float  # YOLO IoU threshold for NMS
    size_threshold: int  # bbox pixel area separating "small" from "big" objects
    roi_top_left: Tuple[int, int]  # ROI top-left corner (x, y), pixels
    roi_bottom_right: Tuple[int, int]  # ROI bottom-right corner (x, y), pixels
    use_roi: bool  # if True, detections outside the ROI are discarded
    detect_every_n: int  # run detection only on every N-th frame
    min_depth_mm: float  # minimum accepted depth for a point-cloud sample
    max_depth_mm: float  # maximum accepted depth for a point-cloud sample
    color_blue: Tuple[int, int, int]  # reference value for "blue" (defaults suggest BGR order)
    color_red: Tuple[int, int, int]  # reference value for "red"
    color_yellow: Tuple[int, int, int]  # reference value for "yellow"
    color_white: Tuple[int, int, int]  # reference value for "white"
def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return int(parts[0]), int(parts[1])
except Exception as e:
_log(f"Warning: failed to parse int pair '{raw}': {e}")
return default
def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return float(parts[0]), float(parts[1])
except Exception as e:
_log(f"Warning: failed to parse float pair '{raw}': {e}")
return default
def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 3:
return int(parts[0]), int(parts[1]), int(parts[2])
except Exception as e:
_log(f"Warning: failed to parse color '{raw}': {e}")
return default
def _rotation_matrix_xyz(
roll_deg: float, pitch_deg: float, yaw_deg: float
) -> np.ndarray:
roll = np.deg2rad(roll_deg)
pitch = np.deg2rad(pitch_deg)
yaw = np.deg2rad(yaw_deg)
cx, sx = np.cos(roll), np.sin(roll)
cy, sy = np.cos(pitch), np.sin(pitch)
cz, sz = np.cos(yaw), np.sin(yaw)
rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]])
rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]])
rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]])
return rot_z @ rot_y @ rot_x
def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray:
    """Build a 4x4 homogeneous transform from ``[x, y, z, roll, pitch, yaw]``.

    Translation is given in millimetres and stored in metres; angles in degrees.
    """
    x_mm, y_mm, z_mm, roll, pitch, yaw = tcp_pose_mm_deg
    transform = np.eye(4)
    transform[:3, :3] = _rotation_matrix_xyz(roll, pitch, yaw)
    transform[:3, 3] = np.array([x_mm, y_mm, z_mm], dtype=np.float64) / 1000.0
    return transform
def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
encoding = str(metadata.get("encoding", "bgr8")).lower()
width = metadata.get("width")
height = metadata.get("height")
if (width is None or height is None) and "shape" in metadata:
shape = metadata.get("shape")
if isinstance(shape, (list, tuple)) and len(shape) >= 2:
height = height if height is not None else int(shape[0])
width = width if width is not None else int(shape[1])
if width is None or height is None:
raise KeyError("width/height (or shape) missing from metadata")
if encoding == "bgr8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return frame.copy()
if encoding == "rgb8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}:
frame = storage.to_numpy().astype(np.uint8)
return cv2.imdecode(frame, cv2.IMREAD_COLOR)
if encoding == "yuv420":
yuv = storage.to_numpy().astype(np.uint8)
yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width))
return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR)
raise RuntimeError(f"Unsupported image encoding: {encoding}")
def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
dtype_str = str(metadata.get("dtype", "float32"))
if dtype_str != "float32":
raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}")
shape = metadata.get("shape")
if not isinstance(shape, (list, tuple)) or len(shape) < 3:
raise KeyError("point cloud shape missing from metadata")
height, width, channels = [int(v) for v in shape[:3]]
if channels < 3:
raise ValueError("point cloud requires at least 3 channels")
raw = storage.to_numpy().astype(np.uint8).tobytes()
values = np.frombuffer(raw, dtype=np.float32)
return values.reshape((height, width, channels))
def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool:
if not np.all(np.isfinite(point_xyz)):
return False
z = float(point_xyz[2])
if z <= 0:
return False
if z < cfg.min_depth_mm or z > cfg.max_depth_mm:
return False
return True
def _sample_point(
    point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig
) -> Optional[np.ndarray]:
    """Sample a 3D point at pixel (x, y), falling back to a neighborhood median.

    Returns the point at (x, y) when it passes the validity check; otherwise the
    per-axis median of all valid points within POINT_SEARCH_RADIUS pixels, or
    None when the pixel is out of bounds or no valid neighbor exists.
    """
    height, width, _ = point_cloud.shape
    if not (0 <= x < width and 0 <= y < height):
        return None
    center = point_cloud[y, x, :3].astype(np.float64)
    if _valid_point(center, cfg):
        return center
    neighborhood = [
        point_cloud[ny, nx, :3].astype(np.float64)
        for ny in range(y - POINT_SEARCH_RADIUS, y + POINT_SEARCH_RADIUS + 1)
        for nx in range(x - POINT_SEARCH_RADIUS, x + POINT_SEARCH_RADIUS + 1)
        if 0 <= nx < width and 0 <= ny < height
    ]
    valid = [p for p in neighborhood if _valid_point(p, cfg)]
    if not valid:
        return None
    return np.median(np.stack(valid, axis=0), axis=0)
def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]:
x1, y1, x2, y2 = bbox
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(image.shape[1], x2)
y2 = min(image.shape[0], y2)
roi = image[y1:y2, x1:x2]
if roi.size == 0:
return (0, 0, 0)
color = np.median(roi, axis=(0, 1)).astype(int)
return int(color[0]), int(color[1]), int(color[2])
def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str:
colors = {
"blue": np.array(cfg.color_blue, dtype=np.float64),
"red": np.array(cfg.color_red, dtype=np.float64),
"yellow": np.array(cfg.color_yellow, dtype=np.float64),
"white": np.array(cfg.color_white, dtype=np.float64),
}
color_vec = np.array(color, dtype=np.float64)
best_name = "unknown"
best_dist = float("inf")
for name, value in colors.items():
dist = np.linalg.norm(color_vec - value)
if dist < best_dist:
best_name = name
best_dist = dist
return best_name
def _load_calibration(calibration_file: str) -> np.ndarray:
calib = np.load(calibration_file, allow_pickle=True)
t_cam2gripper = calib["T_cam2gripper"]
return t_cam2gripper
def _load_config_file(path: str) -> Dict[str, Any]:
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception as e:
_log(f"Warning: failed to load config file '{path}': {e}")
return {}
def _build_config(config_path: str) -> DetectionConfig:
    """Assemble a DetectionConfig from the TOML file, overridable via env vars.

    Precedence: environment variable > config-file value > hard-coded default.
    """
    file_cfg = _load_config_file(config_path)
    roi_section = file_cfg.get("roi", {})
    color_section = file_cfg.get("dominant_colors", {})
    object_section = file_cfg.get("object_parameters", {})

    def from_env(name: str, values) -> str:
        # Env var wins; otherwise serialize the config-file value as "a,b[,c]".
        return os.getenv(name, ",".join(str(v) for v in values))

    return DetectionConfig(
        imgsz=int(os.getenv("YOLO_IMGSZ", "640")),
        conf=float(os.getenv("YOLO_CONF", "0.25")),
        iou=float(os.getenv("YOLO_IOU", "0.45")),
        size_threshold=int(
            os.getenv("SIZE_THRESHOLD", str(object_section.get("size_threshold", 4200)))
        ),
        roi_top_left=_parse_int_pair(
            from_env("ROI_TOP_LEFT", roi_section.get("roi_top_left", [500, 230])),
            (500, 230),
        ),
        roi_bottom_right=_parse_int_pair(
            from_env("ROI_BOTTOM_RIGHT", roi_section.get("roi_bottom_right", [775, 510])),
            (775, 510),
        ),
        use_roi=os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes"),
        detect_every_n=int(os.getenv("DETECT_EVERY_N", "3")),
        min_depth_mm=float(os.getenv("MIN_DEPTH_MM", "10")),
        max_depth_mm=float(os.getenv("MAX_DEPTH_MM", "600")),
        color_blue=_parse_color(
            from_env("COLOR_BLUE", color_section.get("blue", [255, 0, 0])),
            (255, 0, 0),
        ),
        color_red=_parse_color(
            from_env("COLOR_RED", color_section.get("red", [0, 0, 255])),
            (0, 0, 255),
        ),
        color_yellow=_parse_color(
            from_env("COLOR_YELLOW", color_section.get("yellow", [0, 255, 255])),
            (0, 255, 255),
        ),
        color_white=_parse_color(
            from_env("COLOR_WHITE", color_section.get("white", [255, 255, 255])),
            (255, 255, 255),
        ),
    )
def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool:
if not cfg.use_roi:
return True
x1, y1, x2, y2 = bbox
rx1, ry1 = cfg.roi_top_left
rx2, ry2 = cfg.roi_bottom_right
return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2
def _draw_detections(
    frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig
) -> np.ndarray:
    """Return a copy of *frame* annotated with the ROI and per-object boxes/labels."""
    canvas = frame.copy()
    # ROI rectangle and caption are always drawn: green when active, grey when off.
    roi_color = (0, 255, 0) if cfg.use_roi else (128, 128, 128)
    cv2.rectangle(canvas, cfg.roi_top_left, cfg.roi_bottom_right, roi_color, 2)
    cv2.putText(
        canvas,
        "ROI",
        (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        roi_color,
        2,
    )
    # BGR values used to visualize each recognized color class.
    palette = {
        "blue": (255, 100, 0),
        "red": (0, 0, 255),
        "yellow": (0, 255, 255),
        "white": (200, 200, 200),
        "unknown": (128, 128, 128),
    }
    for detection in objects:
        box = detection.get("bbox", [0, 0, 0, 0])
        color_name = detection.get("color", "unknown")
        draw_color = palette.get(color_name, (128, 128, 128))
        # Bounding box.
        cv2.rectangle(
            canvas, (box[0], box[1]), (box[2], box[3]), draw_color, DETECTION_BOX_THICKNESS
        )
        # "type color size" label on a filled background so black text stays readable.
        obj_type = detection.get("object_type", "?")
        size_label = detection.get("size", "?")
        label = f"{obj_type} {color_name} {size_label}"
        (text_w, text_h), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, DEFAULT_FONT_SCALE, 1
        )
        cv2.rectangle(
            canvas,
            (box[0], box[1] - text_h - 8),
            (box[0] + text_w + 4, box[1]),
            draw_color,
            -1,
        )
        cv2.putText(
            canvas,
            label,
            (box[0] + 2, box[1] - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            DEFAULT_FONT_SCALE,
            (0, 0, 0),
            1,
        )
        # Base-frame position below the box.
        pos = detection.get("position_mm", [0, 0, 0])
        cv2.putText(
            canvas,
            f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]",
            (box[0], box[3] + 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            draw_color,
            1,
        )
    return canvas
def main() -> None:
    """Run the detector node: consume frames, point clouds, and TCP poses.

    On every N-th image frame (``DETECT_EVERY_N``), and only once both a robot
    pose and a point cloud have arrived, runs YOLO, localizes each in-ROI
    detection in the robot base frame via the hand-eye calibration, and
    publishes a JSON object list plus an annotated BGR image.
    """
    image_input = os.getenv("IMAGE_INPUT", "image_bgr")
    point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    objects_output = os.getenv("OBJECTS_OUTPUT", "objects")
    image_output = os.getenv("IMAGE_OUTPUT", "image_annotated")
    calibration_file = os.getenv("CALIBRATION_FILE", "calibration.npz")
    weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS)
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    cfg = _build_config(config_file)
    model = YOLO(weights)
    # Prefer a copy of the calibration under config/ when the path is relative.
    if calibration_file and not os.path.isabs(calibration_file):
        config_path = os.path.join("config", calibration_file)
        calibration_file = config_path if os.path.exists(config_path) else calibration_file
    t_cam2gripper = _load_calibration(calibration_file)
    node = Node()
    latest_pose: Optional[List[float]] = None  # last TCP pose [x,y,z,r,p,y] (mm/deg)
    latest_point_cloud: Optional[np.ndarray] = None  # last (H, W, C) cloud in mm
    frame_count = 0
    for event in node:
        if event["type"] != "INPUT":
            continue
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                latest_pose = tcp_pose[:6].tolist()
            continue
        if event["id"] == point_cloud_input:
            latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {}))
            continue
        if event["id"] != image_input:
            continue
        frame_count += 1
        # Throttle: only run detection on every N-th frame.
        if frame_count % max(1, cfg.detect_every_n) != 0:
            continue
        # Localization needs both a robot pose and a depth point cloud.
        if latest_pose is None or latest_point_cloud is None:
            continue
        frame = _decode_image(event["value"], event.get("metadata", {}))
        results = model.predict(
            frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False
        )[0]
        base_T_flange = _pose_to_matrix(latest_pose)
        objects: List[Dict[str, Any]] = []
        for r in results.boxes:
            bbox = [int(x) for x in r.xyxy[0]]
            if not _within_roi(bbox, cfg):
                continue
            cx = int((bbox[0] + bbox[2]) / 2)
            cy = int((bbox[1] + bbox[3]) / 2)
            point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg)
            if point_cam_mm is None:
                continue  # no valid depth around the bbox center
            # Homogeneous camera-frame point, converted mm -> m.
            point_cam_m = np.array(
                [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0],
                dtype=np.float64,
            )
            point_cam_m[:3] /= 1000.0
            # camera -> gripper -> base, then back to millimetres.
            point_base = base_T_flange @ t_cam2gripper @ point_cam_m
            point_base_mm = point_base[:3] * 1000.0
            dominant = _dominant_color(frame, bbox)
            color_name = _closest_color(dominant, cfg)
            area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
            size_label = "big" if area >= cfg.size_threshold else "small"
            objects.append(
                {
                    "object_type": results.names[int(r.cls.item())],
                    "confidence": float(r.conf.item()),
                    "color": color_name,
                    "size": size_label,
                    "bbox": bbox,
                    "center_px": [cx, cy],
                    "position_mm": [
                        float(point_base_mm[0]),
                        float(point_base_mm[1]),
                        float(point_base_mm[2]),
                    ],
                    "timestamp_ns": time.time_ns(),
                }
            )
        payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()})
        node.send_output(
            objects_output,
            pa.array([payload]),
            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        # Send annotated image. Hand pyarrow the flattened numpy array directly:
        # the previous .tolist() built a multi-megabyte Python list per frame.
        annotated = _draw_detections(frame, objects, cfg)
        h, w = annotated.shape[:2]
        node.send_output(
            image_output,
            pa.array(annotated.ravel()),
            metadata={
                "encoding": "bgr8",
                "width": w,
                "height": h,
                "timestamp_ns": time.time_ns(),
            },
        )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,19 @@
[project]
name = "dora-yolo-object-detector"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for YOLO-based object detection with ZED point cloud"
requires-python = ">=3.8"
dependencies = [
    "dora-rs >= 0.3.9",
    "numpy < 2.0.0",
    "opencv-python >= 4.1.1",
    "pyarrow >= 12.0.0",
    "tomli >= 1.1.0; python_version < '3.11'",
    "ultralytics >= 8.0.0",
]
[project.scripts]
dora-yolo-object-detector = "dora_yolo_object_detector.main:main"