Refactor voice control core and robot behavior

This commit is contained in:
cristhian aguilera
2026-02-02 12:29:59 -03:00
parent b9798a2f46
commit 695d309816
36 changed files with 3436 additions and 1065 deletions

View File

@@ -0,0 +1 @@
"""Dora detector node package."""

View File

@@ -0,0 +1,524 @@
"""Dora node for YOLO detection and base-frame object localization."""
from __future__ import annotations
import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
import pyarrow as pa
from dora import Node
from ultralytics import YOLO
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib
# Default YOLO weights path, resolved relative to the current working directory.
# NOTE(review): ultralytics' standard nano weight file is named "yolov8n.pt" —
# confirm "yolo8n.pt" matches the actual file shipped in trained_models/.
DEFAULT_WEIGHTS = os.path.join(os.getcwd(), "trained_models", "yolo8n.pt")
# Detection visualization constants
POINT_SEARCH_RADIUS = 3  # pixel radius searched around a bbox center for a valid depth point
DEFAULT_FONT_SCALE = 0.5  # font scale for detection label text
DETECTION_BOX_THICKNESS = 2  # line thickness for detection bounding boxes
def _log(msg: str) -> None:
"""Print a timestamped log message."""
timestamp = time.strftime("%H:%M:%S")
print(f"[detector {timestamp}] {msg}", flush=True)
@dataclass
class DetectionConfig:
    """Runtime configuration for detection, ROI filtering, depth sampling, and colors."""

    imgsz: int  # YOLO inference image size
    conf: float  # YOLO confidence threshold
    iou: float  # YOLO IoU threshold for NMS
    size_threshold: int  # bbox pixel area separating "small" from "big" objects
    roi_top_left: Tuple[int, int]  # ROI top-left corner (x, y), pixels
    roi_bottom_right: Tuple[int, int]  # ROI bottom-right corner (x, y), pixels
    use_roi: bool  # if True, detections outside the ROI are discarded
    detect_every_n: int  # run detection only on every N-th frame
    min_depth_mm: float  # minimum accepted depth for a point-cloud sample
    max_depth_mm: float  # maximum accepted depth for a point-cloud sample
    color_blue: Tuple[int, int, int]  # reference value for "blue" (defaults suggest BGR order)
    color_red: Tuple[int, int, int]  # reference value for "red"
    color_yellow: Tuple[int, int, int]  # reference value for "yellow"
    color_white: Tuple[int, int, int]  # reference value for "white"
def _parse_int_pair(raw: str, default: Tuple[int, int]) -> Tuple[int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return int(parts[0]), int(parts[1])
except Exception as e:
_log(f"Warning: failed to parse int pair '{raw}': {e}")
return default
def _parse_float_pair(raw: str, default: Tuple[float, float]) -> Tuple[float, float]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 2:
return float(parts[0]), float(parts[1])
except Exception as e:
_log(f"Warning: failed to parse float pair '{raw}': {e}")
return default
def _parse_color(raw: str, default: Tuple[int, int, int]) -> Tuple[int, int, int]:
try:
parts = [p.strip() for p in raw.split(",")]
if len(parts) >= 3:
return int(parts[0]), int(parts[1]), int(parts[2])
except Exception as e:
_log(f"Warning: failed to parse color '{raw}': {e}")
return default
def _rotation_matrix_xyz(
roll_deg: float, pitch_deg: float, yaw_deg: float
) -> np.ndarray:
roll = np.deg2rad(roll_deg)
pitch = np.deg2rad(pitch_deg)
yaw = np.deg2rad(yaw_deg)
cx, sx = np.cos(roll), np.sin(roll)
cy, sy = np.cos(pitch), np.sin(pitch)
cz, sz = np.cos(yaw), np.sin(yaw)
rot_x = np.array([[1.0, 0.0, 0.0], [0.0, cx, -sx], [0.0, sx, cx]])
rot_y = np.array([[cy, 0.0, sy], [0.0, 1.0, 0.0], [-sy, 0.0, cy]])
rot_z = np.array([[cz, -sz, 0.0], [sz, cz, 0.0], [0.0, 0.0, 1.0]])
return rot_z @ rot_y @ rot_x
def _pose_to_matrix(tcp_pose_mm_deg: List[float]) -> np.ndarray:
    """Build a 4x4 homogeneous transform from ``[x, y, z, roll, pitch, yaw]``.

    Translation is given in millimetres and stored in metres; angles in degrees.
    """
    x_mm, y_mm, z_mm, roll, pitch, yaw = tcp_pose_mm_deg
    transform = np.eye(4)
    transform[:3, :3] = _rotation_matrix_xyz(roll, pitch, yaw)
    transform[:3, 3] = np.array([x_mm, y_mm, z_mm], dtype=np.float64) / 1000.0
    return transform
def _decode_image(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
encoding = str(metadata.get("encoding", "bgr8")).lower()
width = metadata.get("width")
height = metadata.get("height")
if (width is None or height is None) and "shape" in metadata:
shape = metadata.get("shape")
if isinstance(shape, (list, tuple)) and len(shape) >= 2:
height = height if height is not None else int(shape[0])
width = width if width is not None else int(shape[1])
if width is None or height is None:
raise KeyError("width/height (or shape) missing from metadata")
if encoding == "bgr8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return frame.copy()
if encoding == "rgb8":
frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if encoding in {"jpeg", "jpg", "jpe", "bmp", "webp", "png"}:
frame = storage.to_numpy().astype(np.uint8)
return cv2.imdecode(frame, cv2.IMREAD_COLOR)
if encoding == "yuv420":
yuv = storage.to_numpy().astype(np.uint8)
yuv = yuv[: width * height * 3 // 2].reshape((height * 3 // 2, width))
return cv2.cvtColor(yuv, cv2.COLOR_YUV420p2BGR)
raise RuntimeError(f"Unsupported image encoding: {encoding}")
def _decode_point_cloud(storage: pa.Array, metadata: Dict[str, Any]) -> np.ndarray:
dtype_str = str(metadata.get("dtype", "float32"))
if dtype_str != "float32":
raise RuntimeError(f"Unsupported point cloud dtype: {dtype_str}")
shape = metadata.get("shape")
if not isinstance(shape, (list, tuple)) or len(shape) < 3:
raise KeyError("point cloud shape missing from metadata")
height, width, channels = [int(v) for v in shape[:3]]
if channels < 3:
raise ValueError("point cloud requires at least 3 channels")
raw = storage.to_numpy().astype(np.uint8).tobytes()
values = np.frombuffer(raw, dtype=np.float32)
return values.reshape((height, width, channels))
def _valid_point(point_xyz: np.ndarray, cfg: DetectionConfig) -> bool:
if not np.all(np.isfinite(point_xyz)):
return False
z = float(point_xyz[2])
if z <= 0:
return False
if z < cfg.min_depth_mm or z > cfg.max_depth_mm:
return False
return True
def _sample_point(
    point_cloud: np.ndarray, x: int, y: int, cfg: DetectionConfig
) -> Optional[np.ndarray]:
    """Sample a 3D point at pixel (x, y), falling back to a neighborhood median.

    Returns the point at (x, y) when it passes the validity check; otherwise the
    per-axis median of all valid points within POINT_SEARCH_RADIUS pixels, or
    None when the pixel is out of bounds or no valid neighbor exists.
    """
    height, width, _ = point_cloud.shape
    if not (0 <= x < width and 0 <= y < height):
        return None
    center = point_cloud[y, x, :3].astype(np.float64)
    if _valid_point(center, cfg):
        return center
    neighborhood = [
        point_cloud[ny, nx, :3].astype(np.float64)
        for ny in range(y - POINT_SEARCH_RADIUS, y + POINT_SEARCH_RADIUS + 1)
        for nx in range(x - POINT_SEARCH_RADIUS, x + POINT_SEARCH_RADIUS + 1)
        if 0 <= nx < width and 0 <= ny < height
    ]
    valid = [p for p in neighborhood if _valid_point(p, cfg)]
    if not valid:
        return None
    return np.median(np.stack(valid, axis=0), axis=0)
def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]:
x1, y1, x2, y2 = bbox
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(image.shape[1], x2)
y2 = min(image.shape[0], y2)
roi = image[y1:y2, x1:x2]
if roi.size == 0:
return (0, 0, 0)
color = np.median(roi, axis=(0, 1)).astype(int)
return int(color[0]), int(color[1]), int(color[2])
def _closest_color(color: Tuple[int, int, int], cfg: DetectionConfig) -> str:
colors = {
"blue": np.array(cfg.color_blue, dtype=np.float64),
"red": np.array(cfg.color_red, dtype=np.float64),
"yellow": np.array(cfg.color_yellow, dtype=np.float64),
"white": np.array(cfg.color_white, dtype=np.float64),
}
color_vec = np.array(color, dtype=np.float64)
best_name = "unknown"
best_dist = float("inf")
for name, value in colors.items():
dist = np.linalg.norm(color_vec - value)
if dist < best_dist:
best_name = name
best_dist = dist
return best_name
def _load_calibration(calibration_file: str) -> np.ndarray:
calib = np.load(calibration_file, allow_pickle=True)
t_cam2gripper = calib["T_cam2gripper"]
return t_cam2gripper
def _load_config_file(path: str) -> Dict[str, Any]:
if not path or not os.path.exists(path):
return {}
try:
with open(path, "rb") as handle:
return tomllib.load(handle)
except Exception as e:
_log(f"Warning: failed to load config file '{path}': {e}")
return {}
def _build_config(config_path: str) -> DetectionConfig:
    """Assemble a DetectionConfig from the TOML file, overridable via env vars.

    Precedence: environment variable > config-file value > hard-coded default.
    """
    file_cfg = _load_config_file(config_path)
    roi_section = file_cfg.get("roi", {})
    color_section = file_cfg.get("dominant_colors", {})
    object_section = file_cfg.get("object_parameters", {})

    def from_env(name: str, values) -> str:
        # Env var wins; otherwise serialize the config-file value as "a,b[,c]".
        return os.getenv(name, ",".join(str(v) for v in values))

    return DetectionConfig(
        imgsz=int(os.getenv("YOLO_IMGSZ", "640")),
        conf=float(os.getenv("YOLO_CONF", "0.25")),
        iou=float(os.getenv("YOLO_IOU", "0.45")),
        size_threshold=int(
            os.getenv("SIZE_THRESHOLD", str(object_section.get("size_threshold", 4200)))
        ),
        roi_top_left=_parse_int_pair(
            from_env("ROI_TOP_LEFT", roi_section.get("roi_top_left", [500, 230])),
            (500, 230),
        ),
        roi_bottom_right=_parse_int_pair(
            from_env("ROI_BOTTOM_RIGHT", roi_section.get("roi_bottom_right", [775, 510])),
            (775, 510),
        ),
        use_roi=os.getenv("USE_ROI", "true").lower() in ("true", "1", "yes"),
        detect_every_n=int(os.getenv("DETECT_EVERY_N", "3")),
        min_depth_mm=float(os.getenv("MIN_DEPTH_MM", "10")),
        max_depth_mm=float(os.getenv("MAX_DEPTH_MM", "600")),
        color_blue=_parse_color(
            from_env("COLOR_BLUE", color_section.get("blue", [255, 0, 0])),
            (255, 0, 0),
        ),
        color_red=_parse_color(
            from_env("COLOR_RED", color_section.get("red", [0, 0, 255])),
            (0, 0, 255),
        ),
        color_yellow=_parse_color(
            from_env("COLOR_YELLOW", color_section.get("yellow", [0, 255, 255])),
            (0, 255, 255),
        ),
        color_white=_parse_color(
            from_env("COLOR_WHITE", color_section.get("white", [255, 255, 255])),
            (255, 255, 255),
        ),
    )
def _within_roi(bbox: List[int], cfg: DetectionConfig) -> bool:
if not cfg.use_roi:
return True
x1, y1, x2, y2 = bbox
rx1, ry1 = cfg.roi_top_left
rx2, ry2 = cfg.roi_bottom_right
return x1 >= rx1 and y1 >= ry1 and x2 <= rx2 and y2 <= ry2
def _draw_detections(
    frame: np.ndarray, objects: List[Dict[str, Any]], cfg: DetectionConfig
) -> np.ndarray:
    """Return a copy of *frame* annotated with the ROI and per-object boxes/labels."""
    canvas = frame.copy()
    # ROI rectangle and caption are always drawn: green when active, grey when off.
    roi_color = (0, 255, 0) if cfg.use_roi else (128, 128, 128)
    cv2.rectangle(canvas, cfg.roi_top_left, cfg.roi_bottom_right, roi_color, 2)
    cv2.putText(
        canvas,
        "ROI",
        (cfg.roi_top_left[0] + 5, cfg.roi_top_left[1] + 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        roi_color,
        2,
    )
    # BGR values used to visualize each recognized color class.
    palette = {
        "blue": (255, 100, 0),
        "red": (0, 0, 255),
        "yellow": (0, 255, 255),
        "white": (200, 200, 200),
        "unknown": (128, 128, 128),
    }
    for detection in objects:
        box = detection.get("bbox", [0, 0, 0, 0])
        color_name = detection.get("color", "unknown")
        draw_color = palette.get(color_name, (128, 128, 128))
        # Bounding box.
        cv2.rectangle(
            canvas, (box[0], box[1]), (box[2], box[3]), draw_color, DETECTION_BOX_THICKNESS
        )
        # "type color size" label on a filled background so black text stays readable.
        obj_type = detection.get("object_type", "?")
        size_label = detection.get("size", "?")
        label = f"{obj_type} {color_name} {size_label}"
        (text_w, text_h), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, DEFAULT_FONT_SCALE, 1
        )
        cv2.rectangle(
            canvas,
            (box[0], box[1] - text_h - 8),
            (box[0] + text_w + 4, box[1]),
            draw_color,
            -1,
        )
        cv2.putText(
            canvas,
            label,
            (box[0] + 2, box[1] - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            DEFAULT_FONT_SCALE,
            (0, 0, 0),
            1,
        )
        # Base-frame position below the box.
        pos = detection.get("position_mm", [0, 0, 0])
        cv2.putText(
            canvas,
            f"[{pos[0]:.0f}, {pos[1]:.0f}, {pos[2]:.0f}]",
            (box[0], box[3] + 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            draw_color,
            1,
        )
    return canvas
def main() -> None:
    """Run the detector node: consume frames, point clouds, and TCP poses.

    On every N-th image frame (``DETECT_EVERY_N``), and only once both a robot
    pose and a point cloud have arrived, runs YOLO, localizes each in-ROI
    detection in the robot base frame via the hand-eye calibration, and
    publishes a JSON object list plus an annotated BGR image.
    """
    image_input = os.getenv("IMAGE_INPUT", "image_bgr")
    point_cloud_input = os.getenv("POINT_CLOUD_INPUT", "point_cloud")
    pose_input = os.getenv("POSE_INPUT", "tcp_pose")
    objects_output = os.getenv("OBJECTS_OUTPUT", "objects")
    image_output = os.getenv("IMAGE_OUTPUT", "image_annotated")
    calibration_file = os.getenv("CALIBRATION_FILE", "calibration.npz")
    weights = os.getenv("DETECTOR_WEIGHTS", DEFAULT_WEIGHTS)
    config_file = os.getenv("CONFIG_FILE", "config.toml")
    cfg = _build_config(config_file)
    model = YOLO(weights)
    # Prefer a copy of the calibration under config/ when the path is relative.
    if calibration_file and not os.path.isabs(calibration_file):
        config_path = os.path.join("config", calibration_file)
        calibration_file = config_path if os.path.exists(config_path) else calibration_file
    t_cam2gripper = _load_calibration(calibration_file)
    node = Node()
    latest_pose: Optional[List[float]] = None  # last TCP pose [x,y,z,r,p,y] (mm/deg)
    latest_point_cloud: Optional[np.ndarray] = None  # last (H, W, C) cloud in mm
    frame_count = 0
    for event in node:
        if event["type"] != "INPUT":
            continue
        if event["id"] == pose_input:
            tcp_pose = event["value"].to_numpy().astype(np.float64).reshape(-1)
            if tcp_pose.size >= 6:
                latest_pose = tcp_pose[:6].tolist()
            continue
        if event["id"] == point_cloud_input:
            latest_point_cloud = _decode_point_cloud(event["value"], event.get("metadata", {}))
            continue
        if event["id"] != image_input:
            continue
        frame_count += 1
        # Throttle: only run detection on every N-th frame.
        if frame_count % max(1, cfg.detect_every_n) != 0:
            continue
        # Localization needs both a robot pose and a depth point cloud.
        if latest_pose is None or latest_point_cloud is None:
            continue
        frame = _decode_image(event["value"], event.get("metadata", {}))
        results = model.predict(
            frame, imgsz=cfg.imgsz, conf=cfg.conf, iou=cfg.iou, verbose=False
        )[0]
        base_T_flange = _pose_to_matrix(latest_pose)
        objects: List[Dict[str, Any]] = []
        for r in results.boxes:
            bbox = [int(x) for x in r.xyxy[0]]
            if not _within_roi(bbox, cfg):
                continue
            cx = int((bbox[0] + bbox[2]) / 2)
            cy = int((bbox[1] + bbox[3]) / 2)
            point_cam_mm = _sample_point(latest_point_cloud, cx, cy, cfg)
            if point_cam_mm is None:
                continue  # no valid depth around the bbox center
            # Homogeneous camera-frame point, converted mm -> m.
            point_cam_m = np.array(
                [point_cam_mm[0], point_cam_mm[1], point_cam_mm[2], 1.0],
                dtype=np.float64,
            )
            point_cam_m[:3] /= 1000.0
            # camera -> gripper -> base, then back to millimetres.
            point_base = base_T_flange @ t_cam2gripper @ point_cam_m
            point_base_mm = point_base[:3] * 1000.0
            dominant = _dominant_color(frame, bbox)
            color_name = _closest_color(dominant, cfg)
            area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
            size_label = "big" if area >= cfg.size_threshold else "small"
            objects.append(
                {
                    "object_type": results.names[int(r.cls.item())],
                    "confidence": float(r.conf.item()),
                    "color": color_name,
                    "size": size_label,
                    "bbox": bbox,
                    "center_px": [cx, cy],
                    "position_mm": [
                        float(point_base_mm[0]),
                        float(point_base_mm[1]),
                        float(point_base_mm[2]),
                    ],
                    "timestamp_ns": time.time_ns(),
                }
            )
        payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()})
        node.send_output(
            objects_output,
            pa.array([payload]),
            metadata={"encoding": "json", "timestamp_ns": time.time_ns()},
        )
        # Send annotated image. Hand pyarrow the flattened numpy array directly:
        # the previous .tolist() built a multi-megabyte Python list per frame.
        annotated = _draw_detections(frame, objects, cfg)
        h, w = annotated.shape[:2]
        node.send_output(
            image_output,
            pa.array(annotated.ravel()),
            metadata={
                "encoding": "bgr8",
                "width": w,
                "height": h,
                "timestamp_ns": time.time_ns(),
            },
        )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,19 @@
[project]
name = "dora-yolo-object-detector"
version = "0.1.0"
license = { text = "MIT" }
authors = [{ name = "Dora" }]
description = "Dora node for YOLO-based object detection with ZED point cloud"
requires-python = ">=3.8"
dependencies = [
    "dora-rs >= 0.3.9",
    "numpy < 2.0.0",
    "opencv-python >= 4.1.1",
    "pyarrow >= 12.0.0",
    "tomli >= 1.1.0; python_version < '3.11'",
    "ultralytics >= 8.0.0",
]
[project.scripts]
dora-yolo-object-detector = "dora_yolo_object_detector.main:main"