diff --git a/config.toml b/config.toml index 56de483..1bb0165 100644 --- a/config.toml +++ b/config.toml @@ -20,3 +20,11 @@ big_height = 125.9 small_height = 106.0 bottom_height = 68.0 normal_height = 220.0 + +# Physical object heights in mm (used for stack-aware placement) +[object_heights] +cube_big = 40.0 +cube_small = 40.0 +cylinder_big = 40.0 +cylinder_small = 40.0 +default = 40.0 diff --git a/dataflow_voice_control_ulite6_zed.yml b/dataflow_voice_control_ulite6_zed.yml index 21386a7..bf95dd3 100644 --- a/dataflow_voice_control_ulite6_zed.yml +++ b/dataflow_voice_control_ulite6_zed.yml @@ -82,7 +82,7 @@ nodes: - id: voice build: | uv venv -p 3.12 --seed --allow-existing - uv pip install -e dora_voice_control + uv pip install -e "dora_voice_control[llm]" path: dora_voice_control/dora_voice_control/main.py env: VIRTUAL_ENV: ./.venv @@ -104,6 +104,14 @@ nodes: DEFAULT_PITCH: "0.0" DEFAULT_YAW: "0.0" DRY_RUN: "false" + # Object heights for stack-aware placement (in mm) + OBJECT_HEIGHT_DEFAULT: "40.0" + OBJECT_HEIGHT_CUBE: "40.0" + OBJECT_HEIGHT_CYLINDER: "40.0" + STACK_CLEARANCE_MM: "5.0" + # LLM provider for command parsing: "rules", "gemini", "ollama", "openai" + LLM_PROVIDER: "gemini" + LLM_MODEL: "gemini-2.0-flash" # Initial position (used on startup and reset command) INIT_ON_START: "true" INIT_X: "250.0" diff --git a/dora_voice_control/dora_voice_control/core/behavior.py b/dora_voice_control/dora_voice_control/core/behavior.py index f2cb24d..68c1cfe 100644 --- a/dora_voice_control/dora_voice_control/core/behavior.py +++ b/dora_voice_control/dora_voice_control/core/behavior.py @@ -129,8 +129,9 @@ class RobotBehavior(ABC): def _init_dspy(self, llm_config: LLMConfig) -> None: """Initialize DSPy predictor for this behavior.""" + _log(f"Initializing DSPy with provider={llm_config.provider}, model={llm_config.model}") if not DSPY_AVAILABLE: - _log("DSPy not available, falling back to rules") + _log("DSPy not available (import failed), falling back to rules") return if self.CommandSignature is None: _log("No CommandSignature defined, falling back to rules") @@ -141,8 +142,12 @@ class RobotBehavior(ABC): dspy.configure(lm=lm) self._predictor = dspy.Predict(self.CommandSignature) _log(f"DSPy initialized with {llm_config.provider}/{llm_config.model}") + else: + _log(f"Failed to create LM for provider={llm_config.provider}") except Exception as e: + import traceback _log(f"Failed to initialize DSPy: {e}") + _log(f"Traceback: {traceback.format_exc()}") def _create_lm(self, config: LLMConfig) -> Optional[Any]: """Create DSPy language model.""" @@ -171,8 +176,10 @@ class RobotBehavior(ABC): def parse_command(self, transcript: str) -> Dict[str, str]: """Parse voice command using DSPy or fallback to rules.""" if self._predictor: + _log(f"Using DSPy to parse: '{transcript}'") try: result = self._predictor(comando=transcript) + _log(f"DSPy result: accion={result.accion}, objeto={result.objeto}, color={result.color}, tamano={result.tamano}") return { "resultado": "ok" if result.accion != "error" else "error", "accion": result.accion, @@ -181,7 +188,11 @@ class RobotBehavior(ABC): "tamano": result.tamano, } except Exception as e: + import traceback _log(f"DSPy parsing failed: {e}, falling back to rules") + _log(f"Traceback: {traceback.format_exc()}") + else: + _log(f"No DSPy predictor, using rules to parse: '{transcript}'") return self.rule_parse(transcript) def rule_parse(self, transcript: str) -> Dict[str, str]: diff --git a/dora_voice_control/dora_voice_control/core/scene.py b/dora_voice_control/dora_voice_control/core/scene.py index 309e093..cd978f0 100644 --- a/dora_voice_control/dora_voice_control/core/scene.py +++ b/dora_voice_control/dora_voice_control/core/scene.py @@ -73,6 +73,10 @@ class SceneState: def __init__(self) -> None: self._lock = threading.Lock() self._objects: Dict[str, SceneObject] = {} + # Scene update mode: "static" (default) or "dynamic" + self._update_mode: str = os.getenv("SCENE_UPDATE_MODE", "static").lower() + # Whether scene has been captured (STATIC mode only) + self._scene_captured: bool = False # === Core Operations === @@ -146,6 +150,8 @@ class SceneState: for obj in self._objects.values(): if obj.on_top_of and obj.on_top_of not in self._objects: obj.on_top_of = None + # Reset capture flag to allow next detection + self._scene_captured = False # === Query === @@ -176,6 +182,22 @@ class SceneState: with self._lock: return len(self._objects) + # === Scene Update Mode === + + def is_static_mode(self) -> bool: + """Check if in STATIC update mode.""" + return self._update_mode == "static" + + def is_captured(self) -> bool: + """Check if scene has been captured (STATIC mode).""" + with self._lock: + return self._scene_captured + + def reset_capture(self) -> None: + """Reset capture flag to allow next detection to update scene.""" + with self._lock: + self._scene_captured = False + # === Spatial Relationships === def set_on_top_of(self, object_id: str, below_id: Optional[str]) -> bool: @@ -560,6 +582,10 @@ class ObjectsHandler: if not raw: return + # In STATIC mode, ignore updates after scene is captured + if self._scene.is_static_mode() and self._scene.is_captured(): + return + try: payload = json.loads(raw) objects = payload.get("objects", []) @@ -569,5 +595,11 @@ class ObjectsHandler: self._scene.replace_detected(objects) + # Mark scene as captured after first successful update (STATIC mode) + if self._scene.is_static_mode(): + with self._scene._lock: + self._scene._scene_captured = True + self._logger.log("Scene captured (STATIC mode)") + # Emit scene update self._notifier.send_scene_update() diff --git a/dora_voice_control/dora_voice_control/core/state.py b/dora_voice_control/dora_voice_control/core/state.py index 00ce664..10cc3e4 100644 --- a/dora_voice_control/dora_voice_control/core/state.py +++ b/dora_voice_control/dora_voice_control/core/state.py @@ -24,6 +24,7 @@ class VoiceState: latest_pose_at: Optional[float] = None pending_command: Optional[Dict[str, Any]] = None queue: Deque[RobotStep] = field(default_factory=deque) + held_object_id: Optional[str] = None # ID of currently held object @dataclass @@ -192,6 +193,21 @@ class SharedState: with self._lock: return self._debug_state.last_parse_result + def get_held_object_id(self) -> Optional[str]: + """Get the ID of the currently held object.""" + with self._lock: + return self._voice_state.held_object_id + + def set_held_object_id(self, object_id: Optional[str]) -> None: + """Set the ID of the currently held object.""" + with self._lock: + self._voice_state.held_object_id = object_id + + def clear_held_object(self) -> None: + """Clear the held object (after releasing).""" + with self._lock: + self._voice_state.held_object_id = None + def _age_ms(timestamp: Optional[float]) -> Optional[int]: """Calculate age in milliseconds from monotonic timestamp.""" diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/actions.py b/dora_voice_control/dora_voice_control/robots/littlehand/actions.py index 6481e1b..154240b 100644 --- a/dora_voice_control/dora_voice_control/robots/littlehand/actions.py +++ b/dora_voice_control/dora_voice_control/robots/littlehand/actions.py @@ -8,39 +8,39 @@ from ...core.behavior import ActionInfo LITTLEHAND_ACTIONS: dict[str, ActionInfo] = { "subir": ActionInfo( name="subir", - aliases=["sube", "arriba"], + aliases=[], requires_pose=True, description="Subir el robot", ), "bajar": ActionInfo( name="bajar", - aliases=["baja", "abajo"], + aliases=[], requires_pose=True, description="Bajar el robot", ), "ir": ActionInfo( name="ir", - aliases=["ve", "mover", "muevete", "acercar"], + aliases=[], requires_object=True, description="Ir hacia un objeto", ), "tomar": ActionInfo( name="tomar", - aliases=["toma", "agarra", "agarrar", "coger", "chupar", "succionar"], + aliases=[], requires_pose=False, requires_object=False, description="Tomar un objeto", ), "soltar": ActionInfo( name="soltar", - aliases=["deja", "dejar"], + aliases=[], requires_pose=False, requires_object=False, description="Soltar el objeto", ), "reiniciar": ActionInfo( name="reiniciar", - aliases=["reinicia", "reset"], + aliases=[], requires_pose=False, requires_object=False, description="Reiniciar a posicion inicial", diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py b/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py index 29179ad..21ce03d 100644 --- a/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py +++ b/dora_voice_control/dora_voice_control/robots/littlehand/behavior.py @@ -10,6 +10,7 @@ from .actions import LITTLEHAND_ACTIONS from .signature import LittlehandSignature _XY_MATCH_RADIUS_MM = float(os.getenv("BAJAR_XY_RADIUS_MM", "40.0")) +_STACK_CLEARANCE_MM = float(os.getenv("STACK_CLEARANCE_MM", "5.0")) # Clearance when placing on top of objects class LittlehandBehavior(RobotBehavior): """Littlehand behavior using the default pick-and-place actions.""" @@ -35,20 +36,92 @@ class LittlehandBehavior(RobotBehavior): return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z) def action_bajar(self, ctx: ActionContext) -> bool: - """Move down by step_mm or to top of object under the tool.""" - target = self._find_object_under_pose(ctx) - if target is not None: - target_z = target.position_mm[2] + ctx.config.tcp_offset_mm - _log( - f"bajar: using object '{target.object_type}' color={target.color} " - f"obj_z={target.position_mm[2]:.1f} tcp_offset={ctx.config.tcp_offset_mm:.1f} " - f"target_z={target_z:.1f} at pose_z={ctx.pose[2]:.1f}" - ) + """Move down by step_mm or to top of object under the tool. + + If holding an object, accounts for its height when placing on obstacles. + + Note: position_mm[2] from the camera represents the TOP surface of the object + (camera looks down, so it sees the top). We use height_mm only for the HELD + object to calculate placement position. + """ + obstacle = self._find_object_under_pose(ctx) + + # Get held object height for stack-aware placement + held_height = self._get_held_object_height(ctx) + + if obstacle is not None: + # obstacle.position_mm[2] is the TOP surface of the obstacle + obstacle_top_z = obstacle.position_mm[2] + + if held_height > 0: + # Stack-aware: place held object on top of obstacle + # When vacuum releases, bottom of held object should be at obstacle_top + # TCP needs to be at: obstacle_top + held_height + tcp_offset + target_z = obstacle_top_z + held_height + _STACK_CLEARANCE_MM + ctx.config.tcp_offset_mm + _log( + f"bajar: STACK-AWARE placement on '{obstacle.object_type}' color={obstacle.color} " + f"obstacle_top_z={obstacle_top_z:.1f} held_height={held_height:.1f} " + f"clearance={_STACK_CLEARANCE_MM:.1f} tcp_offset={ctx.config.tcp_offset_mm:.1f} " + f"target_z={target_z:.1f}" + ) + else: + # Not holding anything: move TCP to object top (for grabbing) + target_z = obstacle_top_z + ctx.config.tcp_offset_mm + _log( + f"bajar: move to object '{obstacle.object_type}' color={obstacle.color} " + f"obstacle_top_z={obstacle_top_z:.1f} tcp_offset={ctx.config.tcp_offset_mm:.1f} " + f"target_z={target_z:.1f}" + ) return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z) + target_z = ctx.pose[2] - self.config.step_mm _log(f"bajar: no object under pose, step to z={target_z:.1f}") return self._queue_move(ctx, ctx.pose[0], ctx.pose[1], target_z) + def _get_held_object_height(self, ctx: ActionContext) -> float: + """Get the height of the currently held object. + + Uses configured height from config.toml [object_heights] section, + falling back to the detected height_mm. + """ + held_id = ctx.shared_state.get_held_object_id() + if not held_id: + return 0.0 + + held_obj = ctx.scene.get(held_id) + if not held_obj: + _log(f"bajar: held object id={held_id} not found in scene") + return 0.0 + + # Use configured height based on object type and size + height = self._get_configured_height(held_obj.object_type, held_obj.size) + _log(f"bajar: holding object id={held_id} type={held_obj.object_type} size={held_obj.size} height={height:.1f}mm") + return height + + def _get_configured_height(self, object_type: str, size: str) -> float: + """Get configured height for object type/size from environment or defaults.""" + # Try specific key like OBJECT_HEIGHT_CUBE_BIG + key = f"OBJECT_HEIGHT_{object_type.upper()}_{size.upper()}" + height_str = os.getenv(key) + if height_str: + try: + return float(height_str) + except ValueError: + pass + + # Try generic key like OBJECT_HEIGHT_CUBE + key = f"OBJECT_HEIGHT_{object_type.upper()}" + height_str = os.getenv(key) + if height_str: + try: + return float(height_str) + except ValueError: + pass + + # Default height (configurable via OBJECT_HEIGHT_DEFAULT) + default = float(os.getenv("OBJECT_HEIGHT_DEFAULT", "40.0")) + return default + def action_ir(self, ctx: ActionContext) -> bool: """Move to object X/Y while keeping current Z.""" if ctx.pose is None or ctx.target is None: @@ -57,24 +130,45 @@ class LittlehandBehavior(RobotBehavior): return self._queue_move(ctx, pos[0], pos[1], ctx.pose[2]) def action_tomar(self, ctx: ActionContext) -> bool: - """Activate tool (low-level grab).""" + """Activate tool (low-level grab) and track held object.""" self._queue_steps(ctx, self.robot_adapter.grab()) + + # Find the object under current pose to track for stack-aware placement + # (ctx.target may be None since requires_object=False in littlehand) + target_obj = ctx.target or self._find_object_under_pose(ctx) + if target_obj is not None: + ctx.shared_state.set_held_object_id(target_obj.id) + height = self._get_configured_height(target_obj.object_type, target_obj.size) + _log(f"tomar: now holding object id={target_obj.id} type={target_obj.object_type} height={height:.1f}mm") + else: + _log("tomar: no object found under pose, not tracking held object") return True def action_soltar(self, ctx: ActionContext) -> bool: - """Deactivate tool (low-level release).""" + """Deactivate tool (low-level release) and clear held object.""" self._queue_steps(ctx, self.robot_adapter.release()) + # Clear the held object tracking + held_id = ctx.shared_state.get_held_object_id() + if held_id: + _log(f"soltar: released object id={held_id}") + ctx.shared_state.clear_held_object() return True def action_reiniciar(self, ctx: ActionContext) -> bool: - """Reset: release tool, move home, clear objects.""" + """Reset: release tool, move home, clear objects and held state.""" self._queue_steps(ctx, self.robot_adapter.reset_tool()) self._queue_steps(ctx, self.robot_adapter.move(ctx.home_pose)) ctx.scene.clear_detected() + ctx.shared_state.clear_held_object() + _log("reiniciar: cleared held object state") return True def _find_object_under_pose(self, ctx: ActionContext) -> Optional["SceneObject"]: - """Find the topmost object near the current pose x,y (mm).""" + """Find the topmost object near the current pose x,y (mm). + + Note: position_mm[2] is treated as the TOP surface of the object + (camera looks down, sees the top surface). + """ if ctx.pose is None: _log("bajar: missing pose, cannot find object under tool") return None @@ -89,20 +183,19 @@ class LittlehandBehavior(RobotBehavior): dist2 = dx * dx + dy * dy if dist2 > _XY_MATCH_RADIUS_MM * _XY_MATCH_RADIUS_MM: continue - top_surface = obj.position_mm[2] + obj.height_mm - candidates.append((top_surface, obj)) + # position_mm[2] IS the top surface (camera sees top of object) + top_surface_z = obj.position_mm[2] + candidates.append((top_surface_z, obj)) _log( "bajar: near id={} type={} color={} center=({:.1f},{:.1f}) " - "dist_xy={:.1f} obj_z={:.1f} height={:.1f} top_z={:.1f}".format( + "dist_xy={:.1f} top_z={:.1f}".format( obj.id, obj.object_type, obj.color, obj.position_mm[0], obj.position_mm[1], (dist2 ** 0.5), - obj.position_mm[2], - obj.height_mm, - top_surface, + top_surface_z, ) ) if not candidates: diff --git a/dora_voice_control/dora_voice_control/robots/littlehand/signature.py b/dora_voice_control/dora_voice_control/robots/littlehand/signature.py index 83a073d..b7fcc3a 100644 --- a/dora_voice_control/dora_voice_control/robots/littlehand/signature.py +++ b/dora_voice_control/dora_voice_control/robots/littlehand/signature.py @@ -14,7 +14,11 @@ if dspy is not None: comando = dspy.InputField(desc="Voice command in Spanish") accion = dspy.OutputField( - desc="Action name: subir, bajar, ir, tomar, soltar, reiniciar or error" + desc=( + "Accion: subir, bajar, ir, tomar, soltar, reiniciar o error. " + "Mapea errores infantiles, parafrasis y sinonimos cercanos a la accion valida mas cercana. " + "Si la intencion es ambigua o no relacionada, devuelve error." + ) ) objeto = dspy.OutputField( desc="Object name (cubo, cilindro, estrella, caja) or 'no especificado'" diff --git a/dora_yolo_object_detector/dora_yolo_object_detector/main.py b/dora_yolo_object_detector/dora_yolo_object_detector/main.py index 1bfdb17..589c877 100644 --- a/dora_yolo_object_detector/dora_yolo_object_detector/main.py +++ b/dora_yolo_object_detector/dora_yolo_object_detector/main.py @@ -185,6 +185,60 @@ def _sample_point( return np.median(np.stack(samples, axis=0), axis=0) +def _estimate_object_height( + point_cloud: np.ndarray, bbox: List[int], cfg: DetectionConfig, sample_step: int = 4 +) -> Optional[float]: + """Estimate object height by sampling Z values within the bounding box. + + Samples points in a grid within the bbox, finds the Z range (max - min), + which corresponds to the object height. + + Args: + point_cloud: The point cloud array (H, W, channels) with XYZ in mm. + bbox: Bounding box [x1, y1, x2, y2] in pixels. + cfg: Detection config for depth validation. + sample_step: Step size for grid sampling (smaller = more samples). + + Returns: + Estimated height in mm, or None if not enough valid points. + """ + x1, y1, x2, y2 = bbox + h, w, _ = point_cloud.shape + + # Clamp bbox to image bounds + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(w, x2) + y2 = min(h, y2) + + if x2 <= x1 or y2 <= y1: + return None + + # Sample points in a grid within the bounding box + z_values = [] + for y in range(y1, y2, sample_step): + for x in range(x1, x2, sample_step): + point_xyz = point_cloud[y, x, :3].astype(np.float64) + if _valid_point(point_xyz, cfg): + z_values.append(point_xyz[2]) + + if len(z_values) < 5: + return None + + # Use percentiles to filter outliers (table surface, noise) + z_array = np.array(z_values) + z_min = np.percentile(z_array, 10) # Top of object (closer to camera = smaller Z) + z_max = np.percentile(z_array, 90) # Bottom/table level (farther = larger Z) + + height = z_max - z_min + + # Sanity check: height should be positive and reasonable (5mm to 200mm) + if height < 5.0 or height > 200.0: + return None + + return float(height) + + def _dominant_color(image: np.ndarray, bbox: List[int]) -> Tuple[int, int, int]: x1, y1, x2, y2 = bbox x1 = max(0, x1) @@ -482,22 +536,27 @@ def main() -> None: area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) size_label = "big" if area >= cfg.size_threshold else "small" - objects.append( - { - "object_type": results.names[int(r.cls.item())], - "confidence": float(r.conf.item()), - "color": color_name, - "size": size_label, - "bbox": bbox, - "center_px": [cx, cy], - "position_mm": [ - float(point_base_mm[0]), - float(point_base_mm[1]), - float(point_base_mm[2]), - ], - "timestamp_ns": time.time_ns(), - } - ) + # Estimate object height from point cloud + height_mm = _estimate_object_height(latest_point_cloud, bbox, cfg) + + obj_data = { + "object_type": results.names[int(r.cls.item())], + "confidence": float(r.conf.item()), + "color": color_name, + "size": size_label, + "bbox": bbox, + "center_px": [cx, cy], + "position_mm": [ + float(point_base_mm[0]), + float(point_base_mm[1]), + float(point_base_mm[2]), + ], + "timestamp_ns": time.time_ns(), + } + if height_mm is not None: + obj_data["height_mm"] = height_mm + + objects.append(obj_data) payload = json.dumps({"objects": objects, "timestamp_ns": time.time_ns()}) node.send_output(