';
+ });
+ if (crossConns.length > 5) {
+ html += '+' + (crossConns.length - 5) + ' more';
+ }
+ html += '';
+ }
+
+ return html;
+ }
+
+ function escapeHtml(str) {
+ var div = document.createElement("div");
+ div.textContent = str;
+ return div.innerHTML;
+ }
+
+ function handleMouseOver(event, d) {
+ if (!entryAnimationDone) return;
+ isHighlighting = true;
+ var connected = adjacency[d.id] || {};
+
+ nodeElements
+ .transition().duration(dur(200)).ease(d3.easeQuadOut)
+ .style("opacity", function (n) {
+ if (n.id === d.id) return 1;
+ if (connected[n.id]) return 1;
+ return 0.08;
+ })
+ .attr("r", function (n) {
+ if (n.id === d.id) {
+ return n.type === "category" ? n.radius * 1.15 : 8;
+ }
+ return n.type === "category" ? n.radius : 5;
+ });
+
+ linkElements
+ .transition().duration(dur(200)).ease(d3.easeQuadOut)
+ .style("opacity", function (l) {
+ var sid = l.source.id, tid = l.target.id;
+ if (sid === d.id || tid === d.id) return 1;
+ return 0.03;
+ })
+ .style("stroke-opacity", function (l) {
+ var sid = l.source.id, tid = l.target.id;
+ if (sid === d.id || tid === d.id) return 0.7;
+ return 0.05;
+ })
+ .style("stroke-width", function (l) {
+ var sid = l.source.id, tid = l.target.id;
+ if (sid === d.id || tid === d.id) return l.type === "cross" ? 2.5 : 1.2;
+ return l.type === "membership" ? 0.5 : 1;
+ });
+
+ labelElements
+ .transition().duration(dur(200)).ease(d3.easeQuadOut)
+ .style("opacity", function (n) {
+ if (n.id === d.id) return 1;
+ if (connected[n.id]) return 1;
+ return 0.05;
+ });
+
+ tooltip.innerHTML = buildTooltipContent(d);
+ tooltip.style.display = "block";
+ positionTooltip(event);
+ }
+
+ function handleMouseMove(event) {
+ if (tooltip.style.display === "block") positionTooltip(event);
+ }
+
+ function handleMouseOut() {
+ if (!entryAnimationDone) return;
+ isHighlighting = false;
+ nodeElements
+ .transition().duration(dur(250)).ease(d3.easeQuadOut)
+ .style("opacity", function (n) {
+ if (n.type === "category") return activeCategories[n.id] ? 1 : 0;
+ return activeCategories[n.category] ? 1 : 0;
+ })
+ .attr("r", function (d) { return d.type === "category" ? d.radius : 5; });
+ linkElements
+ .transition().duration(dur(250)).ease(d3.easeQuadOut)
+ .style("opacity", function (l) {
+ var s = l.source, t = l.target;
+ var sVis = s.type === "category" ? activeCategories[s.id] : activeCategories[s.category];
+ var tVis = t.type === "category" ? activeCategories[t.id] : activeCategories[t.category];
+ return (sVis && tVis) ? 1 : 0;
+ })
+ .style("stroke-opacity", function (d) { return d.type === "membership" ? 0.12 : 0.25; })
+ .style("stroke-width", function (d) { return d.type === "membership" ? 0.5 : 1; });
+ labelElements
+ .transition().duration(dur(250)).ease(d3.easeQuadOut)
+ .style("opacity", function (d) {
+ if (d.type === "category") return activeCategories[d.id] ? 1 : 0;
+ if (!activeCategories[d.category]) return 0;
+ return computeArticleLabelOpacity();
+ });
+ tooltip.style.display = "none";
+ }
+
+ // Gradual label fade instead of hard threshold
+ function computeArticleLabelOpacity() {
+ if (currentZoom < 1.3) return 0;
+ if (currentZoom > 2.2) return 0.9;
+ return (currentZoom - 1.3) / (2.2 - 1.3) * 0.9;
+ }
+
+ var isDragging = false;
+
+ function handleClick(event, d) {
+ if (isDragging) return;
+ if (d.url) window.open(d.url, "_self");
+ }
+
+ // Drag
+ function dragStart(event, d) {
+ isDragging = false;
+ if (!event.active) simulation.alphaTarget(0.15).restart();
+ d.fx = d.x;
+ d.fy = d.y;
+ d3.select(this).style("cursor", "grabbing");
+ }
+ function dragging(event, d) {
+ isDragging = true;
+ d.fx = event.x;
+ d.fy = event.y;
+ }
+ function dragEnd(event, d) {
+ if (!event.active) simulation.alphaTarget(0);
+ d.fx = null;
+ d.fy = null;
+ d3.select(this).style("cursor", "grab");
+ setTimeout(function () { isDragging = false; }, 50);
+ }
+
+ // Label visibility based on zoom
+ function updateLabelVisibility() {
+ if (isHighlighting) return;
+ var artOpacity = computeArticleLabelOpacity();
+ labelElements.style("opacity", function (d) {
+ if (d.type === "category") return 1;
+ return artOpacity;
+ });
+ }
+
+ // ── Search ──
+ var searchInput = document.getElementById("kg-search");
+ var searchClear = document.getElementById("kg-search-clear");
+ var searchNoResults = document.getElementById("kg-search-no-results");
+
+ searchInput.addEventListener("input", function () {
+ var q = this.value.toLowerCase().trim();
+ searchClear.style.display = q ? "block" : "none";
+ if (!q) {
+ handleMouseOut();
+ if (searchNoResults) searchNoResults.style.display = "none";
+ return;
+ }
+ var matches = {};
+ var matchCount = 0;
+ nodes.forEach(function (n) {
+ if (n.label.toLowerCase().indexOf(q) !== -1) {
+ matches[n.id] = true;
+ matchCount++;
+ }
+ });
+
+ if (searchNoResults) {
+ searchNoResults.style.display = matchCount === 0 ? "block" : "none";
+ }
+
+ isHighlighting = true;
+ nodeElements
+ .transition().duration(dur(150))
+ .style("opacity", function (n) { return matches[n.id] ? 1 : 0.06; });
+ linkElements
+ .transition().duration(dur(150))
+ .style("opacity", 0.03);
+ labelElements
+ .transition().duration(dur(150))
+ .style("opacity", function (n) { return matches[n.id] ? 1 : 0.03; });
+ });
+
+ searchInput.addEventListener("keydown", function (e) {
+ if (e.key === "Enter") {
+ var q = this.value.toLowerCase().trim();
+ if (!q) return;
+ var match = nodes.find(function (n) {
+ return n.label.toLowerCase().indexOf(q) !== -1;
+ });
+ if (match && match.x != null) {
+ svg.transition().duration(dur(750)).ease(d3.easeCubicOut).call(
+ zoom.transform,
+ d3.zoomIdentity.translate(width / 2, height / 2).scale(2.5).translate(-match.x, -match.y)
+ );
+ }
+ }
+ if (e.key === "Escape") {
+ this.value = "";
+ searchClear.style.display = "none";
+ if (searchNoResults) searchNoResults.style.display = "none";
+ handleMouseOut();
+ }
+ });
+
+ searchClear.addEventListener("click", function () {
+ searchInput.value = "";
+ searchClear.style.display = "none";
+ if (searchNoResults) searchNoResults.style.display = "none";
+ handleMouseOut();
+ searchInput.focus();
+ });
+
+ // ── Category Filters ──
+ var filterContainer = document.getElementById("kg-filters");
+ var activeCategories = {};
+ data.categories.forEach(function (c) { activeCategories[c.id] = true; });
+
+ data.categories.forEach(function (c) {
+ var pill = document.createElement("button");
+ pill.className = "kg-filter-pill active";
+ pill.style.setProperty("--pill-color", c.color);
+ pill.setAttribute("data-cat", c.id);
+ pill.setAttribute("aria-pressed", "true");
+ pill.textContent = c.label;
+ pill.addEventListener("click", function () {
+ activeCategories[c.id] = !activeCategories[c.id];
+ this.classList.toggle("active", activeCategories[c.id]);
+ this.setAttribute("aria-pressed", activeCategories[c.id] ? "true" : "false");
+ applyFilters();
+ });
+ filterContainer.appendChild(pill);
+ });
+
+ function applyFilters() {
+ nodeElements.style("display", function (n) {
+ if (n.type === "category") return activeCategories[n.id] ? null : "none";
+ return activeCategories[n.category] ? null : "none";
+ });
+ labelElements.style("display", function (n) {
+ if (n.type === "category") return activeCategories[n.id] ? null : "none";
+ return activeCategories[n.category] ? null : "none";
+ });
+ linkElements.style("display", function (l) {
+ var s = l.source, t = l.target;
+ var sVis = s.type === "category" ? activeCategories[s.id] : activeCategories[s.category];
+ var tVis = t.type === "category" ? activeCategories[t.id] : activeCategories[t.category];
+ return (sVis && tVis) ? null : "none";
+ });
+ simulation.alpha(0.3).restart();
+ }
+
+ // Reset filters
+ document.getElementById("kg-reset-filters").addEventListener("click", function () {
+ data.categories.forEach(function (c) { activeCategories[c.id] = true; });
+ var pills = filterContainer.querySelectorAll(".kg-filter-pill");
+ pills.forEach(function (p) {
+ p.classList.add("active");
+ p.setAttribute("aria-pressed", "true");
+ });
+ applyFilters();
+ svg.transition().duration(dur(750)).ease(d3.easeCubicOut).call(
+ zoom.transform,
+ d3.zoomIdentity.translate(0, 0).scale(1)
+ );
+ });
+
+ // ── Stats ──
+ var articleCount = data.articles.length;
+ var catCount = data.categories.length;
+ var crossEdges = data.edges.length;
+ var statsEl = document.getElementById("kg-stats");
+ statsEl.setAttribute("role", "status");
+ statsEl.innerHTML =
+ '' + catCount + ' categories•' +
+ '' + articleCount + ' articles•' +
+ '' + crossEdges + ' connections•' +
+ 'Click node to navigate • Scroll to zoom • Drag to pan';
+
+ // ── Resize (debounced) ──
+ var resizeTimer;
+ window.addEventListener("resize", function () {
+ clearTimeout(resizeTimer);
+ resizeTimer = setTimeout(function () {
+ width = container.clientWidth;
+ height = container.clientHeight;
+ svg.attr("width", width).attr("height", height);
+ simulation.force("center", d3.forceCenter(width / 2, height / 2));
+ simulation.force("x", d3.forceX(width / 2).strength(0.02));
+ simulation.force("y", d3.forceY(height / 2).strength(0.02));
+ simulation.alpha(0.3).restart();
+ }, 150);
+ });
+
+ // ── Legend ──
+ var legendContent = document.getElementById("kg-legend-content");
+ data.categories.forEach(function (c) {
+ var item = document.createElement("div");
+ item.className = "kg-legend-item";
+ item.innerHTML = '' +
+ '' + escapeHtml(c.label) + '';
+ legendContent.appendChild(item);
+ });
+
+ var legendToggle = document.getElementById("kg-legend-toggle");
+ legendToggle.setAttribute("aria-expanded", "true");
+ legendToggle.addEventListener("click", function () {
+ var panel = document.getElementById("kg-legend");
+ panel.classList.toggle("collapsed");
+ var isCollapsed = panel.classList.contains("collapsed");
+ this.textContent = isCollapsed ? "Legend" : "Hide";
+ this.setAttribute("aria-expanded", isCollapsed ? "false" : "true");
+ });
+
+ // Touch dismiss for tooltip
+ svg.on("touchstart", function (event) {
+ if (!event.target.classList || !event.target.classList.contains("kg-node")) {
+ handleMouseOut();
+ }
+ }, { passive: true });
+})();
diff --git a/wiki/computing/deploying-neural-networks-edge-devices.md b/wiki/computing/deploying-neural-networks-edge-devices.md
new file mode 100644
index 00000000..c56c4af0
--- /dev/null
+++ b/wiki/computing/deploying-neural-networks-edge-devices.md
@@ -0,0 +1,593 @@
+---
+date: 2026-04-30
+title: Deploying Neural Networks on Edge Devices for Robotics
+---
+Deploying neural networks directly on edge devices is essential for modern robotic systems that must perceive and act in real time. Cloud-based inference introduces network latency, bandwidth constraints, and single points of failure that are unacceptable for safety-critical applications such as autonomous navigation, manipulation, and human-robot interaction. This article provides a comprehensive, practical guide to the full edge deployment pipeline: exporting trained models to portable formats like ONNX, optimizing them with inference engines such as TensorRT and OpenVINO, applying quantization techniques for maximum throughput, and integrating optimized models into ROS 2 robotic software stacks on platforms like NVIDIA Jetson, Google Coral, and Raspberry Pi with AI accelerators.
+
+## Why Edge Inference Matters for Robotics
+
+Robotic perception and control loops impose strict requirements that make on-device inference the preferred deployment strategy in most real-world systems.
+
+### Latency
+
+A robot navigating at 1 m/s with a 100 ms perception latency travels 10 cm blind between frames. Reducing inference latency to 10 ms shrinks that gap to 1 cm. Edge deployment eliminates the round-trip network delay inherent in cloud inference, which typically adds 20–200 ms depending on connectivity.
+
+### Bandwidth
+
+A single 720p RGB camera at 30 FPS generates roughly 80 MB/s of raw data. Stereo cameras, LiDAR, and depth sensors multiply this several-fold. Streaming all sensor data to a remote server is impractical over typical wireless links, especially when multiple robots share the same network.
+
+### Reliability
+
+Wireless connectivity is unreliable in many robotics environments — warehouses with metal shelving, underground tunnels, outdoor fields. Edge inference guarantees that perception continues operating regardless of network conditions.
+
+### Power and Form Factor
+
+Mobile robots carry limited battery capacity. Modern edge accelerators like the NVIDIA Jetson Orin NX deliver up to 100 TOPS of INT8 performance at 15–25 W, enabling sophisticated deep learning models within tight power budgets.
+
+### Common Edge Platforms
+
+| Platform | Compute | Power | Typical Use Case |
+|---|---|---|---|
+| NVIDIA Jetson Orin AGX | 275 TOPS (INT8) | 15–60 W | High-performance mobile robots, AV |
+| NVIDIA Jetson Orin NX | 100 TOPS (INT8) | 10–25 W | Mid-range robots, drones |
+| NVIDIA Jetson Orin Nano | 40 TOPS (INT8) | 7–15 W | Cost-sensitive edge inference |
+| Google Coral Edge TPU | 4 TOPS (INT8) | 2 W | Low-power classification, detection |
+| Raspberry Pi 5 + Hailo-8L | 13 TOPS (INT8) | 5–10 W | Lightweight perception, education |
+| Intel NUC + Neural Compute Stick | ~4 TOPS (FP16) | 10–25 W | OpenVINO-optimized workloads |
+
+For platform-specific setup guidance on the Jetson Orin, see the [Jetson Orin AGX](/wiki/computing/jetson-orin-agx/) article.
+
+## The Edge Deployment Pipeline
+
+The deployment pipeline transforms a trained model into an optimized inference engine tailored to the target hardware. The general workflow follows four stages:
+
+```
+Train (GPU Workstation/Cloud)
+ → Export (PyTorch/TF → ONNX)
+ → Optimize (ONNX → TensorRT / OpenVINO / TFLite)
+ → Deploy (Edge Device + ROS 2)
+```
+
+### Model Interchange Formats
+
+**ONNX (Open Neural Network Exchange)** is the most widely supported intermediate representation. It defines a common set of operators and a standard file format that bridges training frameworks and inference runtimes.
+
+| Source Framework | Target Runtime | Path |
+|---|---|---|
+| PyTorch | TensorRT (Jetson) | PyTorch → ONNX → TensorRT |
+| PyTorch | OpenVINO (Intel) | PyTorch → ONNX → OpenVINO IR |
+| TensorFlow | TFLite (Coral/RPi) | TensorFlow → SavedModel → TFLite |
+| TensorFlow | TensorRT (Jetson) | TensorFlow → ONNX → TensorRT |
+
+### Why Not Deploy the Training Framework Directly?
+
+Training frameworks like PyTorch are designed for flexibility and automatic differentiation, not inference speed. They carry substantial overhead: dynamic graphs, gradient bookkeeping, and unoptimized operator scheduling. Dedicated inference engines strip this overhead and apply hardware-specific optimizations that can yield 2–10x speedups.
+
+## ONNX Export
+
+### Exporting a PyTorch Model
+
+PyTorch provides built-in ONNX export through `torch.onnx.export()`. The exporter traces the model with sample input and records the operations into an ONNX graph.
+
+```python
+import torch
+import torchvision
+
+# Load a pretrained detection model
+model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(
+ weights="DEFAULT"
+)
+model.eval()
+
+# Create a dummy input matching expected input dimensions
+# Batch size 1, 3 channels, 640x640 resolution
+dummy_input = torch.randn(1, 3, 640, 640)
+
+# Export to ONNX format
+torch.onnx.export(
+ model,
+ dummy_input,
+ "detector.onnx",
+ opset_version=17, # Use a recent opset for broad operator support
+ input_names=["images"],
+ output_names=["boxes", "labels", "scores"],
+ dynamic_axes={ # Allow variable batch size at inference time
+ "images": {0: "batch"},
+ "boxes": {0: "batch"},
+ "labels": {0: "batch"},
+ "scores": {0: "batch"},
+ },
+)
+print("ONNX export complete: detector.onnx")
+```
+
+> When using `dynamic_axes`, ensure the model architecture supports variable dimensions. Some operations (e.g., hard-coded reshapes) may break with dynamic shapes.
+
+### Exporting a TensorFlow/Keras Model
+
+For TensorFlow models, use the `tf2onnx` converter:
+
+```bash
+pip install tf2onnx
+python -m tf2onnx.convert \
+ --saved-model ./saved_model_dir \
+ --output model.onnx \
+ --opset 17
+```
+
+### Validating the ONNX Model
+
+Always validate the exported model before optimization to catch export errors early.
+
+```python
+import onnx
+import onnxruntime as ort
+import numpy as np
+
+# Structural validation: checks graph consistency and operator support
+model = onnx.load("detector.onnx")
+onnx.checker.check_model(model)
+print("ONNX model structure is valid.")
+
+# Runtime validation: run inference and verify output shapes
+session = ort.InferenceSession("detector.onnx")
+dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)
+outputs = session.run(None, {"images": dummy})
+
+for i, out in enumerate(outputs):
+ print(f"Output {i}: shape={out.shape}, dtype={out.dtype}")
+```
+
+If the checker raises errors, common fixes include:
+- Updating the `opset_version` to support newer operators
+- Replacing unsupported custom operators with standard ONNX equivalents
+- Simplifying the model with `onnx-simplifier`: `python -m onnxsim model.onnx model_simplified.onnx`
+
+## TensorRT Optimization
+
+NVIDIA TensorRT is the highest-performance inference engine for NVIDIA GPUs, including all Jetson platforms. It applies a suite of optimizations that are impossible at the framework level.
+
+### What TensorRT Does
+
+1. **Layer Fusion**: Combines sequences of operations (e.g., Conv → BatchNorm → ReLU) into single GPU kernels, reducing memory traffic and kernel launch overhead.
+2. **Kernel Auto-Tuning**: Benchmarks multiple CUDA kernel implementations for each layer on the specific target GPU and selects the fastest.
+3. **Precision Calibration**: Converts FP32 weights to FP16 or INT8 with minimal accuracy loss, doubling or quadrupling throughput.
+4. **Memory Optimization**: Reuses memory buffers across layers with non-overlapping lifetimes, reducing peak memory consumption.
+5. **Dynamic Tensor Memory**: Allocates only the memory needed for the actual input dimensions when using dynamic shapes.
+
+### Building a TensorRT Engine from ONNX
+
+```python
+import tensorrt as trt
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
+def build_engine(onnx_path, engine_path, fp16=True, int8=False, calibrator=None):
+ """Build and serialize a TensorRT engine from an ONNX model."""
+ builder = trt.Builder(TRT_LOGGER)
+ network = builder.create_network(
+ 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ )
+ parser = trt.OnnxParser(network, TRT_LOGGER)
+
+ # Parse the ONNX model
+ with open(onnx_path, "rb") as f:
+ if not parser.parse(f.read()):
+ for i in range(parser.num_errors):
+ print(f"ONNX parse error: {parser.get_error(i)}")
+ return None
+
+ # Configure builder settings
+ config = builder.create_builder_config()
+ config.set_memory_pool_limit(
+ trt.MemoryPoolType.WORKSPACE, 1 << 30 # 1 GB workspace
+ )
+
+ # Enable FP16 precision (2x speedup on most Jetson platforms)
+ if fp16:
+ config.set_flag(trt.BuilderFlag.FP16)
+
+ # Enable INT8 precision (requires calibration dataset)
+ if int8:
+ config.set_flag(trt.BuilderFlag.INT8)
+ config.int8_calibrator = calibrator
+
+ # Build the engine (this takes minutes — auto-tunes kernels)
+ serialized_engine = builder.build_serialized_network(network, config)
+
+ # Save to disk for later loading
+ with open(engine_path, "wb") as f:
+ f.write(serialized_engine)
+
+ print(f"TensorRT engine saved to {engine_path}")
+ return serialized_engine
+
+# Build an FP16 engine
+build_engine("detector.onnx", "detector_fp16.engine", fp16=True)
+```
+
+> TensorRT engines are **not portable** across GPU architectures. An engine built on a Jetson Orin will not run on a Jetson Xavier or a desktop GPU. Always build on the target device or a device with the same GPU architecture.
+
+### Running Inference with a TensorRT Engine
+
+```python
+import tensorrt as trt
+import pycuda.driver as cuda
+import pycuda.autoinit
+import numpy as np
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
+def load_engine(engine_path):
+    """Deserialize a TensorRT engine from disk."""
+    runtime = trt.Runtime(TRT_LOGGER)
+    with open(engine_path, "rb") as f:
+        return runtime.deserialize_cuda_engine(f.read())
+
+def infer(engine, input_data):
+    """Run inference on a single input tensor."""
+    context = engine.create_execution_context()
+    stream = cuda.Stream()
+
+    # Allocate device memory and copy the input to the GPU
+    d_input = cuda.mem_alloc(input_data.nbytes)
+    cuda.memcpy_htod_async(d_input, input_data, stream)
+
+    # Query output tensor shape and allocate memory
+    output_shape = engine.get_tensor_shape(engine.get_tensor_name(1))
+    output_size = int(np.prod(output_shape)) * np.dtype(np.float32).itemsize
+    d_output = cuda.mem_alloc(output_size)
+
+    # Set tensor addresses and execute on the stream
+    context.set_tensor_address(engine.get_tensor_name(0), int(d_input))
+    context.set_tensor_address(engine.get_tensor_name(1), int(d_output))
+    context.execute_async_v3(stream_handle=stream.handle)
+
+    # Copy the result back to the host and wait for the stream to finish
+    output = np.empty(output_shape, dtype=np.float32)
+    cuda.memcpy_dtoh_async(output, d_output, stream)
+    stream.synchronize()
+    return output
+```
+
+For a complete YOLOv5 deployment example with TensorRT on Jetson, see the [YOLOv5 Training and Deployment on NVIDIA Jetson](/wiki/machine-learning/yolov5-tensorrt/) article.
+
+## Quantization Deep Dive
+
+Quantization reduces the numerical precision of model weights and activations from 32-bit floating point to lower-bitwidth representations, trading a small accuracy reduction for significant gains in speed, memory, and power efficiency.
+
+### The Quantization Formula
+
+Uniform affine quantization maps a floating-point value $x$ to an integer $q$ using a scale factor $s$ and zero-point $z$:
+
+$$q = \text{round}\left(\frac{x}{s}\right) + z$$
+
+The inverse (dequantization) recovers an approximation of the original value:
+
+$$\hat{x} = s \cdot (q - z)$$
+
+The scale and zero-point are computed from the observed range $[x_{\min}, x_{\max}]$ of the tensor:
+
+$$s = \frac{x_{\max} - x_{\min}}{q_{\max} - q_{\min}}$$
+
+$$z = q_{\min} - \text{round}\left(\frac{x_{\min}}{s}\right)$$
+
+For INT8 quantization, $q_{\min} = -128$ and $q_{\max} = 127$, giving 256 discrete levels. The quantization error is bounded by $\frac{s}{2}$.
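+
+A minimal NumPy sketch of these formulas (independent of any inference engine) can be useful for sanity-checking the expected rounding error on your own tensors:
+
+```python
+import numpy as np
+
+def quantize_int8(x):
+    """Uniform affine quantization of a float tensor to INT8."""
+    q_min, q_max = -128, 127
+    x_min, x_max = float(x.min()), float(x.max())
+    scale = (x_max - x_min) / (q_max - q_min)
+    zero_point = q_min - int(round(x_min / scale))
+    q = np.clip(np.round(x / scale) + zero_point, q_min, q_max).astype(np.int8)
+    return q, scale, zero_point
+
+def dequantize(q, scale, zero_point):
+    """Recover an approximation of the original float values."""
+    return scale * (q.astype(np.float32) - zero_point)
+
+x = np.random.randn(1000).astype(np.float32)
+q, s, z = quantize_int8(x)
+x_hat = dequantize(q, s, z)
+print(f"scale: {s:.5f}, max abs error: {np.abs(x - x_hat).max():.5f} (about s/2)")
+```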
+
+### FP16 vs INT8
+
+| Property | FP32 (Baseline) | FP16 | INT8 |
+|---|---|---|---|
+| Bits per value | 32 | 16 | 8 |
+| Typical speedup | 1x | 1.5–2x | 2–4x |
+| Memory reduction | 1x | 2x | 4x |
+| Accuracy impact | Baseline | Negligible | 0.5–2% drop typical |
+| Calibration needed | No | No | Yes |
+
+**Use FP16** as the default for edge deployment — it provides a significant speedup with virtually no accuracy loss on modern hardware. **Use INT8** when latency or throughput requirements cannot be met with FP16, and when you can afford the calibration and validation effort.
+
+### Post-Training Quantization (PTQ)
+
+PTQ quantizes a pre-trained FP32 model without retraining. It requires a small calibration dataset (typically 500–1000 representative samples) to determine optimal scale factors for each tensor.
+
+```python
+import tensorrt as trt
+import numpy as np
+import os
+import pycuda.driver as cuda
+import pycuda.autoinit  # creates the CUDA context required by cuda.mem_alloc
+
+class ImageCalibrator(trt.IInt8EntropyCalibrator2):
+ """Calibrator that feeds representative images for INT8 quantization."""
+
+ def __init__(self, calibration_dir, batch_size=8, input_shape=(3, 640, 640)):
+ super().__init__()
+ self.batch_size = batch_size
+ self.input_shape = input_shape
+
+ # Load calibration image paths
+ self.image_paths = [
+ os.path.join(calibration_dir, f)
+ for f in sorted(os.listdir(calibration_dir))
+ if f.endswith((".jpg", ".png"))
+ ]
+ self.current_index = 0
+ self.cache_file = "calibration.cache"
+
+ # Pre-allocate device buffer for one batch
+ self.device_input = cuda.mem_alloc(
+ batch_size * int(np.prod(input_shape)) * 4
+ )
+
+ def get_batch_size(self):
+ return self.batch_size
+
+ def get_batch(self, names):
+ """Load and preprocess the next batch of calibration images."""
+ if self.current_index >= len(self.image_paths):
+ return None
+
+ batch = []
+ for i in range(self.batch_size):
+ idx = self.current_index + i
+ if idx >= len(self.image_paths):
+ break
+ # Load and preprocess image (resize, normalize to [0, 1])
+ img = load_and_preprocess(self.image_paths[idx], self.input_shape)
+ batch.append(img)
+
+ self.current_index += self.batch_size
+ batch_array = np.array(batch, dtype=np.float32)
+ cuda.memcpy_htod(self.device_input, batch_array)
+ return [int(self.device_input)]
+
+ def read_calibration_cache(self):
+ """Read cached calibration data to avoid recalibrating."""
+ if os.path.exists(self.cache_file):
+ with open(self.cache_file, "rb") as f:
+ return f.read()
+ return None
+
+ def write_calibration_cache(self, cache):
+ """Write calibration data to disk for future reuse."""
+ with open(self.cache_file, "wb") as f:
+ f.write(cache)
+
+# Build INT8 engine with calibrator
+calibrator = ImageCalibrator("./calibration_images/", batch_size=8)
+build_engine("detector.onnx", "detector_int8.engine", fp16=True, int8=True, calibrator=calibrator)
+```
+
+> Select calibration images that represent the actual distribution your robot will encounter. For a warehouse robot, use images from the warehouse — not ImageNet validation images.
+
+### Quantization-Aware Training (QAT)
+
+QAT simulates quantization effects during training by inserting fake quantization nodes that round weights and activations to their quantized equivalents on the forward pass while maintaining full-precision gradients on the backward pass. This allows the model to learn to be robust to quantization noise.
+
+QAT typically recovers 0.5–1% accuracy compared to PTQ. Use it when:
+- PTQ accuracy is unacceptable for your application
+- The model is small and retraining is inexpensive
+- You need INT8 for latency-critical control loops
+
+PyTorch supports QAT through the `torch.ao.quantization` API:
+
+```python
+import torch
+from torch.ao.quantization import get_default_qat_qconfig, prepare_qat, convert
+
+model.train()
+
+# Attach quantization configuration to the model
+model.qconfig = get_default_qat_qconfig("x86") # or "qnnpack" for ARM
+
+# Insert fake quantization observers
+model_prepared = prepare_qat(model)
+
+# Fine-tune for a few epochs with quantization simulation
+for epoch in range(fine_tune_epochs):
+ train_one_epoch(model_prepared, train_loader)
+
+# Convert to fully quantized model
+model_quantized = convert(model_prepared)
+```
+
+## Deployment Patterns for Robotics
+
+### ROS 2 Inference Node
+
+Wrapping an optimized inference engine in a ROS 2 node allows seamless integration with the broader robotic perception and planning stack.
+
+```python
+import rclpy
+from rclpy.node import Node
+from sensor_msgs.msg import Image
+from vision_msgs.msg import Detection2DArray, Detection2D, ObjectHypothesisWithPose
+from cv_bridge import CvBridge
+import numpy as np
+
+class DetectorNode(Node):
+ """ROS 2 node that runs TensorRT object detection on camera images."""
+
+ def __init__(self):
+ super().__init__("detector_node")
+
+ # Load TensorRT engine at startup
+ self.engine = load_engine("detector_fp16.engine")
+ self.context = self.engine.create_execution_context()
+ self.bridge = CvBridge()
+
+ # Subscribe to camera images
+ self.sub = self.create_subscription(
+ Image, "/camera/image_raw", self.image_callback, 10
+ )
+
+ # Publish detection results
+ self.pub = self.create_publisher(
+ Detection2DArray, "/detections", 10
+ )
+ self.get_logger().info("Detector node initialized with TensorRT engine")
+
+ def image_callback(self, msg):
+ # Convert ROS Image to numpy array
+ cv_image = self.bridge.imgmsg_to_cv2(msg, "rgb8")
+
+ # Preprocess: resize, normalize, transpose to CHW, add batch dim
+ input_tensor = self.preprocess(cv_image)
+
+ # Run TensorRT inference
+ boxes, labels, scores = self.infer(input_tensor)
+
+ # Publish detections above confidence threshold
+ det_array = Detection2DArray()
+ det_array.header = msg.header
+ for box, label, score in zip(boxes, labels, scores):
+ if score > 0.5:
+ det = Detection2D()
+ hyp = ObjectHypothesisWithPose()
+ hyp.hypothesis.class_id = str(int(label))
+ hyp.hypothesis.score = float(score)
+ det.results.append(hyp)
+ det_array.detections.append(det)
+
+ self.pub.publish(det_array)
+
+def main():
+ rclpy.init()
+ node = DetectorNode()
+ rclpy.spin(node)
+ node.destroy_node()
+ rclpy.shutdown()
+```
+
+### Multi-Model Pipelines
+
+Many robotic perception systems chain multiple models. A common pattern is a fast detector followed by a more expensive classifier or pose estimator that runs only on detected regions of interest (ROIs):
+
+```
+Camera Frame → Detector (YOLOv8-nano, ~5ms)
+ → Crop ROIs
+ → Classifier (ResNet-18, ~2ms per ROI)
+ → Pose Estimator (only for target class, ~8ms)
+```
+
+Run the detector on every frame and the downstream models only on relevant ROIs. This keeps the total pipeline latency low while reserving computational headroom for the most informative processing.
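+
+A minimal sketch of this cascade is shown below. The `detect`, `classify`, and `estimate_pose` callables stand in for your own optimized engines (they are not defined here), and `target_class` is whatever label warrants the extra pose computation:
+
+```python
+def cascade_frame(frame, detect, classify, estimate_pose, target_class, conf_thresh=0.4):
+    """Run a cheap detector on the full frame, heavier models only on its ROIs."""
+    results = []
+    for box, _, score in detect(frame):           # e.g., YOLOv8-nano engine, ~5 ms
+        if score < conf_thresh:
+            continue
+        x1, y1, x2, y2 = (int(v) for v in box)
+        roi = frame[y1:y2, x1:x2]                 # crop the region of interest
+        label = classify(roi)                     # e.g., ResNet-18 engine, per ROI
+        pose = estimate_pose(roi) if label == target_class else None
+        results.append({"box": box, "label": label, "pose": pose})
+    return results
+```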
+
+### Batching Strategies
+
+- **Latency-optimized (batch=1)**: Process each frame immediately as it arrives. Best for real-time control loops where freshness matters more than throughput.
+- **Throughput-optimized (batch=N)**: Accumulate N frames and process them together. TensorRT and GPUs achieve higher utilization with larger batches. Use this for offline processing, mapping, or when multiple cameras feed the same model.
+- **Adaptive batching**: Adjust batch size based on the current processing queue depth. Process immediately when idle; batch when backlogged (see the sketch after this list).
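+
+A sketch of the adaptive strategy, assuming frames arrive on a standard `queue.Queue` and `run_batch` wraps your batched inference call:
+
+```python
+import queue
+
+def adaptive_batch_worker(frame_queue, run_batch, max_batch=8):
+    """Process immediately when idle; drain any backlog into one batch when busy."""
+    while True:
+        batch = [frame_queue.get()]                       # block until one frame arrives
+        while len(batch) < max_batch:
+            try:
+                batch.append(frame_queue.get_nowait())    # grab queued frames, if any
+            except queue.Empty:
+                break
+        run_batch(batch)                                  # batch size adapts to queue depth
+```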
+
+### Memory Management on Constrained Devices
+
+Jetson devices share memory between CPU and GPU. A few practices help avoid out-of-memory crashes:
+
+- **Pre-allocate all CUDA buffers at node startup**, not per-frame. Dynamic allocation on the GPU path causes fragmentation and latency spikes.
+- **Use CUDA unified memory** (`cudaMallocManaged`) on Jetson, which avoids explicit host-device copies since CPU and GPU share physical memory.
+- **Limit TensorRT workspace size** to leave headroom for other processes: `config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 512 << 20)` for 512 MB.
+- **Monitor memory** with `tegrastats` or `jetson_stats` to track actual usage under load.
+- **Containerize inference nodes** with [Docker for Robotics](/wiki/tools/docker-for-robotics/) to isolate memory usage and manage dependencies cleanly.
+
+## Benchmarking and Profiling
+
+Thorough benchmarking is essential to validate that the optimized model meets the robotic system's real-time requirements.
+
+### Key Metrics
+
+| Metric | What It Measures | Target (Typical) |
+|---|---|---|
+| Latency (p50) | Median inference time | < 30 ms for perception |
+| Latency (p99) | Worst-case inference time | < 50 ms for safety-critical |
+| Throughput (FPS) | Frames processed per second | ≥ 30 FPS for real-time |
+| GPU Memory | Peak GPU memory usage | < 50% of device total |
+| Power Draw | Average power consumption | Within platform TDP |
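+
+One way to collect these numbers in Python is shown below; it assumes the `detector.onnx` model exported earlier and uses ONNX Runtime, but the same warm-up-then-measure pattern applies to any runtime:
+
+```python
+import time
+import numpy as np
+import onnxruntime as ort
+
+session = ort.InferenceSession("detector.onnx")
+dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)
+
+# Warm up: the first runs include allocation and kernel-selection overhead
+for _ in range(50):
+    session.run(None, {"images": dummy})
+
+# Measure per-inference latency
+latencies_ms = []
+for _ in range(1000):
+    start = time.perf_counter()
+    session.run(None, {"images": dummy})
+    latencies_ms.append((time.perf_counter() - start) * 1000.0)
+
+print(f"p50 latency: {np.percentile(latencies_ms, 50):.2f} ms")
+print(f"p99 latency: {np.percentile(latencies_ms, 99):.2f} ms")
+print(f"throughput: {1000.0 / np.mean(latencies_ms):.1f} FPS")
+```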
+
+### trtexec: TensorRT's Built-In Benchmarking Tool
+
+`trtexec` is the fastest way to benchmark an ONNX model or TensorRT engine without writing code:
+
+```bash
+# Benchmark an ONNX model with FP16 precision
+trtexec --onnx=detector.onnx --fp16 --iterations=1000 --warmUp=500
+
+# Benchmark an existing TensorRT engine
+trtexec --loadEngine=detector_fp16.engine --iterations=1000
+
+# Benchmark with specific input shape
+trtexec --onnx=detector.onnx --fp16 \
+ --shapes=images:1x3x640x640 \
+ --iterations=1000 --percentile=99
+```
+
+`trtexec` reports mean, median, and percentile latencies, along with throughput and GPU compute utilization.
+
+### jetson_stats (jtop)
+
+`jetson_stats` provides real-time monitoring of Jetson platform metrics:
+
+```bash
+# Install jetson_stats
+pip install jetson-stats
+
+# Launch the interactive dashboard
+jtop
+```
+
+`jtop` displays CPU/GPU utilization, memory usage, temperature, power draw, and clock frequencies — all critical for understanding whether the system is thermally throttling or memory-starved during inference.
+
+### NVIDIA Nsight Systems
+
+For deep profiling of the inference pipeline, Nsight Systems captures GPU kernel timelines, CUDA API calls, and CPU-GPU synchronization:
+
+```bash
+# Profile a Python inference script
+nsys profile --trace=cuda,osrt --output=profile_report python inference.py
+
+# Open the trace in the Nsight Systems GUI for visual analysis
+nsys-ui profile_report.nsys-rep
+```
+
+Look for:
+- **Gaps between kernels** indicating CPU bottlenecks or unnecessary synchronization
+- **Long memory copies** suggesting suboptimal host-device transfers
+- **Kernel occupancy** below 50% suggesting the model is too small to saturate the GPU
+
+## Best Practices and Common Pitfalls
+
+### Best Practices
+
+1. **Always benchmark on the target device.** Desktop GPU performance does not predict Jetson performance. Build and test on the actual hardware.
+2. **Start with FP16, try INT8 only if needed.** FP16 provides most of the speedup with no accuracy work. INT8 requires calibration and validation effort.
+3. **Use static input shapes when possible.** Dynamic shapes prevent some TensorRT optimizations. If your input resolution is fixed, hardcode it.
+4. **Pin your TensorRT version.** Engines are not compatible across TensorRT versions. Lock the version in your Docker container or JetPack release.
+5. **Profile the full pipeline, not just inference.** Pre-processing (resize, normalize) and post-processing (NMS, decoding) often dominate end-to-end latency, especially for small models. Use OpenCV's CUDA backend or GPU-accelerated pre-processing.
+6. **Set the Jetson power mode appropriately.** Use `sudo nvpmodel -m 0` for maximum performance or tune to balance power and speed. See the [GPU System Setup](/wiki/computing/setup-gpus-for-computer-vision/) guide.
+
+### Common Pitfalls
+
+1. **Building TensorRT engines on the wrong platform.** Engines are tied to the specific GPU architecture and TensorRT version. An engine built on an RTX 4090 will not run on a Jetson Orin.
+2. **Ignoring pre/post-processing overhead.** A 5 ms model inference is meaningless if preprocessing takes 20 ms on the CPU. Move preprocessing to the GPU.
+3. **Using dynamic shapes unnecessarily.** Dynamic shapes add runtime overhead. Only use them when truly needed (e.g., variable batch size).
+4. **Calibrating INT8 with unrepresentative data.** Calibration images must match deployment distribution. Using ImageNet images for a warehouse robot will produce poor quantization scales.
+5. **Not validating accuracy after quantization.** Always run the quantized model through your evaluation pipeline. A 2% drop in mAP on a benchmark may translate to critical missed detections in your specific scenario.
+6. **Forgetting to warm up the engine.** The first few inferences are slower due to CUDA context initialization and memory allocation. Run 10–50 warm-up inferences before measuring latency.
+7. **Blocking the ROS 2 callback thread.** If inference takes longer than the camera frame period, frames will queue up and latency will grow without bound. Use a separate inference thread or process only the latest frame, as in the sketch below.
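+
+One way to implement the "latest frame only" pattern is a depth-1 subscription combined with a single-slot buffer consumed by a separate inference thread. A minimal sketch, with `run_inference` left as a placeholder for your preprocessing and engine call:
+
+```python
+import threading
+import time
+
+import rclpy
+from rclpy.node import Node
+from sensor_msgs.msg import Image
+
+class LatestFrameDetector(Node):
+    """Keeps only the newest camera frame so inference never falls behind."""
+
+    def __init__(self):
+        super().__init__("latest_frame_detector")
+        self._latest = None
+        self._lock = threading.Lock()
+        # Queue depth 1: older frames are dropped instead of piling up
+        self.create_subscription(Image, "/camera/image_raw", self._on_image, 1)
+        threading.Thread(target=self._inference_loop, daemon=True).start()
+
+    def _on_image(self, msg):
+        # The callback only stores the frame and returns immediately
+        with self._lock:
+            self._latest = msg
+
+    def _inference_loop(self):
+        while rclpy.ok():
+            with self._lock:
+                msg, self._latest = self._latest, None
+            if msg is None:
+                time.sleep(0.005)  # nothing new yet; avoid a busy loop
+                continue
+            self.run_inference(msg)  # long-running work stays off the executor thread
+
+    def run_inference(self, msg):
+        pass  # hypothetical hook: convert msg, run the engine, publish detections
+```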
+
+## Summary
+
+Deploying neural networks on edge devices is a core competency for modern robotics engineers. The pipeline — export to ONNX, optimize with TensorRT or OpenVINO, quantize to FP16 or INT8, and deploy within a ROS 2 node — transforms research models into production-ready perception systems that meet real-time, power, and reliability constraints. Start with FP16 quantization for immediate gains, profile the full pipeline to identify bottlenecks beyond the model itself, and always validate accuracy on representative data from the target deployment environment. As edge hardware continues to advance, these techniques will enable increasingly sophisticated on-device AI for autonomous robotic systems.
+
+## See Also:
+- [Jetson Orin AGX](/wiki/computing/jetson-orin-agx/) - Platform setup and optimization for NVIDIA Jetson
+- [Setup Your GPU System for Computer Vision](/wiki/computing/setup-gpus-for-computer-vision/) - CUDA, cuDNN, and driver configuration
+- [YOLOv5 Training and Deployment on NVIDIA Jetson](/wiki/machine-learning/yolov5-tensorrt/) - End-to-end YOLO deployment with TensorRT
+- [Docker for Robotics](/wiki/tools/docker-for-robotics/) - Containerizing robotic applications for reproducible deployments
+- [Mediapipe: Live ML Anywhere](/wiki/machine-learning/mediapipe-live-ml-anywhere/) - Lightweight on-device ML pipelines
+
+## Further Reading
+- [NVIDIA TensorRT Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/) — Comprehensive documentation covering all TensorRT features, optimization strategies, and API references.
+- [ONNX Runtime Documentation](https://onnxruntime.ai/docs/) — Official documentation for the ONNX Runtime inference engine, including execution providers for different hardware.
+- [NVIDIA Jetson AI Lab](https://www.jetson-ai-lab.com/) — Tutorials and pre-built containers for deploying AI models on Jetson platforms.
+- [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) — Guide for deploying models on extremely resource-constrained devices.
+- [OpenVINO Toolkit Documentation](https://docs.openvino.ai/) — Intel's toolkit for optimizing and deploying models on Intel hardware including CPUs, GPUs, and VPUs.
+
+## References
+- B. Jacob, S. Kligys, B. Chen et al., "Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference," in *Proc. IEEE/CVF Conf. Computer Vision and Pattern Recognition (CVPR)*, 2018, pp. 2704–2713.
+- NVIDIA Corporation, "TensorRT: Programmable Inference Accelerator," NVIDIA Developer Documentation, 2024. [Online]. Available: https://developer.nvidia.com/tensorrt
+- ONNX Project Contributors, "Open Neural Network Exchange (ONNX)," GitHub Repository, 2024. [Online]. Available: https://github.com/onnx/onnx
+- R. Krishnamoorthi, "Quantizing Deep Convolutional Networks for Efficient Inference: A Whitepaper," arXiv preprint arXiv:1806.08342, 2018.
+- S. Han, H. Mao, and W. J. Dally, "Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding," in *Proc. Int. Conf. Learning Representations (ICLR)*, 2016.
diff --git a/wiki/computing/index.md b/wiki/computing/index.md
index a0374e29..952ff171 100644
--- a/wiki/computing/index.md
+++ b/wiki/computing/index.md
@@ -37,6 +37,9 @@ This section is curated for developers and researchers working on robotics appli
- **[Upgrading the Ubuntu Kernel](/wiki/computing/upgrading-ubuntu-kernel/)**
Instructions for upgrading the Ubuntu kernel to a specific version, ensuring compatibility and system stability.
+- **[Deploying Neural Networks on Edge Devices for Robotics](/wiki/computing/deploying-neural-networks-edge-devices/)**
+ A comprehensive guide to the edge deployment pipeline: exporting models to ONNX, optimizing with TensorRT, applying FP16/INT8 quantization, and integrating inference engines into ROS 2 nodes on platforms like NVIDIA Jetson and Google Coral.
+
## Resources
### Development and Setup
diff --git a/wiki/knowledge-graph/index.html b/wiki/knowledge-graph/index.html
new file mode 100644
index 00000000..5df2de6b
--- /dev/null
+++ b/wiki/knowledge-graph/index.html
@@ -0,0 +1,432 @@
+---
+layout: default
+title: Knowledge Graph
+---
+<!-- Knowledge graph page text: "Initializing Knowledge Graph...", "Robotics Knowledge Graph", "No matching articles found" -->
diff --git a/wiki/machine-learning/foundation-models-robot-task-planning.md b/wiki/machine-learning/foundation-models-robot-task-planning.md
new file mode 100644
index 00000000..c431a923
--- /dev/null
+++ b/wiki/machine-learning/foundation-models-robot-task-planning.md
@@ -0,0 +1,471 @@
+---
+date: 2026-04-30
+title: Foundation Models (LLMs/VLMs) for Robot Task Planning
+---
+Foundation models, including large language models (LLMs) and vision-language models (VLMs), have rapidly become a transformative force in robotics. By leveraging the broad world knowledge, common-sense reasoning, and generalization capabilities acquired during internet-scale pre-training, these models enable robots to interpret open-ended natural language instructions, reason about scenes, and compose long-horizon task plans without task-specific reward engineering. This article surveys the key approaches for integrating foundation models into robotic systems, covering affordance-grounded planners like SayCan, code-generation frameworks such as Code as Policies, vision-language-action models like RT-2, and 3D value-map composition with VoxPoser. Practical implementation patterns, code examples with ROS 2, and best practices for safe deployment are also discussed.
+
+## Why Foundation Models Matter for Robotics
+
+Traditional robot task planning relies on manually specified symbolic planners (PDDL, behavior trees) or task-specific learned policies. These approaches struggle with open-vocabulary instructions, novel objects, and long-horizon reasoning. Foundation models address these limitations by providing:
+
+- **Broad world knowledge**: Understanding of object properties, spatial relationships, and common-sense physics acquired from large-scale text and image data.
+- **Language grounding**: The ability to interpret free-form natural language commands and map them to structured action sequences.
+- **Zero-shot generalization**: Handling novel objects and task descriptions without retraining.
+- **Compositional reasoning**: Breaking complex instructions into ordered subtask sequences through chain-of-thought prompting.
+
+The general paradigm is to use a foundation model as a high-level semantic planner that proposes candidate actions or code, while low-level controllers (motion planners, PID controllers, learned visuomotor policies) handle execution. This separation of concerns keeps the system modular and allows each layer to be improved independently.
+
+## Key Approaches
+
+### SayCan: Grounding Language in Robot Affordances
+
+SayCan, introduced by Ahn et al. (2022), addresses a fundamental problem: LLMs can suggest semantically reasonable actions, but they have no knowledge of what the robot can actually do in its current environment. SayCan combines the "say" (LLM knowledge of useful actions) with the "can" (learned affordance functions that score the feasibility of each action given the current state).
+
+The scoring mechanism works as follows. Given a natural language instruction $i$ and a set of candidate skills $\{a_1, a_2, \ldots, a_n\}$, SayCan selects the next action by:
+
+$$a^* = \arg\max_{a_k} \; p_{\text{LLM}}(a_k \mid i, h) \cdot p_{\text{afford}}(a_k \mid s)$$
+
+where:
+- $p_{\text{LLM}}(a_k \mid i, h)$ is the LLM's probability that skill $a_k$ is a useful next step given instruction $i$ and history $h$
+- $p_{\text{afford}}(a_k \mid s)$ is the affordance score from a learned value function indicating how likely $a_k$ is to succeed in the current state $s$
+
+The affordance functions are typically trained via reinforcement learning on real robot data. Each primitive skill (e.g., "pick up the sponge", "go to the counter") has an associated value function that provides a success probability. This product ensures that the selected action is both semantically relevant and physically executable.
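+
+The selection rule itself is straightforward to implement once both scores are available. The sketch below assumes you already have per-skill LLM log-probabilities and affordance scores; the numbers are purely illustrative:
+
+```python
+import numpy as np
+
+def select_skill(skills, llm_log_probs, affordance_scores):
+    """SayCan-style selection: combine LLM usefulness with affordance feasibility.
+
+    skills: list of skill strings, e.g. ["pick up the sponge", "go to the counter"]
+    llm_log_probs: log p_LLM(skill | instruction, history) for each skill
+    affordance_scores: p_afford(skill | state) in [0, 1] for each skill
+    """
+    llm_probs = np.exp(np.array(llm_log_probs))
+    combined = llm_probs * np.array(affordance_scores)   # p_LLM * p_afford
+    return skills[int(np.argmax(combined))], combined
+
+skills = ["pick up the sponge", "go to the counter", "open the drawer"]
+best, scores = select_skill(skills, [-0.4, -1.2, -2.3], [0.9, 0.95, 0.1])
+print(best, scores)
+```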
+
+#### Strengths and Limitations
+
+SayCan works well for table-top manipulation and mobile manipulation with a fixed skill library. However, it requires pre-trained affordance functions for every skill, making it difficult to scale to new capabilities. The discrete skill set also limits the expressiveness of plans.
+
+### Code as Policies and ProgPrompt
+
+Rather than selecting from a fixed skill library, Code as Policies (CaP), proposed by Liang et al. (2023), prompts an LLM to directly generate executable Python code that calls robot perception and control APIs. The key insight is that code is a more expressive and composable representation for robot behavior than flat action sequences.
+
+A typical prompt provides the LLM with:
+1. Available API functions (e.g., `pick(obj)`, `place(obj, location)`, `get_obj_pos(name)`)
+2. Perception query functions (e.g., `detect_objects()`, `get_color(obj)`)
+3. A few in-context examples mapping instructions to code
+4. The current instruction
+
+```python
+# Example: Code as Policies prompt structure
+SYSTEM_PROMPT = """You are a robot policy generator. You have access to the
+following functions:
+
+# Perception
+detect_objects() -> list[str] # returns names of visible objects
+get_obj_pos(name: str) -> tuple # returns (x, y, z) position
+get_obj_color(name: str) -> str # returns color string
+
+# Actions
+pick(name: str) -> bool # grasp the named object
+place(name: str, pos: tuple) -> bool # place held object at position
+move_to(pos: tuple) -> bool # move end-effector to position
+say(text: str) -> None # speak text aloud
+
+Write Python code to accomplish the user's instruction.
+Use only the functions above. Add error handling."""
+
+# Example in-context demonstration
+EXAMPLE = """
+# Instruction: "Put the red block on top of the blue block"
+objects = detect_objects()
+red_block = [o for o in objects if 'red' in get_obj_color(o) and 'block' in o][0]
+blue_block = [o for o in objects if 'blue' in get_obj_color(o) and 'block' in o][0]
+blue_pos = get_obj_pos(blue_block)
+pick(red_block)
+# Stack on top: offset z by block height
+place(red_block, (blue_pos[0], blue_pos[1], blue_pos[2] + 0.05))
+"""
+```
+
+ProgPrompt (Singh et al., 2023) extends this idea by generating Pythonic programs with explicit assertions and precondition checks, making the generated plans more robust to execution failures.
+
+#### Strengths and Limitations
+
+Code generation is highly flexible and compositional — loops, conditionals, and arithmetic come for free. However, it introduces risks: LLM-generated code can contain bugs, call undefined functions, or produce unsafe motions. Sandboxing and validation are essential.
+
+### RT-2 and RT-X: Vision-Language-Action Models
+
+RT-2 (Brohan et al., 2023) takes a fundamentally different approach by fine-tuning a VLM end-to-end to directly output robot actions. Rather than using a foundation model as a planner that calls external controllers, RT-2 treats action generation as a sequence modeling problem. Robot actions are tokenized as text strings (e.g., discretized into bins), and the model is trained on both internet-scale vision-language data and robot demonstration data.
+
+The architecture builds on PaLI-X or PaLM-E as the backbone VLM. Given an image observation $o_t$ and a language instruction $l$, RT-2 outputs action tokens:
+
+$$a_t = \text{VLM}(o_t, l)$$
+
+where $a_t$ is a discretized action vector including end-effector displacement, rotation, and gripper state.
+
+RT-X extends this concept across multiple robot embodiments. The Open X-Embodiment dataset aggregates demonstration data from over 20 different robot platforms. Cross-embodiment training produces policies that transfer better to new robots and tasks than single-embodiment training.
+
+#### Strengths and Limitations
+
+VLA models offer the tightest integration between perception, language understanding, and action — everything is in one forward pass. However, they require large-scale robot demonstration data for fine-tuning, inference latency can be high (hundreds of milliseconds per action), and the monolithic architecture is harder to debug than modular systems.
+
+### VoxPoser: 3D Value Maps from VLMs
+
+VoxPoser (Huang et al., 2023) uses LLMs and VLMs to compose 3D affordance and constraint maps in the robot's workspace, which are then used by a motion planner. Given an instruction like "pour water into the cup without spilling", VoxPoser:
+
+1. Uses an LLM to decompose the instruction into spatial objectives and constraints
+2. Queries a VLM (or open-vocabulary detector) to localize relevant objects in 3D
+3. Composes a voxel value map where high values indicate goal regions and low values indicate obstacle or constraint regions
+4. Feeds the value map to a motion planner (e.g., MPC or trajectory optimization) as a cost function
+
+This approach is zero-shot — it does not require any robot demonstration data. The 3D value maps serve as an interface between the semantic understanding of foundation models and the geometric reasoning of classical planners.
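+
+A toy NumPy sketch of step 3 is shown below: it composes a value map from one goal position and one obstacle position (both assumed to come from the perception stack). A real system would add constraint terms generated by the LLM and use a finer resolution:
+
+```python
+import numpy as np
+
+def compose_value_map(goal_xyz, obstacle_xyz, workspace=(0.8, 0.8, 0.5), res=0.02):
+    """Build a 3D value map: high values near the goal, low values near obstacles."""
+    nx, ny, nz = (int(d / res) for d in workspace)
+    xs, ys, zs = np.meshgrid(
+        np.arange(nx) * res, np.arange(ny) * res, np.arange(nz) * res, indexing="ij"
+    )
+    grid = np.stack([xs, ys, zs], axis=-1)
+
+    # Attractive term: Gaussian bump centered on the goal region
+    d_goal = np.linalg.norm(grid - np.array(goal_xyz), axis=-1)
+    value = np.exp(-(d_goal ** 2) / (2 * 0.05 ** 2))
+
+    # Repulsive term: penalize voxels near the obstacle / constraint region
+    d_obs = np.linalg.norm(grid - np.array(obstacle_xyz), axis=-1)
+    value -= 2.0 * np.exp(-(d_obs ** 2) / (2 * 0.08 ** 2))
+    return value   # a motion planner can use -value as a cost function
+
+value_map = compose_value_map(goal_xyz=[0.5, 0.3, 0.2], obstacle_xyz=[0.4, 0.3, 0.2])
+print(value_map.shape, value_map.max(), value_map.min())
+```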
+
+### CLIP and Open-Vocabulary Detection
+
+CLIP (Contrastive Language-Image Pretraining) and its derivatives (OWL-ViT, Grounding DINO, GLIP) enable robots to detect and localize objects described by arbitrary text queries, eliminating the need for fixed object categories.
+
+In a robotics pipeline, open-vocabulary detection typically serves as the perception backbone:
+
+```python
+# Using Grounding DINO for open-vocabulary object detection
+from groundingdino.util.inference import load_model, predict
+
+# Load the pre-trained Grounding DINO model
+model = load_model("groundingdino/config/GroundingDINO_SwinT.py",
+ "weights/groundingdino_swint_ogc.pth")
+
+# Detect objects matching a text query in the camera image
+def detect_objects(image, text_query, box_threshold=0.3, text_threshold=0.25):
+ """Detect objects in image matching the natural language query.
+
+ Args:
+ image: RGB image from robot camera
+ text_query: natural language description (e.g., "red mug. blue plate.")
+ box_threshold: confidence threshold for bounding boxes
+ text_threshold: confidence threshold for text matching
+
+ Returns:
+ boxes: detected bounding boxes in xyxy format
+ phrases: matched text phrases for each detection
+ """
+ boxes, logits, phrases = predict(
+ model=model,
+ image=image,
+ caption=text_query,
+ box_threshold=box_threshold,
+ text_threshold=text_threshold
+ )
+ return boxes, phrases
+```
+
+Open-vocabulary detectors are often combined with depth cameras to produce 3D object poses, which are then fed to grasp planners or the foundation model planner.
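+
+The projection from a 2D detection to a 3D position uses the standard pinhole model. A minimal sketch, assuming a depth image aligned to the RGB frame and known intrinsics `fx, fy, cx, cy`:
+
+```python
+import numpy as np
+
+def box_center_to_3d(box_xyxy, depth_image, fx, fy, cx, cy):
+    """Back-project the center of a 2D bounding box to a 3D point (camera frame).
+
+    box_xyxy: (x1, y1, x2, y2) pixel coordinates from the open-vocabulary detector
+    depth_image: per-pixel depth in meters, aligned with the RGB image
+    """
+    u = int((box_xyxy[0] + box_xyxy[2]) / 2)   # pixel column of the box center
+    v = int((box_xyxy[1] + box_xyxy[3]) / 2)   # pixel row of the box center
+    z = float(depth_image[v, u])               # depth at the center, in meters
+    x = (u - cx) * z / fx                      # pinhole back-projection
+    y = (v - cy) * z / fy
+    return np.array([x, y, z])
+```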
+
+## Architecture Patterns
+
+### Hierarchical Planning: Foundation Model + Low-Level Controller
+
+The most common architecture separates high-level semantic planning from low-level control:
+
+```
+┌─────────────────────────────────────────────┐
+│ Natural Language Instruction │
+└──────────────────┬──────────────────────────┘
+ │
+┌──────────────────▼──────────────────────────┐
+│ Foundation Model (LLM / VLM) │
+│ - Task decomposition │
+│ - Subtask sequencing │
+│ - Code or plan generation │
+└──────────────────┬──────────────────────────┘
+ │ Subtask / Code / Action
+┌──────────────────▼──────────────────────────┐
+│ Skill Library / Low-Level Controllers │
+│ - MoveIt for motion planning │
+│ - Diffusion policy for manipulation │
+│ - PID / MPC for locomotion │
+└──────────────────┬──────────────────────────┘
+ │ Joint commands
+┌──────────────────▼──────────────────────────┐
+│ Robot Hardware │
+└─────────────────────────────────────────────┘
+```
+
+This pattern keeps the foundation model in the loop for high-level decisions while relying on well-tested controllers for safe physical execution. It also allows swapping foundation models without changing the control stack.
+
+### Prompt Engineering for Robotics
+
+Effective prompting for robot task planning differs from general LLM prompting:
+
+1. **Define the action space explicitly**: List every available function with type signatures and brief descriptions. The LLM cannot infer capabilities it was not told about.
+2. **Provide physical constraints**: Include workspace bounds, payload limits, and collision constraints in the system prompt.
+3. **Use structured output formats**: Request JSON or Python code rather than free-form text to simplify parsing.
+4. **Include failure recovery**: Prompt the model to include try/except blocks or precondition checks.
+5. **Chain-of-thought for sequencing**: Ask the model to first list the steps in natural language, then generate code for each step.
+
+### Vision-Language Grounding Pipeline
+
+For tasks requiring visual understanding, a common pipeline is:
+
+1. Capture RGB-D image from robot camera
+2. Run open-vocabulary detector (Grounding DINO, OWL-ViT) to localize mentioned objects
+3. Project detections to 3D using depth data and camera intrinsics
+4. Pass object names, positions, and spatial relations to the LLM planner
+5. LLM generates an action plan referencing objects by name
+6. Execute plan using low-level controllers with 3D target positions
+
+## Practical Implementation: LLM Task Planner with ROS 2
+
+The following example demonstrates a minimal LLM-based task planner integrated with ROS 2. The node receives a natural language instruction, queries an LLM to generate a structured plan, and publishes action goals.
+
+```python
+#!/usr/bin/env python3
+"""ROS 2 node that uses an LLM to generate pick-and-place task plans."""
+
+import json
+import rclpy
+from rclpy.node import Node
+from std_msgs.msg import String
+from geometry_msgs.msg import PoseStamped
+from openai import OpenAI
+
+
+class LLMTaskPlanner(Node):
+ def __init__(self):
+ super().__init__('llm_task_planner')
+
+ # Subscribe to natural language commands
+ self.cmd_sub = self.create_subscription(
+ String, '/task_command', self.command_callback, 10)
+
+ # Publisher for action goals (consumed by a MoveIt action server)
+ self.goal_pub = self.create_publisher(
+ PoseStamped, '/move_goal', 10)
+
+ # Publisher for planner status feedback
+ self.status_pub = self.create_publisher(
+ String, '/planner_status', 10)
+
+ # Initialize the LLM client
+ self.llm_client = OpenAI()
+
+ # Known object positions from perception (updated by detector node)
+ self.object_positions = {}
+ self.obj_sub = self.create_subscription(
+ String, '/detected_objects', self.objects_callback, 10)
+
+ self.get_logger().info('LLM Task Planner ready.')
+
+ def objects_callback(self, msg):
+ """Update known object positions from the perception pipeline."""
+ self.object_positions = json.loads(msg.data)
+
+ def command_callback(self, msg):
+ """Handle incoming natural language task commands."""
+ instruction = msg.data
+ self.get_logger().info(f'Received instruction: {instruction}')
+
+ # Build the prompt with current scene context
+ plan = self.generate_plan(instruction)
+ if plan is None:
+ self.publish_status('Planning failed: LLM returned no valid plan.')
+ return
+
+ # Execute the generated plan step by step
+ self.execute_plan(plan)
+
+ def generate_plan(self, instruction: str) -> list:
+ """Query the LLM to produce a structured task plan.
+
+ Returns a list of action dicts, e.g.:
+ [{"action": "pick", "object": "red_cup"},
+ {"action": "place", "object": "red_cup", "target": [0.5, 0.2, 0.1]}]
+ """
+ # Format known objects into a scene description
+ scene = json.dumps(self.object_positions, indent=2)
+
+ system_prompt = f"""You are a robot task planner. The robot has a
+single arm with a parallel-jaw gripper. It can perform these actions:
+
+- pick(object_name): grasp the named object at its known position
+- place(object_name, [x, y, z]): place the held object at the target
+- move_to([x, y, z]): move gripper to a position without grasping
+
+Current scene (object positions in meters):
+{scene}
+
+Workspace bounds: x=[0.0, 0.8], y=[-0.4, 0.4], z=[0.0, 0.5]
+
+Output a JSON array of action steps. Each step has:
+ "action": one of "pick", "place", "move_to"
+ "object": object name (for pick/place)
+ "target": [x, y, z] (for place/move_to)
+
+Only use objects that appear in the current scene. Verify target
+positions are within workspace bounds. Output ONLY valid JSON."""
+
+ try:
+ response = self.llm_client.chat.completions.create(
+ model="gpt-4o",
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": instruction}
+ ],
+ temperature=0.0,
+ max_tokens=1024
+ )
+ plan_text = response.choices[0].message.content.strip()
+
+ # Strip markdown fences if present
+ if plan_text.startswith("```"):
+ plan_text = plan_text.split("\n", 1)[1].rsplit("```", 1)[0]
+
+ plan = json.loads(plan_text)
+ self.get_logger().info(f'Generated plan with {len(plan)} steps.')
+ return plan
+
+ except Exception as e:
+ self.get_logger().error(f'LLM planning failed: {e}')
+ return None
+
+ def execute_plan(self, plan: list):
+ """Execute a structured plan by publishing goal poses."""
+ for i, step in enumerate(plan):
+ action = step.get("action")
+ self.publish_status(f'Step {i+1}/{len(plan)}: {action}')
+
+ if action == "pick":
+ obj_name = step["object"]
+ if obj_name not in self.object_positions:
+ self.publish_status(f'Object {obj_name} not found.')
+ return
+ pos = self.object_positions[obj_name]
+ self.publish_goal(pos)
+
+ elif action in ("place", "move_to"):
+ target = step.get("target")
+ if target is None:
+ self.publish_status(f'No target for {action}.')
+ return
+ self.publish_goal(target)
+
+ def publish_goal(self, position: list):
+ """Publish a PoseStamped goal for the motion planner."""
+ goal = PoseStamped()
+ goal.header.frame_id = "base_link"
+ goal.header.stamp = self.get_clock().now().to_msg()
+ goal.pose.position.x = float(position[0])
+ goal.pose.position.y = float(position[1])
+ goal.pose.position.z = float(position[2])
+ goal.pose.orientation.w = 1.0
+ self.goal_pub.publish(goal)
+
+ def publish_status(self, text: str):
+ """Publish a status message for monitoring."""
+ msg = String()
+ msg.data = text
+ self.status_pub.publish(msg)
+ self.get_logger().info(text)
+
+
+def main(args=None):
+ rclpy.init(args=args)
+ node = LLMTaskPlanner()
+ rclpy.spin(node)
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == '__main__':
+ main()
+```
+
+To use this node:
+
+1. Launch your robot's perception pipeline to populate the `/detected_objects` topic with a JSON dictionary of object names and `[x, y, z]` positions.
+2. Start the planner: `ros2 run your_package llm_task_planner`
+3. Send a command: `ros2 topic pub --once /task_command std_msgs/msg/String "data: 'pick up the red mug and place it on the shelf'"` (`--once` publishes a single message instead of repeating at 1 Hz)
+
+The planner queries the LLM, generates a structured JSON plan, and publishes sequential goal poses for a downstream motion planner (e.g., MoveIt 2) to execute.
+
+## Challenges
+
+### Latency
+
+Foundation model inference introduces significant latency. A single GPT-4o API call takes 500ms–3s depending on prompt length and output size. For real-time control at 10–100 Hz, this is prohibitive. The standard mitigation is to use the foundation model only for high-level planning (called once per task or subtask) and rely on fast low-level controllers for real-time execution.
+
+On-device models (e.g., quantized LLaMA variants on NVIDIA Jetson) can reduce latency to 100–300ms per query but sacrifice capability. Recent work on speculative decoding and KV-cache optimization is closing this gap.
+
+### Hallucination and Correctness
+
+LLMs may generate plans that reference non-existent objects, call undefined APIs, violate physical constraints, or skip critical safety steps. Common mitigations include:
+
+- **Grounding with perception**: Verify all referenced objects exist in the current scene before execution.
+- **Affordance scoring**: Use learned value functions (as in SayCan) to filter infeasible actions.
+- **Code validation**: Parse and statically analyze generated code before execution. Check that all function calls are in the allowed API set.
+- **Plan verification**: Use a secondary model or rule-based checker to validate the plan against known constraints, as in the validator sketch below.
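+
+The grounding, API, and bounds checks above can be combined into a lightweight rule-based validator that runs before any goal is published. The sketch below is illustrative and assumes the JSON plan format and workspace bounds used by the `LLMTaskPlanner` node earlier in this article; `validate_plan` and its helpers are our own names.
+
+```python
+# Minimal rule-based plan validator (illustrative; assumes the JSON plan
+# format and workspace bounds from the LLMTaskPlanner example above).
+ALLOWED_ACTIONS = {"pick", "place", "move_to"}
+WORKSPACE = {"x": (0.0, 0.8), "y": (-0.4, 0.4), "z": (0.0, 0.5)}
+
+def in_workspace(target):
+    """Check that an [x, y, z] target lies within the workspace bounds."""
+    return all(lo <= v <= hi for v, (lo, hi) in zip(target, WORKSPACE.values()))
+
+def validate_plan(plan, object_positions):
+    """Return (True, "") if the plan passes all checks, else (False, reason)."""
+    if not isinstance(plan, list) or not plan:
+        return False, "Plan must be a non-empty list of steps."
+    for i, step in enumerate(plan):
+        action = step.get("action")
+        if action not in ALLOWED_ACTIONS:
+            return False, f"Step {i}: unknown action '{action}'."
+        if action in ("pick", "place"):
+            obj = step.get("object")
+            if obj not in object_positions:
+                return False, f"Step {i}: object '{obj}' not in scene."
+        if action in ("place", "move_to"):
+            target = step.get("target")
+            if (not isinstance(target, list) or len(target) != 3
+                    or not in_workspace(target)):
+                return False, f"Step {i}: target {target} outside workspace."
+    return True, ""
+```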
+
+### Safety
+
+Executing LLM-generated actions on physical hardware introduces safety risks. A malformed plan could cause collisions, drop objects, or damage the robot. Essential safeguards include:
+
+- **Workspace bounds checking**: Reject any target position outside the robot's safe workspace.
+- **Collision checking**: Run generated trajectories through a collision checker (e.g., MoveIt's planning scene) before execution.
+- **Human-in-the-loop confirmation**: For high-stakes actions, display the plan and require human approval before execution.
+- **Emergency stop integration**: Ensure the system respects hardware e-stop signals at all times, independent of the planner.
+
+### Compute Requirements
+
+VLA models like RT-2 require powerful GPUs (A100-class or higher) for real-time inference. Deploying these on mobile robots with limited compute budgets is challenging. Edge deployment strategies include model distillation, quantization (INT8/INT4), and offloading inference to a cloud server connected via low-latency networking.
+
+### Sim-to-Real Gap for Learned Representations
+
+Foundation models trained on internet data may have representation biases that do not align with the robot's sensory modality. For example, CLIP embeddings trained on web images may not generalize well to low-resolution depth images from a wrist-mounted camera. Domain adaptation, fine-tuning on robot-specific data, or using robot-specialized VLMs (e.g., SpatialVLM, RoboVLM) can mitigate this gap.
+
+## Best Practices
+
+### Prompt Design
+
+- **Be explicit about the action space**: List every available function with types and descriptions. Omit nothing the model needs.
+- **Include physical constraints**: Workspace bounds, payload limits, reachability constraints, and collision objects should be in the prompt.
+- **Provide diverse examples**: Include 3–5 in-context demonstrations covering different task types (sorting, stacking, tool use). A minimal few-shot sketch follows this list.
+- **Request structured output**: JSON or Python code is far easier to parse and validate than free-form text.
+- **Use chain-of-thought**: Prompt the model to reason step by step before generating the final plan. This improves accuracy on complex, multi-step tasks.
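+
+To make the in-context demonstration and structured-output advice concrete, the snippet below shows one way to prepend a few-shot example pair to the chat messages built in the planner node above. The example instruction, plan, and coordinates are hypothetical.
+
+```python
+# Hypothetical few-shot demonstration prepended to the chat messages.
+# The example instruction and plan are illustrative only; system_prompt
+# and instruction are assumed to exist as in the planner node above.
+few_shot_messages = [
+    {"role": "user",
+     "content": "Put the blue block on top of the green block."},
+    {"role": "assistant",
+     "content": '[{"action": "pick", "object": "blue_block"},'
+                ' {"action": "place", "object": "blue_block",'
+                ' "target": [0.45, -0.10, 0.08]}]'},
+]
+
+messages = (
+    [{"role": "system", "content": system_prompt}]
+    + few_shot_messages
+    + [{"role": "user", "content": instruction}]
+)
+```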
+
+### Fallback Strategies
+
+- **Retry with rephrasing**: If the LLM returns an invalid plan, retry with a more constrained prompt that narrows the output format.
+- **Graceful degradation**: If planning fails after retries, fall back to a pre-defined behavior tree or a safe home position. See the [Behavior Trees](/wiki/planning/behavior-tree/) article for implementing fallback behaviors.
+- **Incremental execution**: Execute one subtask at a time, re-querying the LLM after each step with updated scene state. This allows mid-plan correction. A combined retry-and-fallback sketch follows this list.
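+
+A minimal sketch combining retry-with-rephrasing and graceful degradation is shown below, written as an additional method of the `LLMTaskPlanner` node; `validate_plan` is the rule-based checker sketched earlier and `fallback_to_home` is a hypothetical safe behavior.
+
+```python
+# Illustrative retry-and-fallback loop for the LLMTaskPlanner node.
+# validate_plan is the rule-based checker sketched above;
+# fallback_to_home is a hypothetical safe behavior.
+MAX_RETRIES = 2
+
+def plan_with_fallback(self, instruction):
+    prompt = instruction
+    for attempt in range(MAX_RETRIES + 1):
+        plan = self.generate_plan(prompt)
+        if plan is None:
+            reason = "LLM returned no parseable plan."
+        else:
+            ok, reason = validate_plan(plan, self.object_positions)
+            if ok:
+                return plan
+        # Retry with a more constrained prompt that states the failure
+        prompt = (instruction
+                  + f"\nThe previous attempt failed: {reason}"
+                  + "\nOutput ONLY a valid JSON array of steps.")
+    # Graceful degradation: fall back to a safe pre-defined behavior
+    self.publish_status("Planning failed after retries; returning to home.")
+    self.fallback_to_home()
+    return None
+```
+
+Incremental execution follows the same pattern: plan only the next subtask, execute it, refresh the scene from `/detected_objects`, and re-plan.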
+
+### Human-in-the-Loop Verification
+
+For deployment outside controlled lab settings, consider a confirmation loop:
+
+1. LLM generates a plan.
+2. The plan is displayed to a human operator (on a screen or via speech).
+3. The operator approves, modifies, or rejects the plan.
+4. Only approved plans are executed.
+
+This pattern is especially important during the early stages of deployment when trust in the system has not yet been established through extensive testing.
+
+### Evaluation and Testing
+
+- **Unit test the prompt**: Create a test suite of instruction-scene pairs with expected plan outputs. Run these against the LLM and check correctness. A pytest-style sketch follows this list.
+- **Simulation testing**: Execute generated plans in simulation (e.g., Isaac Sim, Gazebo) before deploying on hardware.
+- **Log everything**: Record all instructions, generated plans, execution traces, and outcomes for post-hoc analysis and prompt refinement.
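+
+As a starting point for a prompt regression suite, the pytest sketch below checks plan structure against expected action sequences; `generate_plan_from(instruction, scene)` is a hypothetical wrapper around the planning call that accepts a scene dictionary directly, and the test cases are illustrative.
+
+```python
+# Illustrative pytest suite for prompt regression testing.
+# generate_plan_from(instruction, scene) is a hypothetical wrapper around
+# the LLM planning call that accepts the scene dict directly.
+import pytest
+
+TEST_CASES = [
+    {
+        "instruction": "pick up the red cup",
+        "scene": {"red_cup": [0.4, 0.1, 0.02]},
+        "expected_actions": ["pick"],
+    },
+    {
+        "instruction": "move the red cup next to the bowl",
+        "scene": {"red_cup": [0.4, 0.1, 0.02], "bowl": [0.5, -0.2, 0.03]},
+        "expected_actions": ["pick", "place"],
+    },
+]
+
+@pytest.mark.parametrize("case", TEST_CASES)
+def test_plan_structure(case):
+    plan = generate_plan_from(case["instruction"], case["scene"])
+    assert plan is not None
+    assert [step["action"] for step in plan] == case["expected_actions"]
+    for step in plan:
+        if step["action"] in ("pick", "place"):
+            assert step["object"] in case["scene"]
+```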
+
+## Summary
+
+Foundation models have opened a new paradigm for robot task planning, enabling systems that understand open-ended natural language instructions and reason about novel objects and scenes. The key architectural insight is to use foundation models for high-level semantic reasoning while relying on established low-level controllers for safe physical execution. Approaches range from affordance-grounded planning (SayCan) to code generation (Code as Policies) to end-to-end vision-language-action models (RT-2). Each offers different tradeoffs between flexibility, data requirements, and integration complexity. Successful deployment requires careful attention to safety, latency, hallucination mitigation, and human oversight. As foundation models continue to improve in speed and reliability, their role in robotics will expand from research prototypes to production systems.
+
+## See Also:
+- [NLP for Robotics](/wiki/machine-learning/nlp-for-robotics/) - Background on transformer models and natural language processing for robotic systems
+- [Introduction to Diffusion Models and Diffusion Policy](/wiki/machine-learning/intro-to-diffusion/) - Diffusion-based visuomotor policies used as low-level controllers in foundation model architectures
+- [Imitation Learning With a Focus on Humanoids](/wiki/machine-learning/imitation-learning/) - Data collection and policy training for humanoid robots, including NVIDIA GR00T's System-1/System-2 architecture
+- [Behavior Trees](/wiki/planning/behavior-tree/) - Structured fallback behaviors for when foundation model planning fails
+
+## Further Reading
+- [Google DeepMind RT-2 Blog Post](https://deepmind.google/discover/blog/rt-2-new-model-translates-vision-and-language-into-action/) - Overview of the RT-2 vision-language-action model with demonstration videos
+- [Open X-Embodiment Project](https://robotics-transformer-x.github.io/) - Cross-embodiment robot learning dataset and models spanning 20+ robot platforms
+- [Code as Policies Project Page](https://code-as-policies.github.io/) - Interactive examples of LLM-generated robot policies with video demonstrations
+- [VoxPoser Project Page](https://voxposer.github.io/) - Zero-shot 3D value map composition from vision-language models for manipulation
+- [Grounding DINO GitHub Repository](https://github.com/IDEA-Research/GroundingDINO) - Open-vocabulary object detection model commonly used in robotic perception pipelines
+
+## References
+- M. Ahn et al., "Do As I Can, Not As I Say: Grounding Language in Robotic Affordances," in *Proc. Conference on Robot Learning (CoRL)*, 2022.
+- A. Brohan et al., "RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control," in *Proc. Conference on Robot Learning (CoRL)*, 2023.
+- S. Huang et al., "VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models," in *Proc. Conference on Robot Learning (CoRL)*, 2023.
+- J. Liang et al., "Code as Policies: Language Model Programs for Embodied Control," in *Proc. IEEE International Conference on Robotics and Automation (ICRA)*, 2023.
+- A. Radford et al., "Learning Transferable Visual Models From Natural Language Supervision," in *Proc. International Conference on Machine Learning (ICML)*, 2021.
+- I. Singh et al., "ProgPrompt: Generating Situated Robot Task Plans Using Large Language Models," in *Proc. IEEE International Conference on Robotics and Automation (ICRA)*, 2023.
+- Open X-Embodiment Collaboration, "Open X-Embodiment: Robotic Learning Datasets and RT-X Models," in *Proc. IEEE International Conference on Robotics and Automation (ICRA)*, 2024.
diff --git a/wiki/machine-learning/index.md b/wiki/machine-learning/index.md
index 15270315..0e3b8cd5 100644
--- a/wiki/machine-learning/index.md
+++ b/wiki/machine-learning/index.md
@@ -47,6 +47,9 @@ This section demonstrates how machine learning enhances robotic systems by enabl
- **[Imitation Learning With a Focus on Humanoids](/wiki/machine-learning/imitation-learning/)**
Covers the foundations of imitation learning and provides a practical guide for data collection and policy deployment on humanoid robots.
+- **[Foundation Models (LLMs/VLMs) for Robot Task Planning](/wiki/machine-learning/foundation-models-robot-task-planning/)**
+ Surveys key approaches for integrating large language models and vision-language models into robotic systems, including SayCan, Code as Policies, RT-2, and VoxPoser, with practical ROS 2 implementation examples.
+
## Resources
- [GIMP Installation Guide](https://www.gimp.org/)
diff --git a/wiki/simulation/index.md b/wiki/simulation/index.md
index 88c24058..7de3ec17 100644
--- a/wiki/simulation/index.md
+++ b/wiki/simulation/index.md
@@ -30,6 +30,9 @@ This section focuses on **simulation tools, techniques, and environments** for r
- **[Spawning and Controlling Vehicles in CARLA](/wiki/simulation/Spawning-and-Controlling-Vehicles-in-CARLA/)**
A hands-on tutorial for spawning and controlling vehicles in the CARLA simulator. Covers connecting to the CARLA server, visualizing waypoints, spawning vehicles, and using PID controllers for motion control. Demonstrates waypoint tracking with visual aids and includes example scripts for quick implementation.
+- **[Sim-to-Real Transfer: Domain Randomization and Adaptation](/wiki/simulation/sim-to-real-transfer/)**
+ Covers the reality gap between simulation and physical hardware, and the two dominant families of techniques for bridging it — domain randomization and domain adaptation. Includes practical code examples with NVIDIA Isaac Gym and case studies from OpenAI, ANYmal, and manipulation research.
+
## Resources
### General Simulation Tools
diff --git a/wiki/simulation/sim-to-real-transfer.md b/wiki/simulation/sim-to-real-transfer.md
new file mode 100644
index 00000000..809e79b5
--- /dev/null
+++ b/wiki/simulation/sim-to-real-transfer.md
@@ -0,0 +1,355 @@
+---
+date: 2026-04-30
+title: "Sim-to-Real Transfer: Domain Randomization and Adaptation Techniques"
+---
+Sim-to-real transfer is the process of training robotic policies in simulation and deploying them on physical hardware. Simulation offers unlimited data, safe exploration, and rapid iteration, but a persistent gap between simulated and real-world dynamics causes policies that perform well in simulation to fail on real robots. This article covers the primary sources of the reality gap, the two dominant families of techniques for bridging it — domain randomization and domain adaptation — and provides practical implementation guidance with code examples using NVIDIA Isaac Gym and Isaac Sim. Whether you are training a locomotion controller for a quadruped or a manipulation policy for a robotic arm, these techniques are essential for reliable real-world deployment.
+
+## The Reality Gap
+
+The reality gap refers to the systematic discrepancies between a simulated environment and the physical world. Understanding these discrepancies is the first step toward closing them. The gap arises from four primary sources:
+
+### Visual Discrepancies
+
+Rendered images in simulation differ from real camera feeds in texture detail, lighting conditions, reflections, shadows, lens distortion, motion blur, and color balance. A perception model trained exclusively on synthetic images will often fail when confronted with real visual input.
+
+### Dynamics Discrepancies
+
+Physics engines approximate the real world using simplified contact models, rigid-body assumptions, and discrete-time integration. Parameters such as mass, center of gravity, friction coefficients, joint damping, and restitution are never perfectly known. Small errors in these values compound over long rollouts, causing trajectory divergence.
+
+### Sensor Noise and Latency
+
+Real sensors exhibit noise distributions, biases, quantization artifacts, and latency that are difficult to replicate exactly. An IMU in simulation may report perfect angular velocities, whereas the physical IMU adds gyroscope bias drift and accelerometer noise. Similarly, depth cameras produce noisy point clouds with missing data near reflective or transparent surfaces.
+
+### Actuator Modeling
+
+Real actuators have nonlinear torque-speed curves, backlash, gear friction, thermal effects, and communication delays between the controller and the motor driver. Simulation often models actuators as ideal torque or velocity sources, ignoring these effects.
+
+> The reality gap is not a single problem but a collection of mismatches across perception, dynamics, sensing, and actuation. Effective sim-to-real transfer addresses multiple sources simultaneously.
+
+## Domain Randomization
+
+Domain randomization (DR) is the strategy of training a policy across a wide distribution of simulated environments so that the real world appears as just another sample from that distribution. Rather than building a single high-fidelity simulator, DR exposes the policy to enough variation that it learns to be robust to the specific conditions it will encounter on hardware.
+
+### Mathematical Formulation
+
+Let $\xi$ denote the vector of environment parameters (masses, frictions, visual properties, sensor noise levels, etc.). In standard reinforcement learning, a policy $\pi_\theta$ is trained in a single environment with fixed parameters $\xi_0$:
+
+$$\theta^* = \arg\max_\theta \; \mathbb{E}_{\tau \sim \pi_\theta, \xi_0} \left[ \sum_{t=0}^{T} \gamma^t r_t \right]$$
+
+With domain randomization, parameters are sampled from a distribution $P(\xi)$ at the start of each episode (or at each step), and the objective becomes:
+
+$$\theta^* = \arg\max_\theta \; \mathbb{E}_{\xi \sim P(\xi)} \; \mathbb{E}_{\tau \sim \pi_\theta, \xi} \left[ \sum_{t=0}^{T} \gamma^t r_t \right]$$
+
+The key insight is that if the real-world parameters $\xi_{\text{real}}$ fall within the support of $P(\xi)$, the trained policy is likely to generalize. The distribution $P(\xi)$ is typically uniform or Gaussian over manually specified ranges.
+
+### Visual Randomization
+
+Visual domain randomization modifies the appearance of the simulated scene to prevent the perception module from overfitting to synthetic rendering artifacts. Common randomizations include:
+
+- **Textures**: Random textures applied to objects, floors, walls, and the robot body.
+- **Lighting**: Random number, position, intensity, and color of light sources. Random ambient light levels.
+- **Camera intrinsics**: Small perturbations to focal length, principal point, and distortion coefficients.
+- **Camera extrinsics**: Jitter in camera position and orientation relative to the nominal mount.
+- **Distractors**: Random geometric shapes placed in the scene background.
+- **Post-processing**: Random noise, blur, color jitter, and contrast adjustments applied to rendered images.
+
+Visual randomization is particularly important for policies that consume raw image input. For policies that use only proprioceptive state (joint angles, velocities), visual randomization is unnecessary.
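+
+For the post-processing randomizations listed above, a simple image-space pipeline can be assembled from standard torchvision transforms. This is a minimal sketch; the jitter ranges and noise level are arbitrary examples rather than tuned recommendations.
+
+```python
+# Illustrative image-space randomization applied to rendered frames.
+# Ranges are arbitrary examples, not tuned recommendations.
+import torch
+from torchvision import transforms
+
+visual_randomization = transforms.Compose([
+    transforms.ColorJitter(brightness=0.4, contrast=0.4,
+                           saturation=0.4, hue=0.05),
+    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
+])
+
+def randomize_image(img):
+    """Apply visual randomization plus additive Gaussian noise.
+
+    Expects a float tensor of shape (C, H, W) with values in [0, 1].
+    """
+    img = visual_randomization(img)
+    img = img + 0.02 * torch.randn_like(img)
+    return img.clamp(0.0, 1.0)
+```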
+
+### Dynamics Randomization
+
+Dynamics randomization perturbs the physical parameters of the simulation:
+
+| Parameter | Typical Range | Notes |
+|-----------|--------------|-------|
+| Link masses | ±15–30% | Randomize each link independently |
+| Center of mass offsets | ±1–3 cm | Per link, all three axes |
+| Joint friction | ±20–50% | Coulomb and viscous components |
+| Joint damping | ±20–50% | Often coupled with friction |
+| Ground friction | 0.3–1.2 | Coefficient of friction |
+| Restitution | 0.0–0.5 | Bounciness of contacts |
+| Actuator strength | ±10–20% | Scales applied torques |
+| Control latency | 0–20 ms | Simulates communication delay |
+| Observation noise | Sensor-dependent | Gaussian noise on joint encoders, IMU |
+| External disturbances | Task-dependent | Random forces/torques on the base |
+
+### Practical Example: Domain Randomization in Isaac Gym
+
+The following example demonstrates how to configure dynamics randomization for a quadruped locomotion task using NVIDIA Isaac Gym. The randomization parameters are specified in a configuration dictionary and applied at each environment reset.
+
+```python
+import isaacgym
+from isaacgym import gymapi, gymutil
+import numpy as np
+import torch
+
+class QuadrupedDREnv:
+ """Quadruped locomotion environment with domain randomization."""
+
+ def __init__(self, num_envs=4096, device="cuda:0"):
+ self.num_envs = num_envs
+ self.device = device
+
+ # Define randomization ranges as (min, max) tuples
+ self.randomization_params = {
+ "base_mass_offset": (-1.0, 1.0), # kg added to base link
+ "link_mass_scale": (0.8, 1.2), # multiplicative scale per link
+ "friction_range": (0.3, 1.25), # ground friction coefficient
+ "restitution_range": (0.0, 0.4), # ground restitution
+ "joint_damping_scale": (0.8, 1.3), # scale factor for joint damping
+ "joint_friction": (0.0, 0.05), # joint-level Coulomb friction (Nm)
+ "actuator_strength_scale": (0.85, 1.15), # scale factor on torque commands
+ "control_latency_steps": (0, 3), # discrete steps of added latency
+ "observation_noise": {
+ "joint_pos": 0.01, # rad standard deviation
+ "joint_vel": 0.15, # rad/s standard deviation
+ "imu_gyro": 0.02, # rad/s standard deviation
+ "imu_accel": 0.05, # m/s^2 standard deviation
+ },
+ "push_force_range": (0.0, 30.0), # random external push force (N)
+ }
+
+ def randomize_environment(self, env_ids):
+ """Apply domain randomization to specified environments on reset."""
+ n = len(env_ids)
+
+        # Randomize base mass by adding an offset (PyTorch has no torch.uniform;
+        # sample with torch.empty(...).uniform_(low, high) instead)
+        low, high = self.randomization_params["base_mass_offset"]
+        mass_offsets = torch.empty(n, device=self.device).uniform_(low, high)
+        self.apply_mass_offsets(env_ids, mass_offsets)
+
+        # Randomize ground friction per environment
+        low, high = self.randomization_params["friction_range"]
+        frictions = torch.empty(n, device=self.device).uniform_(low, high)
+        self.apply_ground_friction(env_ids, frictions)
+
+        # Randomize actuator strength (scales the torques sent to joints)
+        low, high = self.randomization_params["actuator_strength_scale"]
+        self.actuator_scales[env_ids] = torch.empty(
+            n, self.num_joints, device=self.device
+        ).uniform_(low, high)
+
+ # Randomize control latency (integer number of steps)
+ low, high = self.randomization_params["control_latency_steps"]
+ self.latency_steps[env_ids] = torch.randint(
+ low, high + 1, size=(n,), device=self.device
+ )
+
+ def apply_observation_noise(self, obs):
+ """Add Gaussian noise to observations to simulate sensor imperfections."""
+ noise_cfg = self.randomization_params["observation_noise"]
+ obs[:, 0:12] += torch.randn_like(obs[:, 0:12]) * noise_cfg["joint_pos"]
+ obs[:, 12:24] += torch.randn_like(obs[:, 12:24]) * noise_cfg["joint_vel"]
+ obs[:, 24:27] += torch.randn_like(obs[:, 24:27]) * noise_cfg["imu_gyro"]
+ obs[:, 27:30] += torch.randn_like(obs[:, 27:30]) * noise_cfg["imu_accel"]
+ return obs
+```
+
+The corresponding YAML configuration file allows rapid iteration without modifying code:
+
+```yaml
+# domain_randomization.yaml
+domain_randomization:
+ # Dynamics randomization applied at each episode reset
+ dynamics:
+ base_mass_offset: [-1.0, 1.0] # kg, uniform
+ link_mass_scale: [0.8, 1.2] # multiplicative, uniform
+ friction_range: [0.3, 1.25] # ground coefficient, uniform
+ restitution_range: [0.0, 0.4] # ground restitution, uniform
+ joint_damping_scale: [0.8, 1.3] # multiplicative, uniform
+ joint_friction: [0.0, 0.05] # Nm, uniform
+ actuator_strength_scale: [0.85, 1.15]
+ control_latency_steps: [0, 3] # integer, uniform
+
+ # Observation noise applied every step
+ observation_noise:
+ joint_position_std: 0.01 # rad
+ joint_velocity_std: 0.15 # rad/s
+ imu_gyro_std: 0.02 # rad/s
+ imu_accel_std: 0.05 # m/s^2
+
+ # External disturbances applied periodically
+ disturbances:
+ push_force_range: [0.0, 30.0] # N, applied to base
+ push_interval_s: [5.0, 15.0] # seconds between pushes
+ push_duration_s: [0.05, 0.2] # duration of each push
+
+ # Visual randomization (for vision-based policies)
+ visual:
+ texture_randomize: true
+ light_intensity_range: [0.3, 1.5]
+ light_color_temperature: [3000, 7000] # Kelvin
+ camera_fov_jitter: 2.0 # degrees
+ camera_position_jitter: 0.005 # meters
+```
+
+## Domain Adaptation
+
+While domain randomization broadens the training distribution to encompass reality, domain adaptation takes the opposite approach: it explicitly aligns the simulated and real domains so that a policy trained in one transfers directly to the other. The two approaches are complementary and are often combined.
+
+### System Identification
+
+System identification (SysID) estimates the physical parameters of the real system and configures the simulator to match. This produces a high-fidelity simulation that minimizes the reality gap at its source.
+
+Common approaches include:
+
+1. **Manual measurement**: Weigh links on a scale, measure dimensions with calipers, estimate friction by dragging surfaces.
+2. **Trajectory matching**: Record real-world trajectories under known commands, then optimize simulation parameters to minimize the trajectory error:
+
+$$\xi^* = \arg\min_\xi \sum_{t=0}^{T} \| s_t^{\text{real}} - s_t^{\text{sim}}(\xi) \|^2$$
+
+3. **Bayesian estimation**: Use Bayesian optimization or Markov Chain Monte Carlo methods to estimate parameter distributions rather than point estimates, naturally integrating with domain randomization.
+
+System identification reduces the width of the randomization distribution needed, leading to better-performing policies. However, it requires real-world data collection and cannot capture all aspects of the reality gap.
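+
+The trajectory-matching objective above can be minimized with an off-the-shelf optimizer. The sketch below uses SciPy and assumes logged real-world data plus a `rollout_sim(params, commands)` function standing in for your simulator interface; `load_logged_run` is likewise hypothetical, and the parameter values and bounds are placeholders.
+
+```python
+# Illustrative trajectory-matching system identification.
+# rollout_sim(params, commands) and load_logged_run(...) are placeholders
+# for your simulator interface and hardware logging pipeline.
+import numpy as np
+from scipy.optimize import minimize
+
+# commands: (T, n_joints) actuator commands logged on the real robot
+# real_states: (T, state_dim) joint states logged during the same run
+commands, real_states = load_logged_run("real_rollout_01.npz")
+
+def trajectory_error(params, commands, real_states):
+    """Sum of squared state errors between simulated and real rollouts."""
+    sim_states = rollout_sim(params, commands)   # shape (T, state_dim)
+    return np.sum((sim_states - real_states) ** 2)
+
+# Initial guess from CAD/datasheet values, e.g. [mass, friction, damping]
+x0 = np.array([12.0, 0.6, 0.05])
+bounds = [(8.0, 16.0), (0.2, 1.2), (0.0, 0.2)]
+
+result = minimize(trajectory_error, x0,
+                  args=(commands, real_states),
+                  method="L-BFGS-B", bounds=bounds)
+identified_params = result.x
+```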
+
+### Transfer Learning and Progressive Nets
+
+Transfer learning fine-tunes a policy pre-trained in simulation using a small amount of real-world data. Progressive neural networks extend this idea by freezing the simulation-trained network and adding lateral connections to a new network that adapts to real data, preventing catastrophic forgetting of simulation knowledge.
+
+The procedure is:
+
+1. Train a policy $\pi_{\text{sim}}$ to convergence in simulation.
+2. Freeze the weights of $\pi_{\text{sim}}$.
+3. Initialize a new network $\pi_{\text{real}}$ with lateral connections from $\pi_{\text{sim}}$.
+4. Fine-tune $\pi_{\text{real}}$ on real-world rollouts with a small learning rate.
+
+This approach is particularly useful when the reality gap is too large for zero-shot transfer but collecting large amounts of real data is impractical.
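+
+A minimal PyTorch sketch of the lateral-connection idea for small MLP policies follows. It assumes the frozen simulation policy exposes a `hidden(obs)` method returning its hidden-layer activations; all layer sizes and names are illustrative.
+
+```python
+# Minimal progressive-network sketch: the sim-trained column is frozen and
+# its hidden activations feed the real-world column via a lateral adapter.
+import torch
+import torch.nn as nn
+
+class ProgressiveColumn(nn.Module):
+    def __init__(self, sim_policy, obs_dim, hidden_dim, act_dim):
+        super().__init__()
+        self.sim_policy = sim_policy             # pre-trained sim column
+        for p in self.sim_policy.parameters():   # freeze sim weights
+            p.requires_grad = False
+        self.fc1 = nn.Linear(obs_dim, hidden_dim)
+        self.lateral = nn.Linear(hidden_dim, hidden_dim)  # adapter from sim column
+        self.fc2 = nn.Linear(hidden_dim, act_dim)
+
+    def forward(self, obs):
+        with torch.no_grad():
+            sim_hidden = self.sim_policy.hidden(obs)      # frozen features
+        h = torch.relu(self.fc1(obs) + self.lateral(sim_hidden))
+        return self.fc2(h)
+```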
+
+### Adversarial Domain Adaptation
+
+Adversarial methods learn domain-invariant representations by training a feature extractor whose outputs a domain discriminator cannot reliably classify as sim or real:
+
+1. **Domain-Adversarial Neural Networks (DANN)**: A gradient reversal layer forces the feature extractor to learn representations that are informative for the task but invariant to the domain (sim vs. real). The training objective combines task loss and domain confusion:
+
+$$\mathcal{L} = \mathcal{L}_{\text{task}}(\theta_f, \theta_y) - \lambda \, \mathcal{L}_{\text{domain}}(\theta_f, \theta_d)$$
+
+where $\theta_f$ are feature extractor parameters, $\theta_y$ are task predictor parameters, $\theta_d$ are domain classifier parameters, and $\lambda$ controls the trade-off.
+
+2. **Image-level adaptation with CycleGAN**: CycleGAN translates simulated images to appear realistic (and vice versa) without paired data. A policy can then be trained on "sim-to-real translated" images, which look like real camera feeds but are generated from simulation. This is effective for bridging the visual domain gap while retaining the ability to generate unlimited training data.
+
+```python
+# Pseudocode for CycleGAN-based visual domain adaptation
+# Step 1: Collect unpaired images from sim and real
+sim_images = collect_sim_images(num=10000)
+real_images = collect_real_images(num=500)
+
+# Step 2: Train CycleGAN to translate sim -> real appearance
+cyclegan = CycleGAN()
+cyclegan.train(sim_images, real_images, epochs=200)
+
+# Step 3: During policy training, translate sim renders before feeding to policy
+for episode in range(num_episodes):
+ sim_obs = env.render()
+ # Translate simulated image to real-looking image
+ adapted_obs = cyclegan.sim_to_real(sim_obs)
+ action = policy(adapted_obs)
+ env.step(action)
+```
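+
+For the DANN approach in item 1, the only non-standard component is the gradient reversal layer: it is the identity on the forward pass and multiplies the gradient by $-\lambda$ on the backward pass. A minimal PyTorch sketch:
+
+```python
+# Minimal gradient reversal layer for domain-adversarial training (DANN).
+import torch
+
+class GradientReversal(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, lambd):
+        ctx.lambd = lambd
+        return x.view_as(x)        # identity on the forward pass
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Reverse and scale the gradient flowing into the feature extractor
+        return -ctx.lambd * grad_output, None
+
+def grad_reverse(x, lambd=1.0):
+    return GradientReversal.apply(x, lambd)
+
+# Usage: features -> grad_reverse -> domain classifier
+# domain_logits = domain_classifier(grad_reverse(features, lambd=0.5))
+```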
+
+## Evaluation Methodology
+
+Evaluating sim-to-real transfer requires measuring performance in both domains and analyzing the transfer gap.
+
+### Metrics
+
+| Metric | Description |
+|--------|-------------|
+| **Sim success rate** | Task success rate in the training simulation |
+| **Real success rate** | Task success rate on physical hardware |
+| **Transfer ratio** | Real success rate / Sim success rate |
+| **Sim-to-real gap** | Sim success rate − Real success rate |
+| **Robustness score** | Success rate under perturbations (pushes, payload changes) |
+
+### Evaluation Protocol
+
+1. **Baseline**: Train and evaluate a policy without any randomization or adaptation to establish the naive transfer gap.
+2. **Ablation**: Enable randomization categories one at a time (dynamics only, visual only, both) to measure the contribution of each.
+3. **Range sweep**: Vary the width of randomization ranges. Too narrow fails to cover reality; too wide makes the task too hard to learn.
+4. **Real-world trials**: Run a statistically significant number of real-world trials (typically 20–100 per condition) and report mean success rate with confidence intervals.
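+
+The metrics in the table above are simple to compute from trial counts; the sketch below uses a normal-approximation confidence interval, with illustrative trial numbers.
+
+```python
+# Compute transfer metrics from trial counts (illustrative numbers).
+import math
+
+def success_rate_ci(successes, trials, z=1.96):
+    """Success rate with a 95% normal-approximation confidence interval."""
+    p = successes / trials
+    half_width = z * math.sqrt(p * (1 - p) / trials)
+    return p, half_width
+
+sim_rate, _ = success_rate_ci(successes=92, trials=100)
+real_rate, real_ci = success_rate_ci(successes=34, trials=50)
+
+transfer_ratio = real_rate / sim_rate
+sim_to_real_gap = sim_rate - real_rate
+print(f"Real: {real_rate:.2f} ± {real_ci:.2f}, "
+      f"transfer ratio: {transfer_ratio:.2f}, gap: {sim_to_real_gap:.2f}")
+```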
+
+## Best Practices
+
+### Start Simple, Randomize Incrementally
+
+Begin by training in a non-randomized simulator with identified parameters. Once the policy works in that setting, add randomization categories one at a time, verifying that the policy still learns. This helps isolate which randomizations are necessary and which degrade performance.
+
+### Validate with Real-World Data
+
+Collect a small set of real-world trajectories early in the project. Use them to:
+- Calibrate simulator parameters via system identification.
+- Set informed randomization ranges rather than guessing.
+- Evaluate candidate policies before committing to extensive real-world testing.
+
+### Automated Domain Randomization (ADR)
+
+Manually specifying randomization ranges is tedious and suboptimal. Automated domain randomization, introduced by OpenAI for the Rubik's cube project, adapts ranges during training based on policy performance:
+
+1. Start with narrow randomization ranges centered on nominal values.
+2. If the policy achieves a target success rate, widen the ranges.
+3. If the policy performance drops below a threshold, narrow the ranges.
+
+This produces the widest randomization distribution that the policy can still handle, automatically trading off robustness and performance.
+
+$$\Delta \xi_i \leftarrow \begin{cases} \Delta \xi_i + \delta & \text{if success rate} > \tau_{\text{upper}} \\ \Delta \xi_i - \delta & \text{if success rate} < \tau_{\text{lower}} \end{cases}$$
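+
+A minimal sketch of this range-update rule, with placeholder thresholds and step size:
+
+```python
+# Illustrative ADR range update: widen a parameter's randomization range
+# when the policy succeeds, narrow it when performance drops.
+# Thresholds, step size, and the friction example are placeholders.
+def update_adr_range(half_width, success_rate,
+                     tau_upper=0.8, tau_lower=0.5,
+                     delta=0.02, min_width=0.0, max_width=1.0):
+    if success_rate > tau_upper:
+        half_width += delta       # policy is doing well: expand the range
+    elif success_rate < tau_lower:
+        half_width -= delta       # policy is struggling: contract the range
+    return min(max(half_width, min_width), max_width)
+
+# Example: adapt the friction randomization half-width around a nominal 0.7
+friction_half_width = 0.1
+friction_half_width = update_adr_range(friction_half_width, success_rate=0.85)
+friction_range = (0.7 - friction_half_width, 0.7 + friction_half_width)
+```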
+
+### When NOT to Use Sim-to-Real
+
+Sim-to-real transfer is not always the best approach:
+
+- **Sufficient real data is available**: If you can collect enough real-world demonstrations efficiently (e.g., via teleoperation), imitation learning on real data avoids the sim-to-real gap entirely.
+- **The task is contact-rich and hard to simulate**: Tasks like deformable object manipulation, fluid handling, or fine-grained tactile sensing may have reality gaps that are too large to bridge.
+- **The real environment is simple**: For well-structured environments (e.g., a fixed industrial cell), it may be faster to build a high-fidelity digital twin than to randomize broadly.
+
+## Case Studies
+
+### OpenAI Rubik's Cube (Dactyl)
+
+OpenAI trained a Shadow Dexterous Hand to solve a Rubik's cube entirely in simulation using massive domain randomization and ADR. The policy was trained across billions of episodes with randomized hand geometry, cube dimensions, friction coefficients, actuator gains, and visual properties. The ADR procedure automatically expanded 37 randomization parameters over the course of training. The resulting policy transferred zero-shot to the real robot, demonstrating that sufficient randomization can bridge even complex dexterous manipulation gaps.
+
+### ANYmal Quadruped Locomotion
+
+ETH Zurich and NVIDIA trained locomotion policies for the ANYmal quadruped robot in Isaac Gym with dynamics randomization including mass, friction, motor strength, and added latency. The policy was trained using PPO with 4096 parallel environments, achieving training times of under 20 minutes on a single GPU. The resulting controller transferred to the real ANYmal, enabling robust locomotion over rough terrain, stairs, and slippery surfaces without any real-world fine-tuning.
+
+### Manipulation Policies with Diffusion Models
+
+Recent work combines diffusion policy architectures with sim-to-real transfer for manipulation tasks. Policies are pre-trained in simulation with visual and dynamics randomization, then optionally fine-tuned with a small number of real-world demonstrations. The multi-modal nature of diffusion policies makes them naturally robust to distributional shift, complementing the robustness provided by domain randomization.
+
+## Summary
+
+Sim-to-real transfer enables training robotic policies in the safety and efficiency of simulation while deploying them on physical hardware. The reality gap — spanning visual appearance, dynamics, sensor noise, and actuator behavior — is the central challenge. Domain randomization addresses this by training across a distribution of simulated environments broad enough to include reality as a plausible sample. Domain adaptation takes the complementary approach of aligning simulated and real domains through system identification, transfer learning, or adversarial methods. In practice, the most successful deployments combine both: use system identification to build an accurate baseline simulator, apply domain randomization to handle residual uncertainty, and optionally fine-tune on real data. Start with narrow randomization ranges, validate against real-world data early, and consider automated domain randomization to systematically expand robustness.
+
+## See Also:
+- [NVIDIA Isaac Sim Setup and ROS2 Workflow](/wiki/simulation/simulation-isaacsim-setup/)
+- [Building a Light-Weight Custom Simulator](/wiki/simulation/Building-a-Light-Weight-Custom-Simulator/)
+- [Introduction to Reinforcement Learning](/wiki/machine-learning/intro-to-rl/)
+- [Introduction to Diffusion Models and Diffusion Policy](/wiki/machine-learning/intro-to-diffusion/)
+- [Gazebo Simulation](/wiki/tools/gazebo-simulation/)
+- [Imitation Learning With a Focus on Humanoids](/wiki/machine-learning/imitation-learning/)
+
+## Further Reading
+- [Isaac Gym: High Performance GPU-Based Physics Simulation for Robot Learning](https://developer.nvidia.com/isaac-gym) — NVIDIA's GPU-accelerated physics simulator purpose-built for RL with thousands of parallel environments.
+- [Sim-to-Real Robot Learning from Pixels with Progressive Nets (DeepMind Blog)](https://www.deepmind.com/blog/sim-to-real-robot-learning-from-pixels-with-progressive-nets) — Overview of progressive neural networks for sim-to-real visual policy transfer.
+- [OpenAI Solving Rubik's Cube with a Robot Hand](https://openai.com/index/solving-rubiks-cube/) — Detailed account of the Dactyl project and automated domain randomization.
+- [Domain Randomization for Sim2Real Transfer (Lilian Weng's Blog)](https://lilianweng.github.io/posts/2019-05-05-domain-randomization/) — Comprehensive survey of domain randomization techniques with references.
+
+## References
+
+Akkaya, I., Andrychowicz, M., Chociej, M., Litwin, M., McGrew, B., Petron, A., Paino, A., Plappert, M., Powell, G., Ribas, R., & Schneider, J. (2019). Solving Rubik's cube with a robot hand. *arXiv preprint arXiv:1910.07113*.
+
+Ganin, Y., & Lempitsky, V. (2015). Unsupervised domain adaptation by backpropagation. *Proceedings of the 32nd International Conference on Machine Learning (ICML)*, 1180–1189.
+
+Hwangbo, J., Lee, J., Dosovitskiy, A., Bellicoso, D., Tsounis, V., Koltun, V., & Hutter, M. (2019). Learning agile and dynamic motor skills for legged robots. *Science Robotics*, 4(26), eaau5872.
+
+Makoviychuk, V., Wawrzyniak, L., Guo, Y., Lu, M., Storey, K., Macklin, M., Hoeller, D., Rudin, N., Tremblay, J., Murrell, T., Petrenko, O., & State, G. (2021). Isaac Gym: High performance GPU-based physics simulation for robot learning. *arXiv preprint arXiv:2108.10470*.
+
+Rusu, A. A., Vecerik, M., Rothörl, T., Heess, N., Pascanu, R., & Hadsell, R. (2017). Sim-to-real robot learning from pixels with progressive nets. *Proceedings of the 1st Conference on Robot Learning (CoRL)*, 262–270.
+
+Tobin, J., Fong, R., Ray, A., Schneider, J., Zaremba, W., & Abbeel, P. (2017). Domain randomization for transferring deep neural networks from simulation to the real world. *Proceedings of the IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)*, 23–30.
+
+Zhu, J., Park, T., Isola, P., & Efros, A. A. (2017). Unpaired image-to-image translation using cycle-consistent adversarial networks. *Proceedings of the IEEE International Conference on Computer Vision (ICCV)*, 2223–2232.