Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure; Add Parsers, Util Functions and Message Creators #4

Merged
merged 42 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
b08d354
add util function for image message creation
jkbmrz May 29, 2024
f45074d
add util function for detections message creation
jkbmrz May 29, 2024
828cf60
populate message creation __init__ file
jkbmrz May 29, 2024
a88bddc
populate utils __init__ file
jkbmrz May 29, 2024
328a339
add util function for reverting normalization
jkbmrz May 29, 2024
eb2a411
adjust dncnn3 postprocessing node with new utils functions
jkbmrz May 29, 2024
940568e
adjust zero_dce postprocessing node with new utils functions
jkbmrz May 29, 2024
c96a563
add util function for monocular depth message creation
jkbmrz Jun 6, 2024
4c5dc99
adjust util function for detections message creation
jkbmrz Jun 6, 2024
80598a3
add util function for decoding detections
jkbmrz Jun 6, 2024
e9c4bc6
adjust yunet postprocessing node with new utils functions
jkbmrz Jun 6, 2024
0f29aa5
adjust depth_anything postprocessing node with new utils functions
jkbmrz Jun 6, 2024
71e8cca
fix imports
jkbmrz Jun 6, 2024
c8769e1
adjust utils __init__.py
jkbmrz Jun 6, 2024
ca9db68
fix imports
jkbmrz Jun 6, 2024
cc9b070
remove nms.py from utils
jkbmrz Jun 6, 2024
e5113f7
Merge branch 'main' into feature/add_node_utils
jkbmrz Jun 6, 2024
fe72fc4
make a generic monocular depth parser
jkbmrz Jun 6, 2024
9531ac1
add generic monocular depth parser
jkbmrz Jun 6, 2024
0b126c8
fix imports
jkbmrz Jun 6, 2024
7f23a0a
add generic image-to-image postprocessing node
jkbmrz Jun 19, 2024
6ae043d
fix naming
jkbmrz Jun 19, 2024
a090353
feat: add normalize arg to unnormalizate function
jkbmrz Jun 19, 2024
f965988
refactor: move image format detection to message creation step
jkbmrz Jun 19, 2024
92f2ea4
feature: add automatic detection of strides and input_size for YunetP…
jkbmrz Jun 19, 2024
4813db6
refactor: move automatic detection of number of cols and rows to deco…
jkbmrz Jun 19, 2024
2e455dd
refactor: remove unneeded np.expand of cls dim
jkbmrz Jun 19, 2024
5099601
refactor: vectorize detection decoding
jkbmrz Jun 19, 2024
3939204
test: check bbox format during detection message creation
jkbmrz Jun 19, 2024
edbb000
fix: correct bbox format check
jkbmrz Jun 19, 2024
17ce753
feature: merge the old depth message with monocular depth message cre…
jkbmrz Jun 19, 2024
fe1778a
feature: add depth_type setters to MonocularDepthParser
jkbmrz Jun 19, 2024
883f324
feature: allow empty detections in detections message creation
jkbmrz Jun 19, 2024
1fb7272
fix: remove old depth message imports
jkbmrz Jun 19, 2024
c1d15c7
refactor: rename ImageOutputParser file
jkbmrz Jun 20, 2024
5748d18
fix: ImageOutputParser output shape check to allow non-batched outputs
jkbmrz Jun 20, 2024
2c9970f
refactor: vectorize detection dicts construction
jkbmrz Jun 20, 2024
083d0f2
refactor: move and rename message creation functions dir
jkbmrz Jun 20, 2024
96064f2
fix: various imports
jkbmrz Jun 20, 2024
7b6ae31
refactor: rename (monocular) depth message
jkbmrz Jun 20, 2024
99f2803
style: remove empty file
jkbmrz Jun 20, 2024
0eaa547
refactor: rename creation_functions to creators
jkbmrz Jun 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions ml/postprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from .zero_dce import ZeroDCEParser
from .dncnn3 import DnCNN3Parser
from .depth_anything import DepthAnythingParser
from .image_to_image import ImageOutputParser
from .monocular_depth import MonocularDepthParser
from .yunet import YuNetParser
from .mediapipe_hand_detection import MPHandDetectionParser
from .mediapipe_hand_landmarker import MPHandLandmarkParser
from .scrfd import SCRFDParser
from .segmentation import SegmentationParser

__all__ = [
'ZeroDCEParser',
'DnCNN3Parser',
'DepthAnythingParser',
'ImageOutputParser',
'MonocularDepthParser',
'YuNetParser',
'MPHandDetectionParser',
'MPHandLandmarkParser',
Expand Down
46 changes: 0 additions & 46 deletions ml/postprocessing/depth_anything.py

This file was deleted.

41 changes: 0 additions & 41 deletions ml/postprocessing/dncnn3.py

This file was deleted.

52 changes: 52 additions & 0 deletions ml/postprocessing/image_to_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import depthai as dai

from .utils import unnormalize_image
from .utils.message_creation import create_image_message


class ImageOutputParser(dai.node.ThreadedHostNode):
    """Postprocessing node for image-to-image models (e.g. DnCNN3, zero-dce).

    Reads dai.NNData messages with exactly one output layer and emits the
    decoded image as a dai.ImgFrame.
    """

    def __init__(self, output_is_bgr=False):
        """
        Args:
            output_is_bgr (bool, optional): Set to True if the model output
                channels are BGR-ordered. Defaults to False.
        """
        dai.node.ThreadedHostNode.__init__(self)
        self.input = dai.Node.Input(self)
        self.out = dai.Node.Output(self)

        self.output_is_bgr = output_is_bgr

    def setBGROutput(self):
        """Declare that the model output is BGR-ordered."""
        self.output_is_bgr = True

    def run(self):
        """
        Postprocessing logic for image-to-image models (e.g. DnCNN3, zero-dce etc.).

        Returns:
            dai.ImgFrame: uint8, grayscale HW / colorscale HWC BGR image.
        """

        while self.isRunning():

            try:
                output: dai.NNData = self.input.get()
            except dai.MessageQueue.QueueException:
                break  # Pipeline was stopped

            output_layer_names = output.getAllLayerNames()
            if len(output_layer_names) != 1:
                raise ValueError(
                    f"Expected 1 output layer, got {len(output_layer_names)}."
                )
            output = output.getTensor(output_layer_names[0])

            # Accept both batched (e.g. NHWC) and non-batched outputs.
            if len(output.shape) == 4:
                image = output[0]  # strip the batch dimension
            elif len(output.shape) == 3:
                image = output
            else:
                # BUGFIX: the original message read "Unexpected 4-dimensional
                # output" although 4 dimensions were the expected case.
                raise ValueError(
                    f"Expected 3- or 4-dimensional output, got {len(output.shape)}-dimensional.",
                )

            image = unnormalize_image(image)

            image_message = create_image_message(
                image=image,
                is_bgr=self.output_is_bgr,
            )

            self.out.send(image_message)
48 changes: 48 additions & 0 deletions ml/postprocessing/monocular_depth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import depthai as dai

from .utils.message_creation import create_monocular_depth_message


class MonocularDepthParser(dai.node.ThreadedHostNode):
    """Postprocessing node for monocular depth models (e.g. Depth Anything).

    Consumes dai.NNData with a single output layer and emits the decoded
    depth map as a dai.ImgFrame message.
    """

    def __init__(self, depth_type="relative"):
        """
        Args:
            depth_type (str, optional): Either "relative" or "metric".
                Defaults to "relative".
        """
        dai.node.ThreadedHostNode.__init__(self)
        self.input = dai.Node.Input(self)
        self.out = dai.Node.Output(self)

        self.depth_type = depth_type

    def setRelativeDepthType(self):
        """Mark the produced depth maps as relative depth."""
        self.depth_type = "relative"

    def setMetricDepthType(self):
        """Mark the produced depth maps as metric depth."""
        self.depth_type = "metric"

    def run(self):
        """
        Postprocessing logic for a model with monocular depth output (e.g. Depth Anything model).

        Returns:
            dai.ImgFrame: uint16, HW depth map.
        """

        while self.isRunning():
            try:
                nn_data: dai.NNData = self.input.get()
            except dai.MessageQueue.QueueException:
                # Pipeline was stopped.
                break

            layer_names = nn_data.getAllLayerNames()
            if len(layer_names) != 1:
                raise ValueError(
                    f"Expected 1 output layer, got {len(layer_names)}."
                )

            # Single-layer tensor; drop the batch dimension.
            depth_map = nn_data.getTensor(layer_names[0])[0]

            message = create_monocular_depth_message(
                depth_map=depth_map,
                depth_type=self.depth_type,
            )
            self.out.send(message)
2 changes: 2 additions & 0 deletions ml/postprocessing/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .denormalize import unnormalize_image
from .decode_detections import decode_detections
89 changes: 89 additions & 0 deletions ml/postprocessing/utils/decode_detections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import time
from typing import Any, Dict, List, Tuple

import numpy as np


def decode_detections(
    input_size: Tuple[int, int],
    stride: int,
    score_threshold: float,
    cls: np.ndarray,
    obj: np.ndarray,
    bbox: np.ndarray,
    kps: np.ndarray,
) -> List[Dict[str, Any]]:
    """
    Decode the detections from neural network output tensors.

    Args:
        input_size (Tuple[int, int]): The input size of the model that produced the detections, (width, height).
        stride (int): The stride used in the detection grid.
        score_threshold (float): Minimum score threshold for a detection to be considered valid.
        cls (np.ndarray): 2D array of class scores for each grid cell, shape (grid_size, num_classes).
        obj (np.ndarray): 1D array of objectness scores for each grid cell, shape (grid_size,).
        bbox (np.ndarray): 2D array of bounding box predictions, shape (grid_size, 4).
        kps (np.ndarray): 2D array of keypoint predictions, shape (grid_size, num_keypoints * 2).

    Returns:
        List[Dict[str, Any]]: A list of detections, where each detection is a dictionary containing:
            - "bbox": [x1, y1, width, height] (relative bounding box coordinates)
            - "label": int (class label)
            - "keypoints": List[Tuple[float, float]] (relative keypoint coordinates)
            - "score": float (detection score)
    """

    input_width, input_height = input_size
    # Grid dimensions are the input resolution downscaled by the stride.
    cols = int(input_width / stride)
    rows = int(input_height / stride)

    # Row/column index of each grid cell and its flat index into the tensors.
    r, c = np.meshgrid(np.arange(rows), np.arange(cols), indexing='ij')
    idx = r * cols + c

    # Decode scores: geometric mean of the best class score and objectness.
    cls_scores = np.clip(cls[idx], 0, 1)
    obj_scores = np.clip(obj[idx], 0, 1)
    max_cls_scores = np.max(cls_scores, axis=-1)
    scores = np.sqrt(max_cls_scores * obj_scores)

    # Get the labels with the highest score
    labels = np.argmax(cls_scores, axis=-1)

    # Decode bounding boxes: center offsets are relative to the grid cell,
    # width/height are exponentiated, all scaled back by the stride.
    cx = (c + bbox[idx, 0]) * stride
    cy = (r + bbox[idx, 1]) * stride
    w = np.exp(bbox[idx, 2]) * stride
    h = np.exp(bbox[idx, 3]) * stride
    x1 = cx - w / 2
    y1 = cy - h / 2

    # Decode keypoints to coordinates relative to the input size.
    lx = (kps[idx, ::2] + c[:, :, None]) * stride
    ly = (kps[idx, 1::2] + r[:, :, None]) * stride
    keypoints = np.stack((lx / input_width, ly / input_height), axis=-1)

    # Keep only grid cells above the score threshold. np.argwhere yields
    # (i, j) pairs in row-major order, matching a nested rows/cols loop.
    detections = []
    for i, j in np.argwhere(scores > score_threshold):
        detections.append(
            {
                "bbox": [
                    x1[i, j] / input_width,
                    y1[i, j] / input_height,
                    w[i, j] / input_width,
                    h[i, j] / input_height,
                ],
                "label": int(labels[i, j]),
                "keypoints": [(x, y) for x, y in keypoints[i, j]],
                "score": float(scores[i, j]),
            }
        )

    return detections
26 changes: 26 additions & 0 deletions ml/postprocessing/utils/denormalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np


def unnormalize_image(image, normalize=True):
    """
    Un-normalize an image tensor by scaling it to the [0, 255] range.

    Args:
        image (np.ndarray): The normalized image tensor of shape (H, W, C) or (C, H, W).
        normalize (bool, optional): Whether to min-max normalize the image tensor
            to [0, 1] before scaling. Defaults to True.

    Returns:
        np.ndarray: The un-normalized uint8 image.
    """
    # Normalize the image tensor to the range [0, 1]
    if normalize:
        min_val = image.min()
        value_range = image.max() - min_val
        # BUGFIX: guard against division by zero for constant-valued images,
        # which previously produced NaNs; such images now map to all zeros.
        if value_range > 0:
            image = (image - min_val) / value_range
        else:
            image = image - min_val

    # Scale to [0, 255] and clip the values to be in the proper range
    image = image * 255.0
    image = np.clip(image, 0, 255)

    # Convert to uint8
    image = image.astype(np.uint8)

    return image
8 changes: 5 additions & 3 deletions ml/postprocessing/utils/message_creation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .depth import create_depth_message
from .image import create_image_message
from .segmentation import create_segmentation_message
from .keypoints import create_hand_keypoints_message
from .detection import create_detection_message
from .monocular_depth import create_monocular_depth_message

__all__ = [
"create_depth_message",
"create_image_message",
"create_segmentation_message",
"create_hand_keypoints_message",
"create_detection_message",
]
"create_monocular_depth_message",
]
28 changes: 0 additions & 28 deletions ml/postprocessing/utils/message_creation/depth.py

This file was deleted.

Loading