
Commit

fix: input size setting (#126)
* feat: remove default input_shape for LaneDetectionParser

* feat: add input size extraction to ParserGenerator

* feat: add automatic input_size detection to LaneDetectionParser

* feat: add automatic input_size detection to YuNetParser

* fix: BaseParser build method docstring

* fix: naming of input_shape to input_size

* fix: pre-commit

* fix: YuNet multiple inputs error description

* fix: typos

* feat: add input_shape and layout to head_config and remove input_size from parser arguments

* fix: pre-commit

* fix: head_config model inputs nesting
jkbmrz authored Nov 13, 2024
1 parent 37a2d92 commit 869ed96
Showing 5 changed files with 76 additions and 43 deletions.
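The thrust of the change: instead of taking input_size (or a defaulted input_shape) as a parser argument, the parsers now derive a (width, height) tuple from the model input's shape and layout recorded in the NN archive. A minimal sketch of that derivation — the helper name is illustrative, not part of the diff:

```python
from typing import List, Tuple


def size_from_input(shape: List[int], layout: str) -> Tuple[int, int]:
    """Derive (width, height) from a model input shape and its layout."""
    if layout == "NHWC":  # shape = [N, H, W, C]
        return (shape[2], shape[1])
    elif layout == "NCHW":  # shape = [N, C, H, W]
        return (shape[3], shape[2])
    raise ValueError(f"Input layout {layout} not supported for input_size extraction.")


# The old LaneDetectionParser default of (288, 800) was (height, width);
# an NCHW input of height 288 and width 800 now yields an explicit (width, height) tuple.
assert size_from_input([1, 3, 288, 800], "NCHW") == (800, 288)
```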
2 changes: 2 additions & 0 deletions depthai_nodes/ml/parsers/base_parser.py
@@ -57,6 +57,8 @@ def build(self, head_config: Dict[str, Any]) -> "BaseParser":
@param head_config: A dictionary containing configuration details relevant to
the parser, including parameters and settings required for output parsing.
@type head_config: Dict[str, Any]
+ @return: The parser object with the head configuration set.
+ @rtype: BaseParser
"""
pass

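The return contract documented here is what lets parsers be chained off pipeline.create(...). A toy stand-in (not the real BaseParser, which is a DepthAI host node) showing the pattern a subclass's build() follows with the new model_inputs entry; field names beyond those visible in this diff are assumptions:

```python
from typing import Any, Dict, List, Optional, Tuple


class ExampleParser:
    """Illustrative stand-in for a BaseParser subclass."""

    def __init__(self) -> None:
        self.output_layer_names: List[str] = []
        self.input_size: Optional[Tuple[int, int]] = None

    def build(self, head_config: Dict[str, Any]) -> "ExampleParser":
        # Configure the parser from the head config and return it so the call
        # can be chained, e.g. pipeline.create(Parser).build(head_config).
        self.output_layer_names = head_config.get("outputs", [])
        shape = head_config["model_inputs"][0]["shape"]  # e.g. [N, C, H, W]
        self.input_size = (shape[3], shape[2])            # (width, height) for NCHW
        return self


head_config = {
    "outputs": ["output"],
    "model_inputs": [{"shape": [1, 3, 288, 800], "layout": "NCHW"}],
}
parser = ExampleParser().build(head_config)
assert parser.input_size == (800, 288)
```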
54 changes: 35 additions & 19 deletions depthai_nodes/ml/parsers/lane_detection.py
@@ -23,8 +23,8 @@ class LaneDetectionParser(BaseParser):
Griding number.
cls_num_per_lane : int
Number of points per lane.
- input_shape : Tuple[int, int]
- Input shape.
+ input_size : Tuple[int, int]
+ Input size (width,height).
Output Message/s
----------------
@@ -44,7 +44,7 @@ def __init__(
row_anchors: List[int] = None,
griding_num: int = None,
cls_num_per_lane: int = None,
- input_shape: Tuple[int, int] = (288, 800),
+ input_size: Tuple[int, int] = None,
) -> None:
"""Initializes the lane detection parser node.
@@ -56,16 +56,16 @@ def __init__(
@type griding_num: int
@param cls_num_per_lane: Number of points per lane.
@type cls_num_per_lane: int
- @param input_shape: Input shape.
- @type input_shape: Tuple[int, int]
+ @param input_size: Input size (width,height).
+ @type input_size: Tuple[int, int]
"""
super().__init__()
self.output_layer_name = output_layer_name

self.row_anchors = row_anchors
self.griding_num = griding_num
self.cls_num_per_lane = cls_num_per_lane
- self.input_shape = input_shape
+ self.input_size = input_size

def setOutputLayerName(self, output_layer_name: str) -> None:
"""Set the output layer name for the lane detection model.
@@ -109,19 +109,19 @@ def setClsNumPerLane(self, cls_num_per_lane: int) -> None:
raise ValueError("Number of points per lane must be an integer.")
self.cls_num_per_lane = cls_num_per_lane

- def setInputShape(self, input_shape: Tuple[int, int]) -> None:
- """Set the input shape for the lane detection model.
+ def setInputSize(self, input_size: Tuple[int, int]) -> None:
+ """Set the input size for the lane detection model.
- @param input_shape: Input shape.
- @type input_shape: Tuple[int, int]
+ @param input_size: Input size (width,height).
+ @type input_size: Tuple[int, int]
"""
- if not isinstance(input_shape, tuple):
- raise ValueError("Input shape must be a tuple.")
- if len(input_shape) != 2:
- raise ValueError("Input shape must be a tuple of two integers.")
- if not all(isinstance(size, int) for size in input_shape):
- raise ValueError("Input shape must be a tuple of integers.")
- self.input_shape = input_shape
+ if not isinstance(input_size, tuple):
+ raise ValueError("Input size must be a tuple.")
+ if len(input_size) != 2:
+ raise ValueError("Input size must be a tuple of two integers.")
+ if not all(isinstance(size, int) for size in input_size):
+ raise ValueError("Input size must be a tuple of integers.")
+ self.input_size = input_size

def build(
self,
@@ -147,6 +147,22 @@ def build(
"cls_num_per_lane", self.cls_num_per_lane
)

+ inputs = head_config["model_inputs"]
+ if len(inputs) != 1:
+ raise ValueError(
+ f"Only one input supported for LaneDetectionParser, got {len(inputs)} inputs."
+ )
+ self.input_shape = inputs[0].get("shape")
+ self.layout = inputs[0].get("layout")
+ if self.layout == "NHWC":
+ self.input_size = (self.input_shape[2], self.input_shape[1])
+ elif self.layout == "NCHW":
+ self.input_size = (self.input_shape[3], self.input_shape[2])
+ else:
+ raise ValueError(
+ f"Input layout {self.layout} not supported for input_size extraction."
+ )

return self

def run(self):
@@ -180,8 +196,8 @@ def run(self):
anchors=self.row_anchors,
griding_num=self.griding_num,
cls_num_per_lane=self.cls_num_per_lane,
- input_width=self.input_shape[1],
- input_height=self.input_shape[0],
+ input_width=self.input_size[0],
+ input_height=self.input_size[1],
y=y,
)

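Note the ordering change in run(): input_width is now read from input_size[0] and input_height from input_size[1], whereas the old input_shape was indexed as (height, width). A small illustration of the (width, height) convention, with made-up lane points:

```python
import numpy as np

input_size = (800, 288)  # (width, height), as the parser now stores it

# Hypothetical lane points in pixel coordinates for an 800x288 input.
points_px = np.array([[400.0, 144.0], [800.0, 288.0]])

# Normalize x by width (index 0) and y by height (index 1).
points_norm = points_px / np.array(input_size)
assert np.allclose(points_norm, [[0.5, 0.5], [1.0, 1.0]])
```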
26 changes: 13 additions & 13 deletions depthai_nodes/ml/parsers/utils/yunet.py
@@ -20,23 +20,23 @@ def manual_product(*args):


def generate_anchors(
- input_shape: Tuple[int, int],
+ input_size: Tuple[int, int],
min_sizes: List[List[int]] = None,
strides: List[int] = None,
):
"""Generate a set of default bounding boxes, known as anchors.
The code is taken from https://github.com/Kazuhito00/YuNet-ONNX-TFLite-Sample/tree/main
- @param input_shape: A tuple representing the width and height of the input image.
- @type input_shape: Tuple[int, int]
+ @param input_size: A tuple representing the width and height of the input image.
+ @type input_size: Tuple[int, int]
@param min_sizes: A list of lists, where each inner list contains the minimum sizes of the anchors for different feature maps.
@type min_sizes List[List[int]]
@param strides: Strides for each feature map layer.
@type strides: List[int]
@return: Anchors.
@rtype: np.ndarray
"""
- w, h = input_shape
+ w, h = input_size

if min_sizes is None:
min_sizes = [[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]]
@@ -70,7 +70,7 @@ def generate_anchors(


def decode_detections(
- input_shape: Tuple[int, int],
+ input_size: Tuple[int, int],
loc: np.ndarray,
conf: np.ndarray,
iou: np.ndarray,
@@ -80,8 +80,8 @@
Decodes the output of an object detection model by converting the model's predictions (localization, confidence, and IoU scores) into bounding boxes, keypoints, and scores.
The code is taken from https://github.com/Kazuhito00/YuNet-ONNX-TFLite-Sample/tree/main
- @param input_shape: The shape of the input image (height, width).
- @type input_shape: tuple
+ @param input_size: The size of the input image (height, width).
+ @type input_size: tuple
@param loc: The predicted locations (or offsets) of the bounding boxes.
@type loc: np.ndarray
@param conf: The predicted class confidence scores.
@@ -98,12 +98,12 @@
"""

- w, h = input_shape
+ w, h = input_size

if variance is None:
variance = [0.1, 0.2]

- anchors = generate_anchors(input_shape)
+ anchors = generate_anchors(input_size)

# Get scores
cls_scores = conf[:, 1]
@@ -168,7 +168,7 @@ def format_detections(
bboxes: np.ndarray,
keypoints: np.ndarray,
scores: np.ndarray,
- input_shape: Tuple[int, int],
+ input_size: Tuple[int, int],
):
"""Format detections into a list of dictionaries.
@@ -178,16 +178,16 @@ def format_detections(
@type np.ndarray
@param scores: A numpy array of shape (N,) containing the scores.
@type np.ndarray
- @param input_shape: A tuple representing the height and width of the input image.
- @type input_shape: tuple
+ @param input_size: A tuple representing the width and height of the input image.
+ @type input_size: tuple
@return: A tuple of bboxes, keypoints, and scores.
- bboxes: NumPy array of shape (N, 4) containing the decoded bounding boxes in the format [x_min, y_min, width, height].
- keypoints: A NumPy array of shape (N, 10) containing the decoded keypoint coordinates for each anchor.
- scores: A NumPy array of shape (N, 1) containing the combined scores for each anchor.
@rtype: Tuple[np.ndarray, np.ndarray, np.ndarray]
"""

- w, h = input_shape
+ w, h = input_size

bboxes = normalize_bboxes(bboxes, height=h, width=w)

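format_detections unpacks the tuple as w, h = input_size and normalizes boxes against image width and height. The normalize_bboxes implementation isn't shown in this diff; the snippet below only illustrates the convention the rename makes explicit, with made-up numbers:

```python
import numpy as np

input_size = (640, 480)  # (width, height)
w, h = input_size

# One pixel-space box in [x_min, y_min, width, height].
bboxes_px = np.array([[64.0, 48.0, 320.0, 240.0]])

# x-coordinates and widths scale by image width, y-coordinates and heights by image height.
bboxes_norm = bboxes_px / np.array([w, h, w, h])
assert np.allclose(bboxes_norm, [[0.1, 0.1, 0.5, 0.5]])
```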
28 changes: 19 additions & 9 deletions depthai_nodes/ml/parsers/yunet.py
@@ -22,7 +22,7 @@ class YuNetParser(DetectionParser):
max_det : int
Maximum number of detections to keep.
input_size : Tuple[int, int]
- Input size.
+ Input size (width, height).
loc_output_layer_name: str
Name of the output layer containing the location predictions.
conf_output_layer_name: str
@@ -55,7 +55,7 @@ def __init__(
@type iou_threshold: float
@param max_det: Maximum number of detections to keep.
@type max_det: int
- @param input_size: Input shape of the model (width, height).
+ @param input_size: Input size of the model (width, height).
@type input_size: Tuple[int, int]
@param loc_output_layer_name: Output layer name for the location predictions.
@type loc_output_layer_name: str
@@ -131,11 +131,6 @@ def build(

super().build(head_config)
output_layers = head_config.get("outputs", [])
- self.input_size = head_config.get("input_size", self.input_size)
- if len(output_layers) != 3:
- raise ValueError(
- f"YuNetParser expects exactly 3 output layers, got {output_layers} layers."
- )
for output_layer in output_layers:
if "loc" in output_layer:
self.loc_output_layer_name = output_layer
@@ -147,6 +142,21 @@
raise ValueError(
f"Unexpected output layer {output_layer}. Only loc, conf, and iou output layers are supported."
)
+ inputs = head_config["model_inputs"]
+ if len(inputs) != 1:
+ raise ValueError(
+ f"Only one input supported for YuNetParser, got {len(inputs)} inputs."
+ )
+ self.input_shape = inputs[0].get("shape")
+ self.layout = inputs[0].get("layout")
+ if self.layout == "NHWC":
+ self.input_size = (self.input_shape[2], self.input_shape[1])
+ elif self.layout == "NCHW":
+ self.input_size = (self.input_shape[3], self.input_shape[2])
+ else:
+ raise ValueError(
+ f"Input layout {self.layout} not supported for input_size extraction."
+ )

return self

@@ -242,7 +252,7 @@ def run(self):

# decode detections
bboxes, keypoints, scores = decode_detections(
- input_shape=self.input_size,
+ input_size=self.input_size,
loc=loc,
conf=conf,
iou=iou,
@@ -261,7 +271,7 @@
bboxes=bboxes,
keypoints=keypoints,
scores=scores,
- input_shape=self.input_size,
+ input_size=self.input_size,
)

# run nms
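YuNetParser.build() now routes the three output layers (loc, conf, iou) by substring match and takes input_size from the single model input, rejecting archives with more than one input. A standalone mirror of the routing step, for illustration only:

```python
from typing import Dict, List


def route_output_layers(output_layers: List[str]) -> Dict[str, str]:
    """Mirror of the substring-based routing in YuNetParser.build()."""
    routed: Dict[str, str] = {}
    for name in output_layers:
        if "loc" in name:
            routed["loc"] = name
        elif "conf" in name:
            routed["conf"] = name
        elif "iou" in name:
            routed["iou"] = name
        else:
            raise ValueError(
                f"Unexpected output layer {name}. Only loc, conf, and iou "
                "output layers are supported."
            )
    return routed


assert route_output_layers(["loc_8", "conf_8", "iou_8"]) == {
    "loc": "loc_8",
    "conf": "conf_8",
    "iou": "iou_8",
}
```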
9 changes: 7 additions & 2 deletions depthai_nodes/parser_generator.py
@@ -59,8 +59,13 @@ def build(self, nn_archive: dai.NNArchive, head_index: int = None) -> Dict:
f"Parser {parser_name} does not inherit from BaseParser class."
)

- head = decode_head(head)
- parsers[index] = pipeline.create(parser).build(head)
+ head_config = decode_head(head)
+ head_config["model_inputs"] = []
+ for input in nn_archive.getConfig().model.inputs:
+ head_config["model_inputs"].append(
+ {"shape": input.shape, "layout": input.layout}
+ )
+ parsers[index] = pipeline.create(parser).build(head_config)

return parsers

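ParserGenerator is what assembles head_config["model_inputs"] from the archive before calling each parser's build(). A hedged sketch of the shape of that data without a real dai.NNArchive — the stand-in class below is illustrative only:

```python
from typing import List


class FakeModelInput:
    """Stand-in for an nn_archive.getConfig().model.inputs entry (illustrative)."""

    def __init__(self, shape: List[int], layout: str) -> None:
        self.shape = shape
        self.layout = layout


model_inputs = [FakeModelInput([1, 3, 288, 800], "NCHW")]

head_config = {"outputs": ["output"]}  # stand-in for whatever decode_head(head) returned
head_config["model_inputs"] = [
    {"shape": inp.shape, "layout": inp.layout} for inp in model_inputs
]

print(head_config["model_inputs"])
# [{'shape': [1, 3, 288, 800], 'layout': 'NCHW'}]
```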
