diff --git a/depthai_nodes/ml/parsers/__init__.py b/depthai_nodes/ml/parsers/__init__.py
index 61982eca..7493e150 100644
--- a/depthai_nodes/ml/parsers/__init__.py
+++ b/depthai_nodes/ml/parsers/__init__.py
@@ -14,7 +14,8 @@
 from .scrfd import SCRFDParser
 from .segmentation import SegmentationParser
 from .superanimal_landmarker import SuperAnimalParser
-from .xfeat import XFeatParser
+from .xfeat_mono import XFeatMonoParser
+from .xfeat_stereo import XFeatStereoParser
 from .yolo import YOLOExtendedParser
 from .yunet import YuNetParser
@@ -28,7 +29,8 @@
     "SuperAnimalParser",
     "KeypointParser",
     "MLSDParser",
-    "XFeatParser",
+    "XFeatMonoParser",
+    "XFeatStereoParser",
     "ClassificationParser",
     "YOLOExtendedParser",
     "FastSAMParser",
diff --git a/depthai_nodes/ml/parsers/utils/xfeat.py b/depthai_nodes/ml/parsers/utils/xfeat.py
index 364d9b6f..0ca892df 100644
--- a/depthai_nodes/ml/parsers/utils/xfeat.py
+++ b/depthai_nodes/ml/parsers/utils/xfeat.py
@@ -1,6 +1,5 @@
 from typing import Any, Dict, List, Tuple
 
-import cv2
 import numpy as np
 
@@ -59,26 +58,44 @@ def local_maximum_filter(x: np.ndarray, kernel_size: int) -> np.ndarray:
     return local_max
 
 
-def bilinear_grid_sample(
-    im: np.ndarray, grid: np.ndarray, align_corners: bool = False
-) -> np.ndarray:
-    """Bilinear grid sample.
+def normgrid(x, H, W):
+    """Normalize coordinates to the range [-1, 1].
+
+    @param x: Input coordinates with (x, y) in the last dimension, shape (..., 2)
+    @type x: np.ndarray
+    @param H: Height of the image the coordinates are defined on
+    @type H: int
+    @param W: Width of the image the coordinates are defined on
+    @type W: int
+    @return: Normalized coordinates, same shape as x
+    @rtype: np.ndarray
+    """
+    return 2.0 * (x / np.array([W - 1, H - 1], dtype=x.dtype)) - 1.0
 
-    @param im: Input image tensor.
+
+def bilinear(im, pos, H, W):
+    """Given an input feature map and a flow-field grid, computes the output using
+    input values and pixel locations from the grid. Only bilinear interpolation is
+    supported for sampling the input pixels.
+
+    @param im: Input feature map, shape (N, C, H, W)
     @type im: np.ndarray
-    @param grid: Grid tensor.
-    @type grid: np.ndarray
-    @param align_corners: Whether to align corners.
-    @type align_corners: bool
-    @return: Output image tensor after applying bilinear grid sample.
+    @param pos: Point coordinates, shape (N, num_points, 2)
+    @type pos: np.ndarray
+    @param H: Height of the image the coordinates are defined on
+    @type H: int
+    @param W: Width of the image the coordinates are defined on
+    @type W: int
+    @return: A tensor with sampled points, shape (N, C, num_points, 1)
     @rtype: np.ndarray
     """
+    align_corners = False
     n, c, h, w = im.shape
-    gn, gh, gw, _ = grid.shape
-    assert n == gn
+    grid = normgrid(pos, H, W)[..., np.newaxis]
+    grid = grid.transpose(0, 1, 3, 2)
 
-    x = grid[:, :, :, 0]
-    y = grid[:, :, :, 1]
+    x = grid[..., 0]
+    y = grid[..., 1]
 
     if align_corners:
         x = ((x + 1) / 2) * (w - 1)
@@ -95,10 +112,10 @@ def bilinear_grid_sample(
     x1 = x0 + 1
     y1 = y0 + 1
 
-    wa = ((x1 - x) * (y1 - y)).reshape(n, 1, -1)
-    wb = ((x1 - x) * (y - y0)).reshape(n, 1, -1)
-    wc = ((x - x0) * (y1 - y)).reshape(n, 1, -1)
-    wd = ((x - x0) * (y - y0)).reshape(n, 1, -1)
+    wa = ((x1 - x) * (y1 - y))[:, np.newaxis]
+    wb = ((x1 - x) * (y - y0))[:, np.newaxis]
+    wc = ((x - x0) * (y1 - y))[:, np.newaxis]
+    wd = ((x - x0) * (y - y0))[:, np.newaxis]
 
     # Apply padding
     im_padded = np.pad(
@@ -106,32 +123,33 @@ def bilinear_grid_sample(
     )
     padded_h = h + 2
     padded_w = w + 2
-    x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1
-    # Clip coordinates to padded image size
+    # Adjust points for padding
+    x0, x1 = x0 + 1, x1 + 1
+    y0, y1 = y0 + 1, y1 + 1
+
+    # Clip coordinates to stay within bounds
     x0 = np.clip(x0, 0, padded_w - 1)
     x1 = np.clip(x1, 0, padded_w - 1)
     y0 = np.clip(y0, 0, padded_h - 1)
     y1 = np.clip(y1, 0, padded_h - 1)
 
+    # Flatten im_padded for indexing
     im_padded = im_padded.reshape(n, c, -1)
 
-    x0_y0 = (x0 + y0 * padded_w).reshape(n, 1, -1)
-    x0_y1 = (x0 + y1 * padded_w).reshape(n, 1, -1)
-    x1_y0 = (x1 + y0 * padded_w).reshape(n, 1, -1)
-    x1_y1 = (x1 + y1 * padded_w).reshape(n, 1, -1)
+    x0_y0 = (x0 + y0 * padded_w)[:, np.newaxis].repeat(c, axis=1)
+    x0_y1 = (x0 + y1 * padded_w)[:, np.newaxis].repeat(c, axis=1)
+    x1_y0 = (x1 + y0 * padded_w)[:, np.newaxis].repeat(c, axis=1)
+    x1_y1 = (x1 + y1 * padded_w)[:, np.newaxis].repeat(c, axis=1)
 
-    def gather(im_padded, idx):
-        idx = idx.astype(np.int32)
-        gathered = np.take_along_axis(im_padded, idx, axis=2)
-        return gathered
+    Ia = np.take_along_axis(im_padded, x0_y0, axis=2)
+    Ib = np.take_along_axis(im_padded, x0_y1, axis=2)
+    Ic = np.take_along_axis(im_padded, x1_y0, axis=2)
+    Id = np.take_along_axis(im_padded, x1_y1, axis=2)
 
-    Ia = gather(im_padded, x0_y0)
-    Ib = gather(im_padded, x0_y1)
-    Ic = gather(im_padded, x1_y0)
-    Id = gather(im_padded, x1_y1)
-
-    result = (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)
+    result = (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(
+        n, c, grid.shape[1], grid.shape[2]
+    )
 
     return result
 
@@ -192,6 +210,7 @@ def _nms(
 def detect_and_compute(
     feats: np.ndarray,
     kpts: np.ndarray,
+    heatmaps: np.ndarray,
     resize_rate_w: float,
     resize_rate_h: float,
     input_size: Tuple[int, int],
@@ -220,37 +239,12 @@ def detect_and_compute(
     kpts_heats = _get_kpts_heatmap(kpts)
     mkpts = _nms(kpts_heats, threshold=0.05, kernel_size=5)  # int64
 
-    # Numpy implementation of normgrid
-    div_array = np.array([input_size[0] - 1, input_size[1] - 1], dtype=mkpts.dtype)
-    grid = 2.0 * (mkpts / div_array) - 1.0
-    grid = np.expand_dims(grid, axis=2)
-
-    if grid.size == 0:
+    if mkpts.size == 0:
         return None
 
-    # Numpy implementation of F.grid_sample
-    map_x = grid[..., 0].reshape(-1).astype(np.float32)
-    map_y = grid[..., 1].reshape(-1).astype(np.float32)
-    remapped = cv2.remap(
-        kpts_heats[0, 0],
-        map_x,
-        map_y,
-        interpolation=cv2.INTER_NEAREST,
-        borderMode=cv2.BORDER_CONSTANT,
-        borderValue=0,
-    )
-    nearest_result = np.expand_dims(remapped, axis=0)
+    nearest_result = bilinear(kpts_heats, mkpts, input_size[1], input_size[0])
+    bilinear_result = bilinear(heatmaps, mkpts, input_size[1], input_size[0])
 
-    # Numpy implementation of F.grid_sample
-    remapped = cv2.remap(
-        kpts_heats[0, 0],
-        map_x,
-        map_y,
-        interpolation=cv2.INTER_LINEAR,
-        borderMode=cv2.BORDER_CONSTANT,
-        borderValue=0,
-    )
-    bilinear_result = np.expand_dims(remapped, axis=0)
 
     scores = (nearest_result * bilinear_result).reshape(1, -1)
     scores = scores.astype(np.float32)
@@ -262,17 +256,12 @@ def detect_and_compute(
     mkpts_x = np.take_along_axis(mkpts[..., 0], idxs, axis=-1)[:, :top_k]
     mkpts_y = np.take_along_axis(mkpts[..., 1], idxs, axis=-1)[:, :top_k]
     mkpts = np.stack([mkpts_x, mkpts_y], axis=-1)
-    scores = np.take_along_axis(scores, idxs, axis=-1)[:, :top_k]
-
-    div_array = np.array([input_size[0] - 1, input_size[1] - 1], dtype=mkpts.dtype)
-    grid = 2.0 * (mkpts / div_array) - 1.0
-    grid = np.expand_dims(grid, axis=2)
-    map_x = grid[..., 0].reshape(-1).astype(np.float32)
-    map_y = grid[..., 1].reshape(-1).astype(np.float32)
     mkpts = mkpts.astype(np.float32)
-    feats = bilinear_grid_sample(feats, grid, align_corners=False)
-    feats = feats.transpose(0, 2, 3, 1).squeeze(-2)
+    scores = np.take_along_axis(scores, idxs, axis=-1)[:, :top_k]
+
+    feats = bilinear(feats, mkpts, input_size[1], input_size[0])
+    feats = feats[0].transpose(2, 1, 0)
 
     norm = np.linalg.norm(feats, axis=-1, keepdims=True)
     feats = feats / norm
diff --git a/depthai_nodes/ml/parsers/xfeat.py b/depthai_nodes/ml/parsers/xfeat_mono.py
similarity index 80%
rename from depthai_nodes/ml/parsers/xfeat.py
rename to depthai_nodes/ml/parsers/xfeat_mono.py
index 72db15e6..37e59c5a 100644
--- a/depthai_nodes/ml/parsers/xfeat.py
+++ b/depthai_nodes/ml/parsers/xfeat_mono.py
@@ -7,8 +7,10 @@
 from .utils.xfeat import detect_and_compute, match
 
 
-class XFeatParser(dai.node.ThreadedHostNode):
-    """Parser class for parsing the output of the XFeat model.
+class XFeatMonoParser(dai.node.ThreadedHostNode):
+    """Parser class for parsing the output of the XFeat model. It can be used for
+    parsing the output from one source (e.g. one camera). The reference frame can be
+    set with the trigger method.
 
     Attributes
     ----------
@@ -24,6 +26,8 @@ class XFeatParser(dai.node.ThreadedHostNode):
         Maximum number of keypoints to keep.
     previous_results : np.ndarray
         Previous results from the model. Previous results are used to match keypoints between two frames.
+    trigger : bool
+        When set, the next parsed result is stored as the reference frame.
 
     Output Message/s
     ----------------
@@ -48,6 +52,8 @@ def __init__(
         @type original_size: Tuple[float, float]
         @param input_size: Input image size.
         @type input_size: Tuple[float, float]
+        @param max_keypoints: Maximum number of keypoints to keep.
+        @type max_keypoints: int
         """
         dai.node.ThreadedHostNode.__init__(self)
         self.input = self.createInput()
@@ -56,6 +62,7 @@ def __init__(
         self.input_size = input_size
         self.max_keypoints = max_keypoints
         self.previous_results = None
+        self.trigger = False
 
     def setOriginalSize(self, original_size):
         """Sets the original image size.
@@ -81,6 +88,10 @@ def setMaxKeypoints(self, max_keypoints):
         """
         self.max_keypoints = max_keypoints
 
+    def setTrigger(self):
+        """Sets the trigger so that the next parsed frame is stored as the reference frame."""
+        self.trigger = True
+
     def run(self):
         if self.original_size is None:
             raise ValueError("Original image size must be specified!")
@@ -98,6 +109,7 @@ def run(self):
             keypoints = output.getTensor("keypoints", dequantize=True).astype(
                 np.float32
             )
+            heatmaps = output.getTensor("heatmaps", dequantize=True).astype(np.float32)
 
             if len(feats.shape) == 3:
                 feats = feats.reshape((1,) + feats.shape).transpose(0, 3, 1, 2)
@@ -105,10 +117,13 @@ def run(self):
                 keypoints = keypoints.reshape((1,) + keypoints.shape).transpose(
                     0, 3, 1, 2
                 )
+            if len(heatmaps.shape) == 3:
+                heatmaps = heatmaps.reshape((1,) + heatmaps.shape).transpose(0, 3, 1, 2)
 
             result = detect_and_compute(
                 feats,
                 keypoints,
+                heatmaps,
                 resize_rate_w,
                 resize_rate_h,
                 self.input_size,
@@ -128,6 +143,11 @@ def run(self):
                 matched_points = create_tracked_features_message(mkpts0, mkpts1)
                 matched_points.setTimestamp(output.getTimestamp())
                 self.out.send(matched_points)
+            else:
+                matched_points = dai.TrackedFeatures()
+                matched_points.setTimestamp(output.getTimestamp())
+                self.out.send(matched_points)
 
-            # save the result from first frame
-            self.previous_results = result
+            if self.trigger:
+                self.previous_results = result
+                self.trigger = False
diff --git a/depthai_nodes/ml/parsers/xfeat_stereo.py b/depthai_nodes/ml/parsers/xfeat_stereo.py
new file mode 100644
index 00000000..db8c613b
--- /dev/null
+++ b/depthai_nodes/ml/parsers/xfeat_stereo.py
@@ -0,0 +1,185 @@
+from typing import Tuple
+
+import depthai as dai
+import numpy as np
+
+from ..messages.creators import create_tracked_features_message
+from .utils.xfeat import detect_and_compute, match
+
+
+class XFeatStereoParser(dai.node.ThreadedHostNode):
+    """Parser class for parsing the output of the XFeat model. It can be used for
+    parsing the output from two sources (e.g. two cameras - left and right).
+
+    Attributes
+    ----------
+    reference_input : Node.Input
+        Node's input for the reference source. It is a linking point to which the output of the Neural Network processing the reference frames is linked.
+    target_input : Node.Input
+        Node's input for the target source. It is a linking point to which the output of the Neural Network processing the target frames is linked.
+    out : Node.Output
+        Parser sends the processed network results to this output in a form of DepthAI message. It is a linking point from which the processed network results are retrieved.
+    original_size : Tuple[float, float]
+        Original image size.
+    input_size : Tuple[float, float]
+        Input image size.
+    max_keypoints : int
+        Maximum number of keypoints to keep.
+
+    Output Message/s
+    ----------------
+    **Type**: dai.TrackedFeatures
+
+    **Description**: TrackedFeatures message containing matched keypoints with the same ID.
+
+    Error Handling
+    --------------
+    **ValueError**: If the original image size is not specified.
+    """
+
+    def __init__(
+        self,
+        original_size: Tuple[float, float] = None,
+        input_size: Tuple[float, float] = (640, 352),
+        max_keypoints: int = 4096,
+    ):
+        """Initializes the XFeatStereoParser node.
+
+        @param original_size: Original image size.
+        @type original_size: Tuple[float, float]
+        @param input_size: Input image size.
+        @type input_size: Tuple[float, float]
+        @param max_keypoints: Maximum number of keypoints to keep.
+        @type max_keypoints: int
+        """
+        dai.node.ThreadedHostNode.__init__(self)
+        self.reference_input = self.createInput()
+        self.target_input = self.createInput()
+        self.out = self.createOutput()
+        self.original_size = original_size
+        self.input_size = input_size
+        self.max_keypoints = max_keypoints
+
+    def setOriginalSize(self, original_size):
+        """Sets the original image size.
+
+        @param original_size: Original image size.
+        @type original_size: Tuple[float, float]
+        """
+        self.original_size = original_size
+
+    def setInputSize(self, input_size):
+        """Sets the input image size.
+
+        @param input_size: Input image size.
+        @type input_size: Tuple[float, float]
+        """
+        self.input_size = input_size
+
+    def setMaxKeypoints(self, max_keypoints):
+        """Sets the maximum number of keypoints to keep.
+
+        @param max_keypoints: Maximum number of keypoints.
+        @type max_keypoints: int
+        """
+        self.max_keypoints = max_keypoints
+
+    def run(self):
+        if self.original_size is None:
+            raise ValueError("Original image size must be specified!")
+
+        resize_rate_w = self.original_size[0] / self.input_size[0]
+        resize_rate_h = self.original_size[1] / self.input_size[1]
+
+        while self.isRunning():
+            try:
+                reference_output: dai.NNData = self.reference_input.get()
+                target_output: dai.NNData = self.target_input.get()
+            except dai.MessageQueue.QueueException:
+                break  # Pipeline was stopped
+
+            reference_feats = reference_output.getTensor(
+                "feats", dequantize=True
+            ).astype(np.float32)
+            reference_keypoints = reference_output.getTensor(
+                "keypoints", dequantize=True
+            ).astype(np.float32)
+            reference_heatmaps = reference_output.getTensor(
+                "heatmaps", dequantize=True
+            ).astype(np.float32)
+
+            target_feats = target_output.getTensor("feats", dequantize=True).astype(
+                np.float32
+            )
+            target_keypoints = target_output.getTensor(
+                "keypoints", dequantize=True
+            ).astype(np.float32)
+            target_heatmaps = target_output.getTensor(
+                "heatmaps", dequantize=True
+            ).astype(np.float32)
+
+            if len(reference_feats.shape) == 3:
+                reference_feats = reference_feats.reshape(
+                    (1,) + reference_feats.shape
+                ).transpose(0, 3, 1, 2)
+            if len(reference_keypoints.shape) == 3:
+                reference_keypoints = reference_keypoints.reshape(
+                    (1,) + reference_keypoints.shape
+                ).transpose(0, 3, 1, 2)
+            if len(reference_heatmaps.shape) == 3:
+                reference_heatmaps = reference_heatmaps.reshape(
+                    (1,) + reference_heatmaps.shape
+                ).transpose(0, 3, 1, 2)
+
+            if len(target_feats.shape) == 3:
+                target_feats = target_feats.reshape(
+                    (1,) + target_feats.shape
+                ).transpose(0, 3, 1, 2)
+            if len(target_keypoints.shape) == 3:
+                target_keypoints = target_keypoints.reshape(
+                    (1,) + target_keypoints.shape
+                ).transpose(0, 3, 1, 2)
+            if len(target_heatmaps.shape) == 3:
+                target_heatmaps = target_heatmaps.reshape(
+                    (1,) + target_heatmaps.shape
+                ).transpose(0, 3, 1, 2)
+
+            reference_result = detect_and_compute(
+                reference_feats,
+                reference_keypoints,
+                reference_heatmaps,
+                resize_rate_w,
+                resize_rate_h,
+                self.input_size,
+                self.max_keypoints,
+            )
+
+            target_result = detect_and_compute(
+                target_feats,
+                target_keypoints,
+                target_heatmaps,
+                resize_rate_w,
+                resize_rate_h,
+                self.input_size,
+                self.max_keypoints,
+            )
+
+            if reference_result is not None:
+                reference_result = reference_result[0]
+            else:
+                matched_points = dai.TrackedFeatures()
+                matched_points.setTimestamp(reference_output.getTimestamp())
+                self.out.send(matched_points)
+                continue
+
+            if target_result is not None:
+                target_result = target_result[0]
+            else:
+                matched_points = dai.TrackedFeatures()
+                matched_points.setTimestamp(target_output.getTimestamp())
+                self.out.send(matched_points)
+                continue
+
+            mkpts0, mkpts1 = match(reference_result, target_result)
+            matched_points = create_tracked_features_message(mkpts0, mkpts1)
+            matched_points.setTimestamp(target_output.getTimestamp())
+            self.out.send(matched_points)
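
For reviewers, a minimal wiring sketch of the two new parsers follows. The `nn_left`/`nn_right` NeuralNetwork nodes, the example resolutions, and the use of `pipeline.create` with host-node subclasses are illustrative assumptions (model and camera setup are omitted); the sketch only shows how the inputs, outputs, and setters introduced in this PR fit together, not a definitive integration.

```python
import depthai as dai

from depthai_nodes.ml.parsers import XFeatMonoParser, XFeatStereoParser

pipeline = dai.Pipeline()

# Hypothetical NeuralNetwork nodes running XFeat; they are assumed to expose
# the "feats", "keypoints", and "heatmaps" output tensors parsed above.
nn_left = pipeline.create(dai.node.NeuralNetwork)
nn_right = pipeline.create(dai.node.NeuralNetwork)

# Mono: matches each frame against a reference frame captured on demand.
mono = pipeline.create(XFeatMonoParser)
mono.setOriginalSize((1920, 1080))  # example source resolution
mono.setInputSize((640, 352))       # must match the model input size
mono.setMaxKeypoints(2048)
nn_left.out.link(mono.input)
mono.setTrigger()  # the next parsed frame becomes the reference

# Stereo: matches keypoints between two synchronized sources.
stereo = pipeline.create(XFeatStereoParser)
stereo.setOriginalSize((1920, 1080))
stereo.setInputSize((640, 352))
nn_left.out.link(stereo.reference_input)
nn_right.out.link(stereo.target_input)

# Both parsers emit dai.TrackedFeatures messages on their `out` output.
```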