first commit

2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions
--- a/rtdetrv2_pytorch/references/deploy/readme.md
+++ b/rtdetrv2_pytorch/references/deploy/readme.md
@@ -0,0 +1,2 @@
+# Deployment
+
--- a/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py
@@ -0,0 +1,61 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision.transforms as T
+
+import numpy as np 
+import onnxruntime as ort 
+from PIL import Image, ImageDraw
+
+
+def draw(images, labels, boxes, scores, thrh = 0.6):
+    for i, im in enumerate(images):
+        draw = ImageDraw.Draw(im)
+
+        scr = scores[i]
+        lab = labels[i][scr > thrh]
+        box = boxes[i][scr > thrh]
+
+        for b in box:
+            draw.rectangle(list(b), outline='red',)
+            draw.text((b[0], b[1]), text=str(lab[i].item()), fill='blue', )
+
+        im.save(f'results_{i}.jpg')
+
+
+def main(args, ):
+    """main
+    """
+    sess = ort.InferenceSession(args.onnx_file)
+    print(ort.get_device())
+
+    im_pil = Image.open(args.im_file).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None]
+
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    im_data = transforms(im_pil)[None]
+
+    output = sess.run(
+        # output_names=['labels', 'boxes', 'scores'],
+        output_names=None,
+        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": orig_size.data.numpy()}
+    )
+
+    labels, boxes, scores = output
+
+    draw([im_pil], labels, boxes, scores)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--onnx-file', type=str, )
+    parser.add_argument('--im-file', type=str, )
+    # parser.add_argument('-d', '--device', type=str, default='cpu')
+    args = parser.parse_args()
+    main(args)
--- a/rtdetrv2_pytorch/references/deploy/rtdetrv2_openvino.py
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_openvino.py
@@ -0,0 +1,5 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+# please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO
--- a/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py
@@ -0,0 +1,258 @@
+# Copyright 2023 lyuwenyu. All Rights Reserved.
+# Copyright (c) 2025 Hitbee-dev. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+# NOTICE: This file has been heavily modified by [Hitbee-dev] from the original source.
+# Modifications include restructuring for broader GPU architecture compatibility
+# (including NVIDIA Blackwell), improved modularity, and enhanced testability.
+# ==============================================================================
+
+import time
+import numpy as np
+import torch
+import tensorrt as trt
+from collections import OrderedDict
+from PIL import Image, ImageDraw, ImageFont
+
+class TRTInference(object):
+    """
+    A high-level wrapper for TensorRT inference, designed for ease of use and flexibility.
+    This class handles engine loading, context creation, and dynamic buffer allocation.
+    """
+    def __init__(self, engine_path, device='cuda:0', verbose=False):
+        """
+        Initializes the TRTInference instance.
+
+        Args:
+            engine_path (str): Path to the serialized TensorRT engine file.
+            device (str): The device to run inference on (e.g., 'cuda:0').
+            verbose (bool): If True, enables verbose logging from the TensorRT logger.
+        """
+        self.engine_path = engine_path
+        self.device = torch.device(device)
+        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
+        
+        trt.init_libnvinfer_plugins(self.logger, '')
+        self.runtime = trt.Runtime(self.logger)
+        self.engine = self._load_engine(engine_path)
+        self.context = self.engine.create_execution_context()
+
+        self.input_names, self.output_names = self._get_io_names()
+
+        self.buffers_allocated = False
+        self.gpu_buffers = OrderedDict()
+        self.binding_addrs = OrderedDict()
+
+        print(f"[TRTInference] Initialized successfully. Engine: '{engine_path}'.")
+
+    def _load_engine(self, path):
+        """Loads a TensorRT engine from a file."""
+        with open(path, 'rb') as f:
+            engine = self.runtime.deserialize_cuda_engine(f.read())
+        if engine is None:
+            raise RuntimeError(f"Failed to load TensorRT engine from '{path}'.")
+        return engine
+
+    def _get_io_names(self):
+        """Parses input and output tensor names from the engine."""
+        input_names, output_names = [], []
+        for i in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(i)
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                input_names.append(name)
+            else:
+                output_names.append(name)
+        return input_names, output_names
+
+    def _allocate_buffers(self, blob: dict):
+        """
+        Allocates GPU buffers for inputs and outputs based on the first inference request.
+        This "lazy allocation" strategy handles dynamic input shapes gracefully.
+        """
+        print("[TRTInference] First inference call detected. Allocating GPU buffers...")
+        for name in self.input_names:
+            tensor = blob[name]
+            shape = tuple(tensor.shape)
+            dtype = tensor.dtype
+            self.context.set_input_shape(name, shape)
+            self.gpu_buffers[name] = torch.empty(shape, dtype=dtype, device=self.device)
+            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
+            print(f"  - Input '{name}': allocated buffer with shape {shape}.")
+
+        for name in self.output_names:
+            shape = tuple(self.context.get_tensor_shape(name))
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            torch_dtype = torch.from_numpy(np.array(0, dtype=dtype)).dtype
+            self.gpu_buffers[name] = torch.empty(shape, dtype=torch_dtype, device=self.device)
+            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
+            print(f"  - Output '{name}': allocated buffer with shape {shape}.")
+
+        self.buffers_allocated = True
+        print("[TRTInference] GPU buffers allocated successfully.")
+
+    def __call__(self, blob: dict):
+        """
+        Executes inference on the loaded TensorRT engine.
+
+        Args:
+            blob (dict): A dictionary mapping input tensor names to their corresponding
+                         torch.Tensor data on the GPU.
+
+        Returns:
+            dict: A dictionary mapping output tensor names to their corresponding
+                  torch.Tensor results on the GPU.
+        """
+        if not self.buffers_allocated:
+            self._allocate_buffers(blob)
+            
+        for name in self.input_names:
+            self.gpu_buffers[name].copy_(blob[name])
+
+        self.context.execute_v2(bindings=list(self.binding_addrs.values()))
+        
+        return {name: self.gpu_buffers[name] for name in self.output_names}
+
+# --- Visualization Utility Function ---
+COCO_CLASSES = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
+    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
+    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+]
+
+def visualize_detections(image_pil, boxes, scores, labels, class_names=COCO_CLASSES, threshold=0.5):
+    """
+    Draws bounding boxes on a PIL image. This function is a general-purpose utility.
+
+    Args:
+        image_pil (PIL.Image.Image): The image to draw on.
+        boxes (torch.Tensor): A tensor of bounding boxes (shape: [N, 4]).
+        scores (torch.Tensor): A tensor of confidence scores (shape: [N]).
+        labels (torch.Tensor): A tensor of class labels (shape: [N]).
+        class_names (list): A list of strings corresponding to class labels.
+        threshold (float): The confidence threshold for displaying detections.
+
+    Returns:
+        PIL.Image.Image: The image with detections drawn on it.
+    """
+    img_draw = image_pil.copy()
+    draw = ImageDraw.Draw(img_draw)
+    
+    # Ensure tensors are on CPU and converted to NumPy for processing
+    boxes = boxes.cpu().numpy()
+    scores = scores.cpu().numpy()
+    labels = labels.cpu().numpy()
+    
+    count = 0
+    for i in range(len(scores)):
+        score = scores[i]
+        if score < threshold:
+            continue
+        
+        count += 1
+        box = boxes[i]
+        label_idx = int(labels[i])
+        
+        xmin, ymin, xmax, ymax = box
+        class_name = class_names[label_idx] if label_idx < len(class_names) else f'CLS-{label_idx}'
+        color = 'red' # Keep it simple or use a color map
+        
+        draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=color, width=3)
+        
+        text = f"{class_name}: {score:.2f}"
+        
+        try:
+            font = ImageFont.truetype("arial.ttf", 20)
+        except IOError:
+            font = ImageFont.load_default()
+
+        text_bbox = draw.textbbox((xmin, ymin), text, font=font)
+        draw.rectangle(text_bbox, fill=color)
+        draw.text((xmin, ymin), text, fill="white", font=font)
+        
+    print(f"   - Found {count} objects above threshold {threshold}.")
+    return img_draw
+
+if __name__ == '__main__':
+    import argparse
+    import torchvision.transforms as T
+    import os
+
+    parser = argparse.ArgumentParser(description="Test script for the TRTInference wrapper.")
+    parser.add_argument('--engine', type=str, required=True, help="Path to the TensorRT engine file.")
+    parser.add_argument('--image', type=str, required=True, help="Path to the input image file.")
+    parser.add_argument('--output', type=str, default='output.jpg', help="Path to save the output image with detections.")
+    parser.add_argument('--device', type=str, default='cuda:0', help="Device to run inference on.")
+    parser.add_argument('--threshold', type=float, default=0.5, help="Confidence threshold for displaying detections.")
+    args = parser.parse_args()
+    
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA is not available. This script requires a GPU.")
+    
+    print("--- TRTInference Wrapper Test ---")
+    
+    print("\n1. Initializing TRTInference...")
+    trt_model = TRTInference(args.engine, device=args.device)
+    
+    print("\n2. Preprocessing input image...")
+    image_pil = Image.open(args.image).convert('RGB')
+    w, h = image_pil.size
+    
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    
+    image_tensor = transforms(image_pil).unsqueeze(0).to(args.device)
+    orig_size_tensor = torch.tensor([[w, h]], dtype=torch.int64, device=args.device)
+
+    blob = {
+        'images': image_tensor,
+        'orig_target_sizes': orig_size_tensor
+    }
+    print(f"   - Original image size: {w}x{h}")
+    print(f"   - Input tensor shape: {image_tensor.shape}")
+
+    print("\n3. Running inference...")
+    start_time = time.time()
+    output_gpu = trt_model(blob)
+    torch.cuda.synchronize()
+    end_time = time.time()
+    
+    print(f"\n4. Inference complete in { (end_time - start_time) * 1000:.2f} ms.")
+    
+    print("\n5. Post-processing and saving output image...")
+    output_labels = output_gpu['labels'][0]
+    output_boxes = output_gpu['boxes'][0]
+    output_scores = output_gpu['scores'][0]
+    
+    # Use the new, separate visualization function
+    result_image = visualize_detections(
+        image_pil, 
+        output_boxes, 
+        output_scores, 
+        output_labels, 
+        threshold=args.threshold
+    )
+    
+    result_image.save(args.output)
+    print(f"   - Output image with detections saved to: {os.path.abspath(args.output)}")
+
+    print("\n--- Test finished successfully ---")
--- a/rtdetrv2_pytorch/references/deploy/rtdetrv2_torch.py
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_torch.py
@@ -0,0 +1,84 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn as nn 
+import torchvision.transforms as T
+
+import numpy as np 
+from PIL import Image, ImageDraw
+
+from src.core import YAMLConfig
+
+
+def draw(images, labels, boxes, scores, thrh = 0.6):
+    for i, im in enumerate(images):
+        draw = ImageDraw.Draw(im)
+
+        scr = scores[i]
+        lab = labels[i][scr > thrh]
+        box = boxes[i][scr > thrh]
+        scrs = scores[i][scr > thrh]
+
+        for j,b in enumerate(box):
+            draw.rectangle(list(b), outline='red',)
+            draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(),2)}", fill='blue', )
+
+        im.save(f'results_{i}.jpg')
+
+
+def main(args, ):
+    """main
+    """
+    cfg = YAMLConfig(args.config, resume=args.resume)
+
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu') 
+        if 'ema' in checkpoint:
+            state = checkpoint['ema']['module']
+        else:
+            state = checkpoint['model']
+    else:
+        raise AttributeError('Only support resume to load model.state_dict by now.')
+
+    # NOTE load train mode state -> convert to deploy mode
+    cfg.model.load_state_dict(state)
+
+    class Model(nn.Module):
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+            
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    model = Model().to(args.device)
+
+    im_pil = Image.open(args.im_file).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None].to(args.device)
+
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    im_data = transforms(im_pil)[None].to(args.device)
+
+    output = model(im_data, orig_size)
+    labels, boxes, scores = output
+
+    draw([im_pil], labels, boxes, scores)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, )
+    parser.add_argument('-r', '--resume', type=str, )
+    parser.add_argument('-f', '--im-file', type=str, )
+    parser.add_argument('-d', '--device', type=str, default='cpu')
+    args = parser.parse_args()
+    main(args)