first commit

2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -0,0 +1,59 @@
+# 论文测速使用的部分代码和工具
+
+
+## 测试YOLO系列的速度 [in progress]
+以[yolov8](https://github.com/ultralytics/ultralytics)为例
+
+<details open>
+<summary>1. 转onnx </summary>  
+
+执行`yolov8_onnx.py`中的`export_onnx`函数，新增代码主要涉及输出格式的转换
+</details>
+
+
+<details>
+<summary>2. 插入nms </summary>
+
+使用`utils.py`中的`yolo_insert_nms`函数，导出onnx模型后使用[Netron](https://netron.app/)查看结构. <img width="924" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/cb466483-d3a3-4f23-a68d-7ab8825059c8">
+</details>
+
+
+<details>
+<summary>3. 转tensorrt </summary>
+
+可以使用`trtexec.md`中的脚本转换，或者使用`utils.py`中的Python代码转换
+```bash
+# trtexec -h
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+```
+</details>
+
+
+<details>
+<summary>4. trtexec测速 </summary>
+
+可以使用`trtexec.md`中的脚本，去掉`--buildOnly`参数后即可测速
+
+</details>
+
+
+
+<details>
+<summary>5. profile分析（可选） </summary>
+
+在4的trtexec命令之前加上以下前缀命令（完整示例见`trtexec.md`）
+```bash
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms 
+```
+可以使用nsys可视化分析
+<img width="1090" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/507d8bde-9e7c-4ae5-b571-976c540ef2c6">
+
+</details>
+
+
+<details>
+<summary>6. Python测速或者部署   </summary>
+
+使用`trtinfer.py`中的代码进行推理，可在COCO val数据集上测量模型的平均速度
+
+</details>
--- a/benchmark/dataset.py
+++ b/benchmark/dataset.py
@@ -0,0 +1,102 @@
+'''by lyuwenyu
+'''
+
+import os
+import glob
+from PIL import Image
+
+import torch
+import torch.utils.data as data
+import torchvision
+import torchvision.transforms as T 
+import torchvision.transforms.functional as F 
+
+
+class ToTensor(T.ToTensor):
+    '''`T.ToTensor` variant that leaves inputs which are already tensors untouched.
+
+    Anything that is not a `torch.Tensor` is handed to the parent
+    `T.ToTensor.__call__` for conversion.
+    '''
+
+    def __call__(self, pic):
+        '''Return `pic` unchanged when it is a `torch.Tensor`, otherwise convert it.'''
+        already_tensor = isinstance(pic, torch.Tensor)
+        if not already_tensor:
+            return super().__call__(pic)
+        return pic
+
+class PadToSize(T.Pad):
+    '''Pad an image on two sides only until it reaches a target size.
+
+    Args:
+        size: target size, indexed as `size[0]` (compared with the image
+            width) and `size[1]` (compared with the image height).
+        fill: fill value forwarded to `F.pad`.
+        padding_mode: padding mode forwarded to `F.pad`.
+    '''
+
+    def __init__(self, size, fill=0, padding_mode='constant'):
+        # The parent is given a placeholder padding of 0; the real amounts are
+        # computed per image in `__call__`.
+        super().__init__(0, fill, padding_mode)
+        self.fill = fill
+        self.size = size
+
+    def __call__(self, img):
+        '''Pad `img` up to `self.size`.
+
+        Args:
+            img (PIL Image or Tensor): image to be padded.
+
+        Returns:
+            PIL Image or Tensor: padded image.
+        '''
+        width, height = F.get_image_size(img)
+        extra_w = self.size[0] - width
+        extra_h = self.size[1] - height
+        # The first two entries stay 0, so all padding goes into the last two
+        # (torchvision's 4-tuple order is left, top, right, bottom).
+        pad_spec = (0, 0, extra_w, extra_h)
+        return F.pad(img, pad_spec, self.fill, self.padding_mode)
+
+
+class Dataset(data.Dataset):
+    '''Folder-of-JPEGs dataset that decodes and preprocesses images for benchmarking.
+
+    Args:
+        img_dir: directory searched (non-recursively) for `*.jpg` files.
+        preprocess: transform applied to every decoded image. When `None`, a
+            default resize -> pad -> float-conversion pipeline is built.
+        device: device handed to `torchvision.io.decode_jpeg`.
+    '''
+
+    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:
+        super().__init__()
+
+        self.device = device
+        self.size = 640
+
+        # glob makes no ordering promise (it depends on the file system), so
+        # sort to keep the index -> file mapping identical across runs.
+        self.im_path_list = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))
+
+        if preprocess is None:
+            self.preprocess = T.Compose([
+                    # NOTE(review): size=639 next to max_size=640 is presumably there
+                    # because torchvision wants max_size > size -- confirm.
+                    T.Resize(size=639, max_size=640),
+                    PadToSize(size=(640, 640), fill=114),
+                    ToTensor(),
+                    T.ConvertImageDtype(torch.float),
+            ])
+        else:
+            self.preprocess = preprocess
+
+    def __len__(self, ):
+        '''Number of `*.jpg` files found in `img_dir`.'''
+        return len(self.im_path_list)
+
+    def __getitem__(self, index):
+        '''Decode image `index`, preprocess it and attach size metadata.
+
+        Returns:
+            dict with the keys
+            'image': the preprocessed image,
+            'im_shape': tensor `[size, size]`,
+            'scale_factor': tensor `[size / h, size / w]` of the decoded image,
+            'orig_size': tensor `[w, h]` of the decoded image.
+            All metadata tensors are moved to the device of 'image'.
+        '''
+        im = torchvision.io.read_file(self.im_path_list[index])
+        im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device)
+        _, h, w = im.shape # c,h,w
+
+        im = self.preprocess(im)
+
+        # NOTE(review): scale_factor is computed from the fixed target size, not
+        # from the ratio actually applied by `preprocess` -- confirm consumers expect this.
+        blob = {
+            'image': im,
+            'im_shape': torch.tensor([self.size, self.size]).to(im.device),
+            'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device),
+            'orig_size': torch.tensor([w, h]).to(im.device),
+        }
+
+        return blob
+
+    @staticmethod
+    def post_process():
+        '''Placeholder; intentionally does nothing.'''
+        pass
+
+    @staticmethod
+    def collate_fn():
+        '''Placeholder; intentionally does nothing.'''
+        pass
+
+
+def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):
+    '''Draw the detections above a score threshold and save one JPEG per image.
+
+    Args:
+        blob: mapping with an 'image' entry; its first dimension is looped over
+            and every image is multiplied by 255 and cast to uint8 before drawing.
+        outputs: mapping read for 'det_scores' and 'det_boxes'. The NMS outputs
+            also carry 'num_dets' and 'det_classes'; they are not used here.
+        draw_score_threshold: only boxes with a score strictly above this are drawn.
+        name: tag inserted into the output file name `test_{name}_{i}.jpg`,
+            written to the current working directory.
+    '''
+    num_images = blob['image'].shape[0]
+    for i in range(num_images):
+        scores = outputs['det_scores'][i]
+        keep = scores > draw_score_threshold
+        boxes = outputs['det_boxes'][i][keep]
+
+        canvas = (blob['image'][i] * 255).to(torch.uint8)
+        canvas = torchvision.utils.draw_bounding_boxes(canvas, boxes=boxes, width=2)
+
+        # channels-first -> channels-last host array, as PIL expects
+        hwc = canvas.permute(1, 2, 0).cpu().numpy()
+        Image.fromarray(hwc).save(f'test_{name}_{i}.jpg')
--- a/benchmark/trtexec.md
+++ b/benchmark/trtexec.md
@@ -0,0 +1,13 @@
+
+```bash
+# build tensorrt engine 
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+
+# using dynamic shapes
+# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640  --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640
+
+# timeline 
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms  trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin'
+
+# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880
+```
--- a/benchmark/trtinfer.py
+++ b/benchmark/trtinfer.py
@@ -0,0 +1,153 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+from collections import namedtuple, OrderedDict
+
+import torch
+import numpy as np
+import tensorrt as trt
+
+from utils import TimeProfiler
+
+class TRTInference(object):
+    '''Run a serialized TensorRT engine on torch tensors or host arrays.
+
+    Args:
+        engine_path: path of the serialized engine file.
+        device: torch device on which the 'torch' backend allocates its buffers.
+        backend: 'torch' or 'cuda'; selects buffer allocation and the run path.
+        max_batch_size: value substituted for a dynamic (-1) leading dimension
+            when the buffers are allocated.
+        verbose: use TensorRT's VERBOSE log level instead of INFO.
+
+    NOTE(review): the 'cuda' backend uses a module-level name `cuda` that the
+    imports of this file do not define (presumably `pycuda.driver` -- confirm).
+    Until that import exists, the 'cuda' path fails with NameError.
+    '''
+
+    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
+        self.engine_path = engine_path
+        self.device = device
+        self.backend = backend
+        self.max_batch_size = max_batch_size
+
+        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
+
+        self.engine = self.load_engine(engine_path)
+
+        self.context = self.engine.create_execution_context()
+
+        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
+        # Buffer addresses in engine iteration order; `run_torch` overwrites the
+        # input entries with the addresses of the caller's tensors on every call.
+        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
+
+        self.input_names = self.get_input_names()
+        self.output_names = self.get_output_names()
+
+        if self.backend == 'cuda':
+            # NOTE(review): `cuda` is not imported in this file -- see class docstring.
+            self.stream = cuda.Stream()
+
+        self.time_profile = TimeProfiler()
+
+    def init(self, ):
+        '''Set `self.dynamic` to False. Not called anywhere in this class.'''
+        self.dynamic = False
+
+    def load_engine(self, path):
+        '''Register the TensorRT plugins and deserialize the engine stored at `path`.
+        '''
+        trt.init_libnvinfer_plugins(self.logger, '')
+        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
+
+    def _tensor_names(self, mode):
+        '''Names of the engine tensors whose I/O mode equals `mode`, in engine order.'''
+        return [name for name in self.engine if self.engine.get_tensor_mode(name) == mode]
+
+    def get_input_names(self, ):
+        '''Names of the engine's input tensors.'''
+        return self._tensor_names(trt.TensorIOMode.INPUT)
+
+    def get_output_names(self, ):
+        '''Names of the engine's output tensors.'''
+        return self._tensor_names(trt.TensorIOMode.OUTPUT)
+
+    def get_bindings(self, engine, context, max_batch_size=32, device=None):
+        '''Allocate one buffer per engine tensor.
+
+        Returns:
+            OrderedDict mapping tensor name -> Binding(name, dtype, shape, data, ptr),
+            in engine iteration order.
+        '''
+        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
+        bindings = OrderedDict()
+
+        for name in engine:
+            shape = engine.get_tensor_shape(name)
+            dtype = trt.nptype(engine.get_tensor_dtype(name))
+            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
+
+            if shape[0] == -1:
+                # Dynamic leading dimension: size the buffer for the largest batch
+                # and tell the context about it for inputs.
+                shape[0] = max_batch_size
+                if is_input:
+                    context.set_input_shape(name, shape)
+
+            if self.backend == 'cuda':
+                # NOTE(review): `cuda` is not imported in this file -- see class docstring.
+                if is_input:
+                    data = np.random.randn(*shape).astype(dtype)
+                else:
+                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
+                ptr = cuda.mem_alloc(data.nbytes)
+            else:
+                # Every backend other than 'cuda' gets torch-owned buffers.
+                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
+                ptr = data.data_ptr()
+
+            bindings[name] = Binding(name, dtype, shape, data, ptr)
+
+        return bindings
+
+    def run_torch(self, blob):
+        '''Run the engine on torch inputs.
+
+        Args:
+            blob: mapping from every input tensor name to a tensor exposing
+                `.shape` and `.data_ptr()`.
+
+        Returns:
+            dict mapping output names to the preallocated output buffers; the
+            same buffers are returned (and overwritten) on every call.
+        '''
+        for n in self.input_names:
+            if self.bindings[n].shape != blob[n].shape:
+                self.context.set_input_shape(n, blob[n].shape)
+                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)
+
+        # Read the inputs straight from the caller's tensors instead of copying
+        # them into the preallocated input buffers.
+        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
+        self.context.execute_v2(list(self.bindings_addr.values()))
+        outputs = {n: self.bindings[n].data for n in self.output_names}
+
+        return outputs
+
+    def async_run_cuda(self, blob):
+        '''Run the engine on host (numpy) inputs through `self.stream`.
+
+        Inputs are copied host->device, the engine is enqueued, outputs are
+        copied device->host, and the stream is synchronized before returning.
+        '''
+        for n in self.input_names:
+            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)
+
+        bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
+        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)
+
+        outputs = {}
+        for n in self.output_names:
+            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
+            outputs[n] = self.bindings[n].data
+
+        self.stream.synchronize()
+
+        return outputs
+
+    def __call__(self, blob):
+        '''Dispatch `blob` to the run method of the configured backend.
+
+        Raises:
+            ValueError: if `self.backend` is neither 'torch' nor 'cuda'. Without
+                this, an unknown backend made every call a silent no-op and
+                `speed` would report the time of doing nothing.
+        '''
+        if self.backend == 'torch':
+            return self.run_torch(blob)
+
+        if self.backend == 'cuda':
+            return self.async_run_cuda(blob)
+
+        raise ValueError(f"unsupported backend {self.backend!r}; expected 'torch' or 'cuda'")
+
+    def synchronize(self, ):
+        '''Wait for outstanding device work of the configured backend.'''
+        if self.backend == 'torch' and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        elif self.backend == 'cuda':
+            self.stream.synchronize()
+
+    def warmup(self, blob, n):
+        '''Run `blob` through the engine `n` times and discard the results.'''
+        for _ in range(n):
+            _ = self(blob)
+
+    def speed(self, blob, n):
+        '''Average time of `n` timed runs of `blob`, as measured by `self.time_profile`.
+
+        Raises ZeroDivisionError when `n` is 0.
+        '''
+        self.time_profile.reset()
+        for _ in range(n):
+            with self.time_profile:
+                _ = self(blob)
+
+        return self.time_profile.total / n
+
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -0,0 +1,83 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+import numpy as np
+from PIL import Image
+from collections import OrderedDict
+
+import onnx
+import torch 
+import onnx_graphsurgeon
+
+
+def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
+    '''Dump an image as a raw float32 tensor file for trtexec.
+
+    Intended for: --loadInputs='image:input_tensor.bin'
+
+    Args:
+        path: image file to read.
+        size: size passed to `Image.resize`.
+        output_name: file that receives the raw bytes via `ndarray.tofile`.
+
+    The written array has a leading batch axis of 1, channels before the two
+    spatial axes, and values divided by 255.
+    '''
+    # Force three channels so grayscale / palette / RGBA files still transpose
+    # to a 3-channel tensor, and close the file handle deterministically.
+    with Image.open(path) as src:
+        im = src.convert('RGB').resize(size)
+
+    data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
+    data.tofile(output_name)
+
+
+def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False,
+                    output_path='yolo_w_nms.onnx'):
+    '''Append a TensorRT `EfficientNMS_TRT` node to an ONNX detector and save the result.
+
+    The first two outputs of the loaded graph become the inputs of the NMS
+    node, and the graph outputs are replaced by 'num_dets', 'det_boxes',
+    'det_scores' and 'det_classes'.
+
+    References:
+    http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html
+    https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py
+
+    Args:
+        path: ONNX model to load; it must expose at least two graph outputs.
+        score_threshold: `score_threshold` attribute of the NMS node.
+        iou_threshold: `iou_threshold` attribute of the NMS node.
+        max_output_boxes: `max_output_boxes` attribute, also used as the second
+            dimension of the detection outputs.
+        simplify: run `onnxsim.simplify` (with the 'image' input fixed to
+            1x3x640x640) before editing the graph.
+        output_path: where the modified model is written. Defaults to the
+            previously hard-coded 'yolo_w_nms.onnx'.
+    '''
+    onnx_model = onnx.load(path)
+
+    if simplify:
+        # Imported under an alias so the boolean `simplify` flag is not rebound.
+        from onnxsim import simplify as onnxsim_simplify
+        onnx_model, _ = onnxsim_simplify(onnx_model,  overwrite_input_shapes={'image': [1, 3, 640, 640]})
+
+    graph = onnx_graphsurgeon.import_onnx(onnx_model)
+    graph.toposort()
+    graph.fold_constants()
+    graph.cleanup()
+
+    topk = max_output_boxes
+    # NOTE(review): box_coding=0 presumably selects corner-format boxes and
+    # score_activation=False presumably means scores are already activated -- confirm
+    # against the plugin documentation.
+    attrs = OrderedDict(plugin_version='1',
+                        background_class=-1,
+                        max_output_boxes=topk,
+                        score_threshold=score_threshold,
+                        iou_threshold=iou_threshold,
+                        score_activation=False,
+                        box_coding=0, )
+
+    outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]),
+               onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]),
+               onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]),
+               onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])]
+
+    graph.layer(op='EfficientNMS_TRT',
+                name="batched_nms",
+                inputs=[graph.outputs[0],
+                        graph.outputs[1]],
+                outputs=outputs,
+                attrs=attrs, )
+
+    # Only the NMS results remain visible as graph outputs.
+    graph.outputs = outputs
+    graph.cleanup().toposort()
+
+    onnx.save(onnx_graphsurgeon.export_onnx(graph), output_path)
+
+
+class TimeProfiler(contextlib.ContextDecorator):
+    '''Accumulate elapsed seconds over repeated `with` blocks or decorated calls.
+
+    Attributes:
+        total: sum of the durations of all completed blocks since construction
+            or the last `reset()`.
+        start: timestamp taken by the most recent `__enter__`.
+    '''
+
+    def __init__(self, ):
+        self.total = 0
+
+    def __enter__(self, ):
+        self.start = self.time()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        # Returns None, so exceptions raised inside the block propagate.
+        self.total += self.time() - self.start
+
+    def reset(self, ):
+        '''Clear the accumulated total.'''
+        self.total = 0
+
+    def time(self, ):
+        '''Return a timestamp in seconds, taken after pending CUDA work has finished.
+
+        The value comes from `time.perf_counter()`, a monotonic high-resolution
+        clock, so only differences between two calls are meaningful. A
+        wall-clock source could jump (e.g. NTP adjustment) during a measurement.
+        '''
+        if torch.cuda.is_available():
+            # Kernels are launched asynchronously; wait so the timestamp covers them.
+            torch.cuda.synchronize()
+        return time.perf_counter()
--- a/benchmark/yolov8_onnx.py
+++ b/benchmark/yolov8_onnx.py
@@ -0,0 +1,73 @@
+'''by lyuwenyu
+'''
+
+import torch 
+import torchvision
+
+import numpy as np 
+import onnxruntime as ort 
+
+from utils import yolo_insert_nms
+
+class YOLOv8(torch.nn.Module):
+    '''Wrapper around an ultralytics YOLO model that returns (boxes, scores).
+
+    Args:
+        name: model name; the weights are loaded from `{name}.pt`.
+    '''
+    def __init__(self, name) -> None:
+        super().__init__()
+        # Imported lazily so the module can be imported without ultralytics.
+        from ultralytics import YOLO
+        # Load a model
+        # build a new model from scratch
+        # model = YOLO(f'{name}.yaml')  
+
+        # load a pretrained model (recommended for training)
+        model = YOLO(f'{name}.pt')  
+        # Keep only the underlying network held by the YOLO object.
+        self.model = model.model
+
+    def forward(self, x):
+        '''Split the raw prediction into boxes and per-class scores.
+
+        See https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216
+
+        Returns:
+            (boxes, scores): boxes converted from 'cxcywh' to 'xyxy' (last
+            dimension 4) and the remaining `nc` channels as scores.
+        '''
+        # First element of the model output; per the original note its layout
+        # is (n, 84, 8400) -- TODO confirm for other model variants.
+        pred: torch.Tensor = self.model(x)[0] # n 84 8400,
+        # Move the channel axis last so that each row describes one candidate.
+        pred = pred.permute(0, 2, 1)
+        # The first 4 channels are box coordinates; the rest are class scores.
+        nc = pred.shape[-1] - 4
+        boxes, scores = pred.split([4, nc], dim=-1)
+        boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
+
+        return boxes, scores
+
+
+
+def export_onnx(name='yolov8n'):
+    '''Export the wrapped model to `{name}.onnx`, then smoke-test the file.
+
+    The export uses a random 1x3x640x640 input named 'image', outputs named
+    'boxes' and 'scores', opset 13, and declares only axis 0 of 'image' as
+    dynamic. Afterwards one random batch is run through ONNX Runtime; its
+    result is discarded.
+    '''
+    model = YOLOv8(name)
+    onnx_path = f'{name}.onnx'
+
+    torch.onnx.export(
+        model,
+        torch.rand(1, 3, 640, 640),
+        onnx_path,
+        input_names=['image'],
+        output_names=['boxes', 'scores'],
+        opset_version=13,
+        dynamic_axes={'image': {0: '-1'}},
+    )
+
+    # Loading and running the file once checks that the export is usable.
+    dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
+    session = ort.InferenceSession(onnx_path)
+    _ = session.run(output_names=None, input_feed={'image': dummy})
+
+
+if __name__ == '__main__':
+
+    import argparse
+
+    # Command-line options as (flag, type, default).
+    cli = argparse.ArgumentParser()
+    for flag, kind, default in (
+        ('--name', str, 'yolov8l'),
+        ('--score_threshold', float, 0.001),
+        ('--iou_threshold', float, 0.7),
+        ('--max_output_boxes', int, 300),
+    ):
+        cli.add_argument(flag, type=kind, default=default)
+    opts = cli.parse_args()
+
+    # Step 1: export `{name}.onnx`.
+    export_onnx(name=opts.name)
+
+    # Step 2: append the NMS node to the exported file.
+    exported_file = f'{opts.name}.onnx'
+    yolo_insert_nms(
+        path=exported_file,
+        score_threshold=opts.score_threshold,
+        iou_threshold=opts.iou_threshold,
+        max_output_boxes=opts.max_output_boxes,
+    )
+