first commit

This commit is contained in:
陈赣
2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions

59
benchmark/README.md Normal file
View File

@@ -0,0 +1,59 @@
# 论文测速使用的部分代码和工具
## 测试YOLO系列的速度 [in progress]
以[yolov8](https://github.com/ultralytics/ultralytics)为例
<details open>
<summary>1. 转onnx </summary>
执行`yolov8_onnx.py`中的`export_onnx`函数,新增代码主要涉及输出格式的转换
</details>
<details>
<summary>2. 插入nms </summary>
使用`utils.py`中的`yolo_insert_nms`函数插入NMS节点并导出onnx模型, 之后可以使用[Netron](https://netron.app/)查看模型结构. <img width="924" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/cb466483-d3a3-4f23-a68d-7ab8825059c8">
</details>
<details>
<summary>3. 转tensorrt </summary>
可以使用`trtexec.md`中的脚本转换,或者使用`utils.py`中的Python代码转换
```bash
# trtexec -h
trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
```
</details>
<details>
<summary>4. trtexec测速 </summary>
可以使用`trtexec.md`中的脚本,去掉`--buildOnly`参数即可测速
</details>
<details>
<summary>5. profile分析(可选) </summary>
在第4步测速命令的前面加上以下命令
```bash
nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms
```
可以使用nsys可视化分析
<img width="1090" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/507d8bde-9e7c-4ae5-b571-976c540ef2c6">
</details>
<details>
<summary>6. Python测速或者部署 </summary>
使用`trtinfer.py`中的代码进行推理, 在COCO val数据集上测试模型的平均速度
</details>

102
benchmark/dataset.py Normal file
View File

@@ -0,0 +1,102 @@
'''by lyuwenyu
'''
import os
import glob
from PIL import Image
import torch
import torch.utils.data as data
import torchvision
import torchvision.transforms as T
import torchvision.transforms.functional as F
class ToTensor(T.ToTensor):
    """`T.ToTensor` variant that lets `torch.Tensor` inputs pass through untouched.

    Any input that is not already a tensor is handed to the parent transform.
    """

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, pic):
        """Return `pic` unchanged if it is a tensor, else the parent's conversion."""
        if not isinstance(pic, torch.Tensor):
            return super().__call__(pic)
        return pic
class PadToSize(T.Pad):
    """Pad an image on its right and bottom edges up to a target size.

    Args:
        size: Target ``(width, height)``. A single int is treated as a square
            ``(size, size)`` target.
        fill: Fill value forwarded to ``F.pad``.
        padding_mode: Padding mode forwarded to ``F.pad``.
    """

    def __init__(self, size, fill=0, padding_mode='constant'):
        # The effective padding depends on each image, so the parent transform
        # is initialised with a placeholder padding of 0.
        super().__init__(0, fill, padding_mode)
        if isinstance(size, int):
            size = (size, size)
        self.size = size
        self.fill = fill

    def __call__(self, img):
        """
        Args:
            img (PIL Image or Tensor): Image to be padded.
        Returns:
            PIL Image or Tensor: Padded image.
        """
        w, h = F.get_image_size(img)
        # (left, top, right, bottom): only the right and bottom edges grow, so
        # the original pixels stay anchored at the top-left corner.
        # NOTE(review): an image larger than the target produces negative
        # padding values here; confirm that callers always resize first.
        padding = (0, 0, self.size[0] - w, self.size[1] - h)
        return F.pad(img, padding, self.fill, self.padding_mode)
class Dataset(data.Dataset):
    """Dataset over the ``*.jpg`` files of a directory, decoded with torchvision.

    Args:
        img_dir: Directory that is globbed (non-recursively) for ``*.jpg``.
        preprocess: Transform applied to every decoded image. When ``None``, a
            default pipeline is used: resize, pad to 640x640 with fill 114,
            and convert to float.
        device: Device passed to ``torchvision.io.decode_jpeg``.
    """

    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:
        super().__init__()

        self.device = device
        self.size = 640

        # glob returns files in a filesystem-dependent order; sorting keeps the
        # index -> image mapping reproducible between runs and machines.
        self.im_path_list = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))

        if preprocess is None:
            preprocess = T.Compose([
                # size must be strictly smaller than max_size, hence 639/640
                T.Resize(size=639, max_size=640),
                PadToSize(size=(640, 640), fill=114),
                ToTensor(),
                T.ConvertImageDtype(torch.float),
            ])
        self.preprocess = preprocess

    def __len__(self, ):
        return len(self.im_path_list)

    def __getitem__(self, index):
        """Decode image `index` and return it together with its size metadata.

        Returns:
            dict with keys 'image' (preprocessed image), 'im_shape'
            (``[size, size]``), 'scale_factor' (``[size / h, size / w]``) and
            'orig_size' (``[w, h]``), where ``h``/``w`` are taken from the
            decoded image before preprocessing.
        """
        im = torchvision.io.read_file(self.im_path_list[index])
        im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device)
        _, h, w = im.shape  # c,h,w

        im = self.preprocess(im)

        # Build the metadata tensors directly on the image's device rather
        # than on the CPU followed by a transfer.
        target_device = im.device
        blob = {
            'image': im,
            'im_shape': torch.tensor([self.size, self.size], device=target_device),
            'scale_factor': torch.tensor([self.size / h, self.size / w], device=target_device),
            'orig_size': torch.tensor([w, h], device=target_device),
        }
        return blob

    @staticmethod
    def post_process():
        # Placeholder; intentionally does nothing.
        pass

    @staticmethod
    def collate_fn():
        # Placeholder; intentionally does nothing.
        pass
def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):
    '''Draw the boxes scoring above the threshold and save one JPEG per image.

    Keys read from `outputs`:
        'det_boxes', 'det_scores'
    Each image of `blob['image']` is written to `test_{name}_{i}.jpg`.
    '''
    for idx, image in enumerate(blob['image']):
        scores = outputs['det_scores'][idx]
        keep = scores > draw_score_threshold
        boxes = outputs['det_boxes'][idx][keep]

        # scale by 255 and cast to uint8 before drawing
        canvas = (image * 255).to(torch.uint8)
        canvas = torchvision.utils.draw_bounding_boxes(canvas, boxes=boxes, width=2)

        # channels-first -> channels-last for PIL
        pixels = canvas.permute(1, 2, 0).cpu().numpy()
        Image.fromarray(pixels).save(f'test_{name}_{idx}.jpg')

13
benchmark/trtexec.md Normal file
View File

@@ -0,0 +1,13 @@
```bash
# build tensorrt engine
trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
# using dynamic shapes
# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640 --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640
# timeline
nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin'
# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880
```

153
benchmark/trtinfer.py Normal file
View File

@@ -0,0 +1,153 @@
'''by lyuwenyu
'''
import time
import contextlib
from collections import namedtuple, OrderedDict
import torch
import numpy as np
import tensorrt as trt
from utils import TimeProfiler
class TRTInference(object):
    """Run a serialized TensorRT engine with torch- or CUDA-driver-managed buffers.

    Args:
        engine_path: Path of the serialized engine file.
        device: Device on which the torch backend allocates its buffers.
        backend: 'torch' (bindings are torch tensors) or 'cuda' (bindings are
            allocated through the `cuda` driver module).
        max_batch_size: Batch size substituted for a dynamic (-1) leading
            dimension when the buffers are allocated.
        verbose: Use a VERBOSE TensorRT logger instead of INFO.

    NOTE(review): the 'cuda' backend relies on a module-level name `cuda`
    that is not imported in the visible part of this file (presumably
    `pycuda.driver`) - confirm it is imported before using that backend.
    """

    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
        self.engine_path = engine_path
        self.device = device
        self.backend = backend
        self.max_batch_size = max_batch_size

        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)

        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
        # name -> device address, kept in the engine's tensor order
        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())

        self.input_names = self.get_input_names()
        self.output_names = self.get_output_names()

        if self.backend == 'cuda':
            self.stream = cuda.Stream()

        self.time_profile = TimeProfiler()

    def init(self, ):
        """Reset the dynamic-shape flag."""
        self.dynamic = False

    def load_engine(self, path):
        '''Deserialize the engine stored at `path` (TensorRT plugins are registered first).
        '''
        trt.init_libnvinfer_plugins(self.logger, '')
        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _names_with_mode(self, mode):
        """Return the engine tensor names whose I/O mode equals `mode`."""
        return [name for name in self.engine if self.engine.get_tensor_mode(name) == mode]

    def get_input_names(self, ):
        """Return the names of the engine's input tensors."""
        return self._names_with_mode(trt.TensorIOMode.INPUT)

    def get_output_names(self, ):
        """Return the names of the engine's output tensors."""
        return self._names_with_mode(trt.TensorIOMode.OUTPUT)

    def get_bindings(self, engine, context, max_batch_size=32, device=None):
        '''Allocate one buffer per engine tensor.

        Returns an OrderedDict mapping tensor name to a
        `Binding(name, dtype, shape, data, ptr)`. A leading dimension of -1
        is replaced by `max_batch_size`; whether any such dimension was seen
        is recorded in `self.dynamic`.
        '''
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        bindings = OrderedDict()
        dynamic = False

        for name in engine:
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))
            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT

            if shape[0] == -1:
                dynamic = True
                shape[0] = max_batch_size
                if is_input:
                    # dynamic inputs need a concrete shape on the context
                    context.set_input_shape(name, shape)

            if self.backend == 'cuda':
                if is_input:
                    data = np.random.randn(*shape).astype(dtype)
                else:
                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
                ptr = cuda.mem_alloc(data.nbytes)
                bindings[name] = Binding(name, dtype, shape, data, ptr)
            else:
                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())

        self.dynamic = dynamic
        return bindings

    def run_torch(self, blob):
        '''Run the engine on torch tensors.

        `blob` maps every input name to a tensor. The tensors' own storage is
        used as the input binding; outputs are the pre-allocated output buffers.
        '''
        for n in self.input_names:
            if self.bindings[n].shape != blob[n].shape:
                self.context.set_input_shape(n, blob[n].shape)
                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)

        # point the input bindings at the caller's tensors
        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
        self.context.execute_v2(list(self.bindings_addr.values()))
        outputs = {n: self.bindings[n].data for n in self.output_names}
        return outputs

    def async_run_cuda(self, blob):
        '''Run the engine on host arrays through the `cuda` driver module.

        Inputs are copied host-to-device, the engine is executed on
        `self.stream`, outputs are copied back and the stream is synchronized
        before returning.
        '''
        for n in self.input_names:
            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)

        bindings_addr = [int(ptr) for ptr in self.bindings_addr.values()]
        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)

        outputs = {}
        for n in self.output_names:
            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
            outputs[n] = self.bindings[n].data

        self.stream.synchronize()
        return outputs

    def __call__(self, blob):
        """Dispatch `blob` to the runner of the configured backend.

        Raises:
            ValueError: if `self.backend` is neither 'torch' nor 'cuda'.
        """
        if self.backend == 'torch':
            return self.run_torch(blob)
        if self.backend == 'cuda':
            return self.async_run_cuda(blob)
        # Fail loudly instead of silently returning None.
        raise ValueError(f"unsupported backend {self.backend!r}; expected 'torch' or 'cuda'")

    def synchronize(self, ):
        """Wait for pending work of the configured backend."""
        if self.backend == 'torch' and torch.cuda.is_available():
            torch.cuda.synchronize()
        elif self.backend == 'cuda':
            self.stream.synchronize()

    def warmup(self, blob, n):
        """Run `blob` through the engine `n` times, discarding the outputs."""
        for _ in range(n):
            _ = self(blob)

    def speed(self, blob, n):
        """Return the mean time in seconds of `n` timed runs on `blob`."""
        self.time_profile.reset()
        for _ in range(n):
            with self.time_profile:
                _ = self(blob)
        return self.time_profile.total / n

83
benchmark/utils.py Normal file
View File

@@ -0,0 +1,83 @@
'''by lyuwenyu
'''
import time
import contextlib
import numpy as np
from PIL import Image
from collections import OrderedDict
import onnx
import torch
import onnx_graphsurgeon
def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
    '''Dump an image as a raw float32 tensor file for trtexec.

    Usage with trtexec: --loadInputs='image:input_tensor.bin'

    The image is converted to RGB, resized to `size`, laid out as
    (1, 3, H, W), scaled by 1/255 and written to `output_name`.
    '''
    # The context manager closes the file handle. convert('RGB') guarantees
    # three channels so the HWC -> CHW transpose works for grayscale and
    # RGBA inputs too.
    with Image.open(path) as im:
        im = im.convert('RGB').resize(size)
    data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
    data.tofile(output_name)
def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False,
                    output_path='yolo_w_nms.onnx'):
    '''Append an EfficientNMS_TRT node to an ONNX model and save the result.

    The model's first two graph outputs are wired into the NMS node, and the
    graph outputs are replaced by 'num_dets', 'det_boxes', 'det_scores' and
    'det_classes'.

    Args:
        path: ONNX model to load.
        score_threshold: `score_threshold` attribute of the NMS node.
        iou_threshold: `iou_threshold` attribute of the NMS node.
        max_output_boxes: `max_output_boxes` attribute of the NMS node, also
            used as the box dimension of the declared outputs.
        simplify: Run onnxsim on the model before editing the graph.
        output_path: Where the modified model is written.

    http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html
    https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py
    '''
    onnx_model = onnx.load(path)

    if simplify:
        # Imported under an alias so it does not rebind the `simplify` flag.
        from onnxsim import simplify as onnx_simplify
        onnx_model, _ = onnx_simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]})

    graph = onnx_graphsurgeon.import_onnx(onnx_model)
    graph.toposort()
    graph.fold_constants()
    graph.cleanup()

    topk = max_output_boxes
    attrs = OrderedDict(plugin_version='1',
                        background_class=-1,
                        max_output_boxes=topk,
                        score_threshold=score_threshold,
                        iou_threshold=iou_threshold,
                        score_activation=False,
                        box_coding=0, )

    outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]),
               onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]),
               onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]),
               onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])]

    graph.layer(op='EfficientNMS_TRT',
                name="batched_nms",
                inputs=[graph.outputs[0],
                        graph.outputs[1]],
                outputs=outputs,
                attrs=attrs, )

    graph.outputs = outputs
    graph.cleanup().toposort()

    onnx.save(onnx_graphsurgeon.export_onnx(graph), output_path)
class TimeProfiler(contextlib.ContextDecorator):
    """Accumulate elapsed time over repeated `with` blocks or decorated calls.

    `total` holds the summed duration in seconds since construction or the
    last `reset()`.
    """

    def __init__(self, ):
        self.total = 0

    def __enter__(self, ):
        self.start = self.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.total += self.time() - self.start

    def reset(self, ):
        """Zero the accumulated total."""
        self.total = 0

    def time(self, ):
        """Return a clock reading in seconds, taken after a CUDA sync if available."""
        if torch.cuda.is_available():
            # wait for torch.cuda.synchronize() before reading the clock
            torch.cuda.synchronize()
        # perf_counter is monotonic, so measured intervals cannot go negative
        # when the system clock is adjusted (unlike time.time()).
        return time.perf_counter()

73
benchmark/yolov8_onnx.py Normal file
View File

@@ -0,0 +1,73 @@
'''by lyuwenyu
'''
import torch
import torchvision
import numpy as np
import onnxruntime as ort
from utils import yolo_insert_nms
class YOLOv8(torch.nn.Module):
    """Wrap a pretrained ultralytics YOLO model so it returns `(boxes, scores)`."""

    def __init__(self, name) -> None:
        super().__init__()
        from ultralytics import YOLO

        # load the pretrained checkpoint `{name}.pt` and keep its inner model
        self.model = YOLO(f'{name}.pt').model

    def forward(self, x):
        '''Split the first model output into xyxy boxes and per-class scores.

        https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216
        '''
        # the original comment describes this output as "n 84 8400"
        raw: torch.Tensor = self.model(x)[0]
        raw = raw.permute(0, 2, 1)

        # the first 4 channels are box coordinates, the remainder are scores
        num_classes = raw.shape[-1] - 4
        cxcywh, scores = raw.split([4, num_classes], dim=-1)
        xyxy = torchvision.ops.box_convert(cxcywh, in_fmt='cxcywh', out_fmt='xyxy')
        return xyxy, scores
def export_onnx(name='yolov8n'):
    '''Export the wrapped model to `{name}.onnx`, then run it once with onnxruntime.
    '''
    wrapper = YOLOv8(name)
    dummy_input = torch.rand(1, 3, 640, 640)

    torch.onnx.export(
        wrapper,
        dummy_input,
        f'{name}.onnx',
        input_names=['image'],
        output_names=['boxes', 'scores'],
        opset_version=13,
        # axis 0 of 'image' is exported as a dynamic axis
        dynamic_axes={'image': {0: '-1'}},
    )

    # run one random input through the exported file
    feed = {'image': np.random.rand(1, 3, 640, 640).astype(np.float32)}
    session = ort.InferenceSession(f'{name}.onnx')
    _ = session.run(output_names=None, input_feed=feed)
if __name__ == '__main__':
    import argparse

    # command-line options: (flag, type, default)
    parser = argparse.ArgumentParser()
    for flag, kind, default in (
        ('--name', str, 'yolov8l'),
        ('--score_threshold', float, 0.001),
        ('--iou_threshold', float, 0.7),
        ('--max_output_boxes', int, 300),
    ):
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    # export `{name}.onnx`, then add the NMS node to that file
    export_onnx(name=args.name)
    yolo_insert_nms(
        path=f'{args.name}.onnx',
        score_threshold=args.score_threshold,
        iou_threshold=args.iou_threshold,
        max_output_boxes=args.max_output_boxes,
    )