first commit
This commit is contained in:
59
benchmark/README.md
Normal file
59
benchmark/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# 论文测速使用的部分代码和工具
|
||||
|
||||
|
||||
## 测试YOLO系列的速度 [in progress]
|
||||
以[yolov8](https://github.com/ultralytics/ultralytics)为例
|
||||
|
||||
<details open>
|
||||
<summary>1. 转onnx </summary>
|
||||
|
||||
执行`yolov8_onnx.py`中的`export_onnx`函数,新增代码主要涉及输出格式的转换
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>2. 插入nms </summary>
|
||||
|
||||
使用`utils.py`中的`yolo_insert_nms`函数,导出onnx模型后使用[Netron](https://netron.app/)查看结构. <img width="924" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/cb466483-d3a3-4f23-a68d-7ab8825059c8">
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>3. 转tensorrt </summary>
|
||||
|
||||
可以使用`trtexec.md`中的脚本转换,或者使用`utils.py`中的Python代码转换
|
||||
```bash
|
||||
# trtexec -h
|
||||
trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>4. trtexec测速 </summary>
|
||||
|
||||
可以使用`trtexec.md`中的脚本,去掉`--buildOnly`参数即可测速
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
<details>
|
||||
<summary>5. profile分析(可选) </summary>
|
||||
|
||||
在4的基础之上加以下命令
|
||||
```bash
|
||||
nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms
|
||||
```
|
||||
可以使用nsys可视化分析
|
||||
<img width="1090" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/507d8bde-9e7c-4ae5-b571-976c540ef2c6">
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary>6. Python测速或者部署 </summary>
|
||||
|
||||
在COCO val数据集上测模型的平均速度,使用`trtinfer.py`中的代码推理
|
||||
|
||||
</details>
|
||||
102
benchmark/dataset.py
Normal file
102
benchmark/dataset.py
Normal file
@@ -0,0 +1,102 @@
|
||||
'''by lyuwenyu
|
||||
'''
|
||||
|
||||
import os
|
||||
import glob
|
||||
from PIL import Image
|
||||
|
||||
import torch
|
||||
import torch.utils.data as data
|
||||
import torchvision
|
||||
import torchvision.transforms as T
|
||||
import torchvision.transforms.functional as F
|
||||
|
||||
|
||||
class ToTensor(T.ToTensor):
    '''``T.ToTensor`` variant that passes through inputs which are already tensors.
    '''

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, pic):
        '''Return ``pic`` unchanged if it is a ``torch.Tensor``; otherwise defer to the parent transform.
        '''
        already_tensor = isinstance(pic, torch.Tensor)
        return pic if already_tensor else super().__call__(pic)
|
||||
|
||||
class PadToSize(T.Pad):
    '''Pad an image on its right and bottom edges up to a target size.

    ``size`` is indexed as ``(width, height)``: ``size[0]`` is compared with the
    image width and ``size[1]`` with the image height.
    '''

    def __init__(self, size, fill=0, padding_mode='constant'):
        # The parent is initialised with zero padding; the actual padding is
        # computed per image in ``__call__``.
        super().__init__(0, fill, padding_mode)
        self.fill = fill
        self.size = size

    def __call__(self, img):
        '''Pad ``img`` so that it reaches ``self.size``.

        Args:
            img (PIL Image or Tensor): Image to be padded.

        Returns:
            PIL Image or Tensor: Padded image.
        '''
        width, height = F.get_image_size(img)
        pad_right = self.size[0] - width
        pad_bottom = self.size[1] - height
        # 4-tuple padding is (left, top, right, bottom) per torchvision's pad
        # convention, so only the right/bottom edges grow here.
        return F.pad(img, (0, 0, pad_right, pad_bottom), self.fill, self.padding_mode)
|
||||
|
||||
|
||||
class Dataset(data.Dataset):
    '''Dataset over the ``*.jpg`` files found directly inside ``img_dir``.

    Each item is read from disk, JPEG-decoded on ``device`` via torchvision and
    passed through ``preprocess``. Items are dicts with the keys ``'image'``,
    ``'im_shape'``, ``'scale_factor'`` and ``'orig_size'``.

    Args:
        img_dir: Directory that is globbed for ``*.jpg`` files (non-recursive).
        preprocess: Transform applied to the decoded image. When ``None`` a
            default resize / pad-to-640x640 / float-conversion pipeline is used.
        device: Device handed to ``torchvision.io.decode_jpeg``.
    '''

    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:
        super().__init__()

        self.device = device
        self.size = 640

        # glob yields paths in arbitrary (filesystem) order; sorting keeps the
        # index -> image mapping reproducible across runs and machines.
        self.im_path_list = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))

        if preprocess is None:
            preprocess = T.Compose([
                T.Resize(size=639, max_size=640),
                PadToSize(size=(640, 640), fill=114),
                ToTensor(),
                T.ConvertImageDtype(torch.float),
            ])
        self.preprocess = preprocess

    def __len__(self, ):
        '''Number of jpg files found in ``img_dir``.'''
        return len(self.im_path_list)

    def __getitem__(self, index):
        '''Load, decode and preprocess the image at ``index``.

        Returns:
            dict: ``image`` (preprocessed tensor), ``im_shape`` (``[size, size]``),
            ``scale_factor`` (``[size / h, size / w]``) and ``orig_size``
            (``[w, h]``), where ``h`` / ``w`` are taken from the decoded image
            before preprocessing. All tensors live on the image's device.
        '''
        encoded = torchvision.io.read_file(self.im_path_list[index])
        im = torchvision.io.decode_jpeg(encoded, mode=torchvision.io.ImageReadMode.RGB, device=self.device)
        _, h, w = im.shape  # c, h, w

        im = self.preprocess(im)

        # NOTE(review): scale_factor is computed independently per axis from
        # self.size; confirm this matches what consumers expect when the
        # preprocess pipeline resizes with a preserved aspect ratio and pads.
        blob = {
            'image': im,
            'im_shape': torch.tensor([self.size, self.size]).to(im.device),
            'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device),
            'orig_size': torch.tensor([w, h]).to(im.device),
        }

        return blob

    @staticmethod
    def post_process():
        '''Placeholder; not implemented.'''
        pass

    @staticmethod
    def collate_fn():
        '''Placeholder; not implemented.'''
        pass
|
||||
|
||||
|
||||
def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):
    '''Draw the detections of every image in the batch and save them as jpg files.

    For each image ``i`` in ``blob['image']`` the boxes whose score exceeds
    ``draw_score_threshold`` are drawn and the result is written to
    ``test_{name}_{i}.jpg`` in the current directory.

    Keys listed for ``outputs`` by the original author:
        'num_dets', 'det_boxes', 'det_scores', 'det_classes'
    (only 'det_boxes' and 'det_scores' are read here).
    '''
    batch_size = blob['image'].shape[0]
    for idx in range(batch_size):
        scores = outputs['det_scores'][idx]
        candidate_boxes = outputs['det_boxes'][idx]
        kept_boxes = candidate_boxes[scores > draw_score_threshold]

        # scale to 0..255 and cast to uint8 before drawing
        canvas = (blob['image'][idx] * 255).to(torch.uint8)
        canvas = torchvision.utils.draw_bounding_boxes(canvas, boxes=kept_boxes, width=2)

        # c,h,w -> h,w,c numpy array for PIL
        hwc = canvas.permute(1, 2, 0).cpu().numpy()
        Image.fromarray(hwc).save(f'test_{name}_{idx}.jpg')
|
||||
13
benchmark/trtexec.md
Normal file
13
benchmark/trtexec.md
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
```bash
|
||||
# build tensorrt engine
|
||||
trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
|
||||
|
||||
# using dynamic shapes
|
||||
# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640 --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640
|
||||
|
||||
# timeline
|
||||
nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin'
|
||||
|
||||
# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880
|
||||
```
|
||||
153
benchmark/trtinfer.py
Normal file
153
benchmark/trtinfer.py
Normal file
@@ -0,0 +1,153 @@
|
||||
'''by lyuwenyu
|
||||
'''
|
||||
|
||||
import time
|
||||
import contextlib
|
||||
from collections import namedtuple, OrderedDict
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import tensorrt as trt
|
||||
|
||||
from utils import TimeProfiler
|
||||
|
||||
class TRTInference(object):
    '''Load a serialized TensorRT engine and run it.

    Two backends are supported:

    * ``'torch'``: inputs are torch tensors; their ``data_ptr()`` is handed to
      TensorRT and outputs are returned as the pre-allocated torch tensors.
    * ``'cuda'``: inputs are host arrays copied through a ``cuda`` stream.

    NOTE(review): the name ``cuda`` is used by the ``'cuda'`` backend but is not
    imported in this module as shown; presumably it is ``pycuda.driver`` —
    confirm and import it before using ``backend='cuda'``.

    Args:
        engine_path: Path of the serialized engine file.
        device: Device on which the torch binding buffers are allocated.
        backend: ``'torch'`` or ``'cuda'``.
        max_batch_size: Batch size substituted for a leading ``-1`` dimension
            when allocating binding buffers.
        verbose: Use the TensorRT ``VERBOSE`` logger instead of ``INFO``.
    '''

    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
        self.engine_path = engine_path
        self.device = device
        self.backend = backend
        self.max_batch_size = max_batch_size

        # Set to True by get_bindings when a tensor has a leading -1 dimension.
        self.dynamic = False

        severity = trt.Logger.VERBOSE if verbose else trt.Logger.INFO
        self.logger = trt.Logger(severity)

        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())

        self.input_names = self.get_input_names()
        self.output_names = self.get_output_names()

        if self.backend == 'cuda':
            self.stream = cuda.Stream()

        self.time_profile = TimeProfiler()

    def init(self, ):
        '''Reset the dynamic-shape flag to ``False``.'''
        self.dynamic = False

    def load_engine(self, path):
        '''Deserialize and return the engine stored at ``path``.

        TensorRT plugins are initialised first.
        '''
        trt.init_libnvinfer_plugins(self.logger, '')
        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _tensor_names(self, mode):
        '''Return the engine tensor names whose I/O mode equals ``mode``.'''
        return [name for name in self.engine if self.engine.get_tensor_mode(name) == mode]

    def get_input_names(self, ):
        '''Return the names of the engine's input tensors, in engine order.'''
        return self._tensor_names(trt.TensorIOMode.INPUT)

    def get_output_names(self, ):
        '''Return the names of the engine's output tensors, in engine order.'''
        return self._tensor_names(trt.TensorIOMode.OUTPUT)

    def get_bindings(self, engine, context, max_batch_size=32, device=None):
        '''Allocate one buffer per engine tensor.

        A leading ``-1`` dimension is replaced by ``max_batch_size`` (and, for
        inputs, pushed to the execution context) before allocating.

        Returns:
            OrderedDict: tensor name -> ``Binding(name, dtype, shape, data, ptr)``.
        '''
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        bindings = OrderedDict()

        for name in engine:
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))
            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT

            # len() guard: a zero-rank tensor has no leading dimension to test.
            if len(shape) > 0 and shape[0] == -1:
                self.dynamic = True
                shape[0] = max_batch_size
                if is_input:
                    context.set_input_shape(name, shape)

            if self.backend == 'cuda':
                if is_input:
                    data = np.random.randn(*shape).astype(dtype)
                else:
                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
                ptr = cuda.mem_alloc(data.nbytes)
            else:
                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                ptr = data.data_ptr()

            bindings[name] = Binding(name, dtype, shape, data, ptr)

        return bindings

    def run_torch(self, blob):
        '''Run the engine on torch inputs.

        Args:
            blob: Mapping from input name to torch tensor. The tensors'
                ``data_ptr()`` is passed to TensorRT directly.

        Returns:
            dict: output name -> pre-allocated output tensor (reused between calls).
        '''
        for n in self.input_names:
            new_shape = blob[n].shape
            # Compare as plain tuples so the check does not depend on how
            # trt.Dims and torch.Size compare with each other.
            if tuple(self.bindings[n].shape) != tuple(new_shape):
                self.context.set_input_shape(n, new_shape)
                self.bindings[n] = self.bindings[n]._replace(shape=new_shape)
            self.bindings_addr[n] = blob[n].data_ptr()

        self.context.execute_v2(list(self.bindings_addr.values()))

        return {n: self.bindings[n].data for n in self.output_names}

    def async_run_cuda(self, blob):
        '''Run the engine on host (numpy) inputs through ``self.stream``.

        Inputs are copied host-to-device, the engine is executed, outputs are
        copied device-to-host and the stream is synchronized before returning.
        '''
        for n in self.input_names:
            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)

        bindings_addr = [int(v) for v in self.bindings_addr.values()]
        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)

        outputs = {}
        for n in self.output_names:
            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
            outputs[n] = self.bindings[n].data

        self.stream.synchronize()

        return outputs

    def __call__(self, blob):
        '''Dispatch ``blob`` to the runner matching ``self.backend``.

        Raises:
            ValueError: if ``self.backend`` is neither ``'torch'`` nor ``'cuda'``.
        '''
        if self.backend == 'torch':
            return self.run_torch(blob)
        if self.backend == 'cuda':
            return self.async_run_cuda(blob)
        # Previously fell through and returned None, hiding the misconfiguration.
        raise ValueError(f"unsupported backend: {self.backend!r} (expected 'torch' or 'cuda')")

    def synchronize(self, ):
        '''Wait for outstanding device work of the active backend.'''
        if self.backend == 'torch' and torch.cuda.is_available():
            torch.cuda.synchronize()

        elif self.backend == 'cuda':
            self.stream.synchronize()

    def warmup(self, blob, n):
        '''Run ``blob`` through the engine ``n`` times, discarding the results.'''
        for _ in range(n):
            _ = self(blob)

    def speed(self, blob, n):
        '''Return the mean time per call over ``n`` runs of ``blob``.

        The time profiler is reset first, so only these ``n`` runs are counted.

        Raises:
            ValueError: if ``n`` is not positive.
        '''
        if n <= 0:
            raise ValueError('n must be a positive integer')

        self.time_profile.reset()
        for _ in range(n):
            with self.time_profile:
                _ = self(blob)

        return self.time_profile.total / n
|
||||
|
||||
83
benchmark/utils.py
Normal file
83
benchmark/utils.py
Normal file
@@ -0,0 +1,83 @@
|
||||
'''by lyuwenyu
|
||||
'''
|
||||
|
||||
import time
|
||||
import contextlib
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from collections import OrderedDict
|
||||
|
||||
import onnx
|
||||
import torch
|
||||
import onnx_graphsurgeon
|
||||
|
||||
|
||||
def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
    '''Dump an image as a raw float32 tensor file for trtexec.

    The image is converted to RGB, resized to ``size``, laid out as
    ``(1, 3, H, W)``, scaled by ``1 / 255`` and written with ``ndarray.tofile``.
    Intended for use with ``--loadInputs='image:input_tensor.bin'``.

    Args:
        path: Path of the source image.
        size: Target size passed to ``PIL.Image.resize``.
        output_name: Path of the binary file to write.
    '''
    # The context manager closes the underlying file. convert('RGB') guarantees
    # three channels so grayscale / RGBA / palette inputs do not break the
    # channel-first transpose below.
    with Image.open(path) as src:
        im = src.convert('RGB').resize(size)

    data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
    data.tofile(output_name)
|
||||
|
||||
|
||||
def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False,
                    output_path='yolo_w_nms.onnx'):
    '''Append an ``EfficientNMS_TRT`` node to an ONNX model and save the result.

    The first two graph outputs of the model at ``path`` are fed to the NMS
    node, and the graph outputs are replaced by ``num_dets``, ``det_boxes``,
    ``det_scores`` and ``det_classes``.

    Args:
        path: Path of the input ONNX model.
        score_threshold: ``score_threshold`` attribute of the NMS node.
        iou_threshold: ``iou_threshold`` attribute of the NMS node.
        max_output_boxes: ``max_output_boxes`` attribute; also the box dimension
            of the declared outputs.
        simplify: Run ``onnxsim.simplify`` (with the ``image`` input fixed to
            ``[1, 3, 640, 640]``) before editing the graph.
        output_path: Where the modified model is written. Defaults to the
            previously hard-coded ``'yolo_w_nms.onnx'``.

    References:
        http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html
        https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py
    '''
    onnx_model = onnx.load(path)

    if simplify:
        # Imported under an alias so the boolean ``simplify`` argument is not rebound.
        from onnxsim import simplify as onnxsim_simplify
        onnx_model, _ = onnxsim_simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]})

    graph = onnx_graphsurgeon.import_onnx(onnx_model)
    graph.toposort()
    graph.fold_constants()
    graph.cleanup()

    topk = max_output_boxes
    attrs = OrderedDict(
        plugin_version='1',
        background_class=-1,
        max_output_boxes=topk,
        score_threshold=score_threshold,
        iou_threshold=iou_threshold,
        score_activation=False,
        box_coding=0,
    )

    outputs = [
        onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]),
        onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]),
        onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]),
        onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk]),
    ]

    # NOTE(review): assumes graph.outputs[0] / [1] are the boxes and scores
    # tensors, in that order — verify against the exporter.
    graph.layer(
        op='EfficientNMS_TRT',
        name="batched_nms",
        inputs=[graph.outputs[0], graph.outputs[1]],
        outputs=outputs,
        attrs=attrs,
    )

    graph.outputs = outputs
    graph.cleanup().toposort()

    onnx.save(onnx_graphsurgeon.export_onnx(graph), output_path)
|
||||
|
||||
|
||||
class TimeProfiler(contextlib.ContextDecorator):
    '''Accumulate elapsed time over ``with`` blocks (or decorated calls).

    ``total`` holds the sum, in seconds, of all intervals measured since
    construction or the last ``reset()``. When CUDA is available the device is
    synchronized before every timestamp is taken.
    '''

    def __init__(self, ):
        self.total = 0

    def __enter__(self, ):
        self.start = self.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so exceptions raised inside the block propagate.
        self.total += self.time() - self.start

    def reset(self, ):
        '''Clear the accumulated time.'''
        self.total = 0

    def time(self, ):
        '''Return a timestamp in seconds suitable for measuring intervals.

        Uses ``time.perf_counter()`` (monotonic, high resolution) rather than
        wall-clock time, so system clock adjustments cannot skew a measurement.
        Only differences between two returned values are meaningful.
        '''
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter()
|
||||
73
benchmark/yolov8_onnx.py
Normal file
73
benchmark/yolov8_onnx.py
Normal file
@@ -0,0 +1,73 @@
|
||||
'''by lyuwenyu
|
||||
'''
|
||||
|
||||
import torch
|
||||
import torchvision
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
|
||||
from utils import yolo_insert_nms
|
||||
|
||||
class YOLOv8(torch.nn.Module):
    '''Wrapper around an ultralytics YOLO network that returns ``(boxes, scores)``.

    Boxes are converted from ``cxcywh`` to ``xyxy`` so the outputs can be fed to
    an NMS node after ONNX export.
    '''

    def __init__(self, name) -> None:
        super().__init__()
        from ultralytics import YOLO

        # Load the pretrained checkpoint ``{name}.pt`` and keep only the
        # underlying network of the ultralytics wrapper.
        self.model = YOLO(f'{name}.pt').model

    def forward(self, x):
        '''Split the raw prediction into boxes and per-class scores.

        See https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216

        Returns:
            tuple: ``(boxes, scores)``; ``boxes`` holds the first 4 channels
            converted to ``xyxy`` and ``scores`` the remaining channels.
        '''
        # Original author's note: raw prediction is (n, 84, 8400).
        raw: torch.Tensor = self.model(x)[0]
        raw = raw.permute(0, 2, 1)

        num_classes = raw.shape[-1] - 4
        boxes, scores = torch.split(raw, [4, num_classes], dim=-1)
        boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')

        return boxes, scores
|
||||
|
||||
|
||||
|
||||
def export_onnx(name='yolov8n'):
    '''Export the YOLOv8 wrapper for ``name`` to ``{name}.onnx`` and smoke-test it.

    The model is exported with a single input ``image`` (batch axis marked
    dynamic) and outputs ``boxes`` / ``scores`` at opset 13, then run once with
    onnxruntime on random data.
    '''
    onnx_path = f'{name}.onnx'

    wrapper = YOLOv8(name)
    dummy_input = torch.rand(1, 3, 640, 640)

    torch.onnx.export(
        wrapper,
        dummy_input,
        onnx_path,
        input_names=['image'],
        output_names=['boxes', 'scores'],
        opset_version=13,
        dynamic_axes={'image': {0: '-1'}},
    )

    # One onnxruntime inference on random input; the result is discarded.
    feed = {'image': np.random.rand(1, 3, 640, 640).astype(np.float32)}
    session = ort.InferenceSession(onnx_path)
    _ = session.run(output_names=None, input_feed=feed)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: export ``{name}.onnx`` and then insert the NMS node.
    import argparse

    parser = argparse.ArgumentParser()
    cli_options = (
        ('--name', str, 'yolov8l'),
        ('--score_threshold', float, 0.001),
        ('--iou_threshold', float, 0.7),
        ('--max_output_boxes', int, 300),
    )
    for flag, value_type, default_value in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value)
    args = parser.parse_args()

    export_onnx(name=args.name)

    nms_kwargs = {
        'score_threshold': args.score_threshold,
        'iou_threshold': args.iou_threshold,
        'max_output_boxes': args.max_output_boxes,
    }
    yolo_insert_nms(path=f'{args.name}.onnx', **nms_kwargs)
|
||||
|
||||
Reference in New Issue
Block a user