first commit

This commit is contained in:
陈赣
2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions

View File

@@ -0,0 +1,124 @@
### Getting Started: A Complete Workflow
This guide provides a complete, step-by-step workflow from setting up the environment to training, exporting, and running inference with TensorRT.
#### **1. Environment Setup with Docker (Recommended)**
Using Docker is the recommended way to ensure all dependencies, drivers, and CUDA versions are perfectly aligned. This eliminates "it works on my machine" issues.
* **Step 1.1: Build and Run the Container**
From the project's root directory, run `docker compose`. This will build the image based on the `Dockerfile` and start the service in the background.
```bash
docker compose up --build -d
```
* **Step 1.2: Verify the Container is Running**
Check that the container is up and running. Note its name for the next step.
```bash
docker ps
```
---
#### **2. Training & Evaluation (Using `docker attach`)**
This method directly attaches your terminal to the container's main process. It's simple but requires careful handling to avoid terminating your session.
* **Step 2.1: Attach to the Container**
Attach your terminal to the running container. You will be dropped into a bash shell.
```bash
docker attach <your_container_name>
```
* **Step 2.2: Run the Training Command**
Now, *inside the attached shell*, run your training command. `torchrun` will automatically use the GPUs assigned to the container. **Do not run it in the background (`&`)**.
```bash
# Example for 4 GPUs assigned to the container
torchrun --nproc_per_node=4 --master-port=8989 \
tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
--use-amp
```
* **Step 2.3: Detach from the Session (IMPORTANT!)**
With your training running, you can safely detach and leave it running.
**WARNING:** **DO NOT PRESS `Ctrl+C`**. This will kill the training process and potentially the entire container.
To safely detach, press the sequence: **`Ctrl+P`**, followed immediately by **`Ctrl+Q`**.
You will return to your local terminal, and the container will continue running the training in the background.
* **Step 2.4: Re-attach to Your Session**
To check on your training progress, simply run the `docker attach` command again. You will see the live output from your training command.
```bash
docker attach <your_container_name>
```
(Remember to detach with `Ctrl+P`, `Ctrl+Q` when you're done.)
---
#### **3. Exporting & Inference**
For tasks like exporting or running inference, which don't need to run for days, it's safer to use `docker exec` to open a new, separate shell.
* **Step 3.1: Open a New Shell in the Container**
```bash
docker exec -it <your_container_name> bash
```
* **Step 3.2: Run Export or Inference Commands**
Now, inside this new shell, run your commands.
```bash
# Export to ONNX
python tools/export_onnx.py \
-c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
-r path/to/trained_checkpoint.pth \
--check
```
```bash
# Convert to TensorRT
bash tools/onnx2trt.sh /path/to/your/model.onnx
```
```bash
# RUN TRT Inference
python references/deploy/rtdetrv2_tensorrt.py \
--engine /path/to/your/model.trt \
--image /path/to/your/image.jpg \
--output /path/to/save/output.jpg \
--threshold 0.5
```
### Utilities & Tips
* **Visualize training with TensorBoard:**
* Use the standard port `6006` to avoid conflicts with training.
* Ensure the port `6006` is exposed in your `docker-compose.yml`.
```bash
# Inside the container
tensorboard --logdir=path/to/summary/ --host=0.0.0.0 --port=6006
```
* **Managing the Container Lifecycle:**
* **To temporarily stop** the container without deleting it (e.g., to pause training and resume later):
```bash
docker compose stop
```
You can restart it later with `docker compose start`.
* **To stop and remove** the container and its network (named volumes are kept by default; add the `-v` flag to delete them as well):
```bash
docker compose down
```

View File

@@ -0,0 +1,100 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
import torch
import torch.nn as nn
from src.core import YAMLConfig, yaml_utils
def main(args, ):
    """Export the configured model and postprocessor as a single ONNX graph.

    The YAML config named by ``args.config`` is loaded with overrides taken
    from ``args.update`` and from every other non-``None`` CLI attribute.
    When ``args.resume`` is set, weights are read from that checkpoint (the
    EMA weights if the checkpoint has an ``'ema'`` entry, otherwise the
    ``'model'`` entry). The deploy-mode model and postprocessor are wrapped in
    one module, run once on a dummy input, and written to
    ``args.output_file``. ``args.check`` and ``args.simplify`` optionally
    validate and simplify the written file.
    """
    overrides = yaml_utils.parse_cli(args.update) if args.update else {}
    for key, value in args.__dict__.items():
        if key != 'update' and value is not None:
            overrides[key] = value

    cfg = YAMLConfig(args.config, **overrides)

    if not args.resume:
        print('not load model.state_dict, use default init state dict...')
    else:
        checkpoint = torch.load(args.resume, map_location='cpu')
        state = checkpoint['ema']['module'] if 'ema' in checkpoint else checkpoint['model']

        # NOTE: the train-mode state is loaded first; deploy() is applied below.
        cfg.model.load_state_dict(state)

    class Model(nn.Module):
        """Deploy-mode model followed by its deploy-mode postprocessor."""

        def __init__(self, ) -> None:
            super().__init__()
            self.model = cfg.model.deploy()
            self.postprocessor = cfg.postprocessor.deploy()

        def forward(self, images, orig_target_sizes):
            raw_outputs = self.model(images)
            return self.postprocessor(raw_outputs, orig_target_sizes)

    model = Model()

    # Dummy inputs used both for a warm-up call and as the export example.
    side = args.input_size
    data = torch.rand(1, 3, side, side)
    size = torch.tensor([[side, side]])
    _ = model(data, size)

    # Only the leading (batch) axis of each input is exported as dynamic.
    dynamic_axes = {
        'images': {0: 'N', },
        'orig_target_sizes': {0: 'N'},
    }

    torch.onnx.export(
        model,
        (data, size),
        args.output_file,
        input_names=['images', 'orig_target_sizes'],
        output_names=['labels', 'boxes', 'scores'],
        dynamic_axes=dynamic_axes,
        opset_version=16,
        verbose=False,
        do_constant_folding=True,
    )

    if args.check:
        import onnx

        exported = onnx.load(args.output_file)
        onnx.checker.check_model(exported)
        print('Check export onnx model done...')

    if args.simplify:
        import onnx
        import onnxsim

        dynamic = True
        input_shapes = None
        if dynamic:
            input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape}

        onnx_model_simplify, check = onnxsim.simplify(
            args.output_file,
            input_shapes=input_shapes,
            dynamic_input_shape=dynamic,
        )
        # The simplified graph overwrites the file written by the export.
        onnx.save(onnx_model_simplify, args.output_file)
        print(f'Simplify onnx model {check}...')
if __name__ == '__main__':
    import argparse

    # Command-line entry point: collect the export options and run main().
    cli = argparse.ArgumentParser()

    # Config file and optional checkpoint to load weights from.
    cli.add_argument('--config', '-c', type=str)
    cli.add_argument('--resume', '-r', type=str)

    # Output location and the square input resolution used for the dummy input.
    cli.add_argument('--output_file', '-o', type=str, default='model.onnx')
    cli.add_argument('--input_size', '-s', type=int, default=640)

    # Optional post-export steps.
    cli.add_argument('--check', action='store_true', default=False)
    cli.add_argument('--simplify', action='store_true', default=False)

    # Free-form YAML overrides.
    cli.add_argument('--update', '-u', nargs='+', help='update yaml config')

    main(cli.parse_args())

View File

@@ -0,0 +1,81 @@
import os
import argparse
import tensorrt as trt
def main(onnx_path, engine_path, max_batchsize, opt_batchsize, min_batchsize, use_fp16=True, verbose=False)->None:
    """Convert an ONNX model to a serialized TensorRT engine.

    The network is expected to expose two inputs named ``images`` and
    ``orig_target_sizes`` (assumed to both carry a dynamic batch axis, as the
    ONNX export script in this project declares -- confirm for other models).

    Args:
        onnx_path (str): Path to the input ONNX model.
        engine_path (str): Path to save the output TensorRT engine.
        max_batchsize (int): Largest batch size covered by the optimization profile.
        opt_batchsize (int): Batch size the engine is tuned for.
        min_batchsize (int): Smallest batch size of ``images`` covered by the profile.
        use_fp16 (bool): Whether to use FP16 precision (when the platform supports it).
        verbose (bool): Whether to enable verbose logging.

    Raises:
        FileNotFoundError: If ``onnx_path`` does not point to a file.
        RuntimeError: If the ONNX file cannot be parsed or the engine build fails.
    """
    # Fail fast on a bad path, before any TensorRT object is created.
    if not os.path.isfile(onnx_path):
        raise FileNotFoundError(f"ONNX file not found: {onnx_path}")

    logger = trt.Logger(trt.Logger.VERBOSE if verbose else trt.Logger.INFO)
    builder = trt.Builder(logger)
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(network_flags)
    parser = trt.OnnxParser(network, logger)

    print(f"[INFO] Loading ONNX file from {onnx_path}")
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            # Surface every parser diagnostic before giving up.
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            raise RuntimeError("Failed to parse ONNX file")

    # A single builder config is enough; it is created once, after parsing.
    config = builder.create_builder_config()
    config.set_preview_feature(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805, True)
    config.max_workspace_size = 1 << 30  # 1GB

    if use_fp16:
        if builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
            print("[INFO] FP16 optimization enabled.")
        else:
            print("[WARNING] FP16 not supported on this platform. Proceeding with FP32.")

    profile = builder.create_optimization_profile()
    profile.set_shape(
        "images",
        min=(min_batchsize, 3, 640, 640),
        opt=(opt_batchsize, 3, 640, 640),
        max=(max_batchsize, 3, 640, 640),
    )
    # The size tensor must be allowed to grow with the image batch. The lower
    # bound stays at (1, 2) so a single shared size row remains a valid input,
    # exactly as with the previous fixed (1, 2) profile.
    profile.set_shape(
        "orig_target_sizes",
        min=(1, 2),
        opt=(opt_batchsize, 2),
        max=(max_batchsize, 2),
    )
    config.add_optimization_profile(profile)

    print("[INFO] Building TensorRT engine...")
    engine = builder.build_engine(network, config)
    if engine is None:
        raise RuntimeError("Failed to build the engine.")

    print(f"[INFO] Saving engine to {engine_path}")
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())
    print("[INFO] Engine export complete.")
if __name__ == "__main__":
    # Command-line entry point for the ONNX -> TensorRT conversion.
    parser = argparse.ArgumentParser(description="Convert ONNX to TensorRT Engine")
    parser.add_argument("--onnx", "-i", type=str, required=True, help="Path to input ONNX model file")
    parser.add_argument("--saveEngine", "-o", type=str, default="model.engine", help="Path to output TensorRT engine file")
    parser.add_argument("--maxBatchSize", "-Mb", type=int, default=32, help="Maximum batch size for inference")
    parser.add_argument("--optBatchSize", "-ob", type=int, default=16, help="Optimal batch size for inference")
    parser.add_argument("--minBatchSize", "-mb", type=int, default=1, help="Minimum batch size for inference")
    # FP16 stays on by default and --fp16 is still accepted; --no-fp16 writes
    # to the same destination so the precision can actually be switched off.
    parser.add_argument("--fp16", dest="fp16", default=True, action="store_true", help="Enable FP16 precision mode")
    parser.add_argument("--no-fp16", dest="fp16", action="store_false", help="Disable FP16 precision mode (build in FP32)")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    args = parser.parse_args()

    main(
        onnx_path=args.onnx,
        engine_path=args.saveEngine,
        max_batchsize=args.maxBatchSize,
        opt_batchsize=args.optBatchSize,
        min_batchsize=args.minBatchSize,
        use_fp16=args.fp16,
        verbose=args.verbose
    )

View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Convert an ONNX model to a TensorRT engine using trtexec.
#
# Usage: <script> /path/to/your/model.onnx
#
# The output engine path is derived from the input path by replacing a
# trailing ".onnx" with ".trt" (an input without that suffix simply gets
# ".trt" appended).

# Exit immediately if a command exits with a non-zero status.
set -e

# Check if an input file is provided.
if [ -z "$1" ]; then
    echo "Error: No ONNX file provided." >&2
    echo "Usage: $0 /path/to/your/model.onnx" >&2
    exit 1
fi

ONNX_FILE="$1"

# Report a missing input up front instead of leaving it to trtexec.
if [ ! -f "$ONNX_FILE" ]; then
    echo "Error: ONNX file not found: $ONNX_FILE" >&2
    exit 1
fi

# trtexec is invoked by name below, so it has to be resolvable on PATH.
if ! command -v trtexec >/dev/null 2>&1; then
    echo "Error: trtexec not found in PATH." >&2
    exit 127
fi

# Replace the .onnx extension with .trt for the output file.
ENGINE_FILE="${ONNX_FILE%.onnx}.trt"

echo "==> Converting ONNX to TensorRT Engine <=="
echo " - Input ONNX: $ONNX_FILE"
echo " - Output TRT: $ENGINE_FILE"
echo " - Precision: FP16"
echo "=========================================="

# Run the trtexec command.
# --fp16 enables 16-bit floating-point precision for faster inference.
# --verbose provides detailed output during the conversion process.
trtexec --onnx="$ONNX_FILE" \
    --saveEngine="$ENGINE_FILE" \
    --fp16 \
    --verbose

echo "=========================================="
echo "✅ Successfully created TensorRT engine: $ENGINE_FILE"

View File

@@ -0,0 +1,110 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""
import math
import os
import sys
import torch
import torch.nn as nn
from torch import Tensor
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
from typing import Any, Dict, List, Optional
from src.core import YAMLConfig, yaml_utils
__all__ = ["profile_stats"]
def _auto_scale_flops(flops: float):
    """Pick a display scale and SI-style suffix for a FLOP count.

    Adapted from torch.profiler.profile.

    Args:
        flops: Raw FLOP count.

    Returns:
        A ``(scale, header)`` tuple: multiplying ``flops`` by ``scale`` gives
        the value expressed in the unit named by ``header`` (one of ``""``,
        ``"K"``, ``"M"``, ``"G"``, ``"T"``, ``"P"``). Counts that are zero or
        negative are returned unscaled as ``(1.0, "")``.
    """
    flop_headers = [
        "",
        "K",
        "M",
        "G",
        "T",
        "P",
    ]
    # A non-positive count has no logarithm; report it as-is rather than
    # failing (an assert here would also vanish under ``python -O``).
    if flops <= 0:
        return (1.0, flop_headers[0])

    # Each header covers three decimal orders of magnitude; the index is
    # clamped to the available headers.
    log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
    return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
def profile_stats(
    model: nn.Module,
    data: Optional[Tensor]=None,
    shape: Optional[List[int]]=None,
    verbose: bool=False
) -> Dict[str, Any]:
    """Count trainable parameters and profile one forward pass for FLOPs.

    Args:
        model: Module to profile. It is switched to eval mode for the forward
            passes and put back into train mode afterwards if it started there.
        data: Input passed to ``model``. When ``None``, a random tensor of
            ``shape`` is created with the dtype and device of the model's
            first parameter (library defaults if the model has no parameters).
        shape: Shape of the generated input; defaults to ``[1, 3, 640, 640]``.
        verbose: When true, print the input device, the profiler table and
            the parameter / FLOP totals.

    Returns:
        A dict with ``'n_parameters'`` (trainable parameter count),
        ``'n_flops'`` (sum of the positive per-event FLOP counts divided by
        the number of active profiler steps) and ``'info'`` (the profiler
        table as a string).
    """
    if shape is None:
        # Resolved here instead of in the signature to avoid a shared
        # mutable default argument.
        shape = [1, 3, 640, 640]

    is_training = model.training

    model.train()
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model.eval()

    # Profiler schedule: one warm-up step followed by one recorded step.
    wait = 0
    warmup = 1
    active = 1
    repeat = 1
    skip_first = 0

    try:
        if data is None:
            first_param = next(model.parameters(), None)
            if first_param is None:
                data = torch.rand(*shape)
            else:
                data = torch.rand(*shape, dtype=first_param.dtype, device=first_param.device)
            if verbose:
                print(data.device)

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            schedule=torch.profiler.schedule(
                wait=wait,
                warmup=warmup,
                active=active,
                repeat=repeat,
                skip_first=skip_first,
            ),
            with_flops=True,
        ) as p:
            n_step = skip_first + (wait + warmup + active) * repeat
            for _ in range(n_step):
                _ = model(data)
                p.step()
    finally:
        # Restore the caller's mode even if the forward pass raised.
        if is_training:
            model.train()

    statistics = p.key_averages()
    info = statistics.table(sort_by='self_cuda_time_total', row_limit=-1)
    num_flops = sum(event.flops for event in statistics if event.flops > 0) / active

    if verbose:
        # The display scale is only needed for printing; a total of zero
        # (no event reported FLOPs) is shown unscaled.
        if num_flops > 0:
            (flops_scale, flops_header) = _auto_scale_flops(num_flops)
        else:
            (flops_scale, flops_header) = (1.0, '')
        print(info)
        print(f'Total number of trainable parameters: {num_params}')
        print(f'Total number of flops: {num_flops * flops_scale:.3f}{flops_header} with {shape}')

    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}
if __name__ == "__main__":
    import argparse

    # CLI: config file, target device, and optional YAML overrides.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True)
    parser.add_argument('-d', '--device', type=str, default='cuda:0', help='device',)
    parser.add_argument('-u', '--update', nargs='+', help='Update yaml config from command line.')
    args = parser.parse_args()

    # Start from the -u overrides (if any), then layer every other CLI
    # value that was actually set on top of them.
    update_dict = {}
    if args.update:
        update_dict = yaml_utils.parse_cli(args.update)
    for name, value in vars(args).items():
        if name != 'update' and value is not None:
            update_dict[name] = value

    cfg = YAMLConfig(args.config, **update_dict)

    # Move the configured model to the requested device and profile it.
    model = cfg.model.to(args.device)
    profile_stats(model, verbose=True)

View File

@@ -0,0 +1,65 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
import argparse
from src.misc import dist_utils
from src.core import YAMLConfig, yaml_utils
from src.solver import TASKS
def main(args, ) -> None:
    """Set up the distributed environment, build the solver and run it.

    The YAML config named by ``args.config`` is loaded with overrides from
    ``args.update`` and from every other non-``None`` CLI attribute. The
    solver registered in ``TASKS`` under the config's ``task`` key is then
    run in validation mode (``args.test_only``) or training mode.
    """
    dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed)

    assert not all([args.tuning, args.resume]), \
        'Only support from_scrach or resume or tuning at one time'

    # -u/--update is optional, so args.update may be None; guard it the same
    # way the export and profiling scripts do instead of relying on
    # parse_cli to accept None.
    update_dict = yaml_utils.parse_cli(args.update) if args.update else {}
    update_dict.update({k: v for k, v in args.__dict__.items() \
        if k not in ['update', ] and v is not None})

    cfg = YAMLConfig(args.config, **update_dict)
    print('cfg: ', cfg.__dict__)

    solver = TASKS[cfg.yaml_cfg['task']](cfg)

    if args.test_only:
        solver.val()
    else:
        solver.fit()

    dist_utils.cleanup()
if __name__ == '__main__':
    # Command-line entry point for training / evaluation.
    cli = argparse.ArgumentParser()

    # priority 0: explicit options
    cli.add_argument('-c', '--config', type=str, required=True)
    cli.add_argument('-r', '--resume', type=str, help='resume from checkpoint')
    cli.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint')
    cli.add_argument('-d', '--device', type=str, help='device',)
    cli.add_argument('--seed', type=int, help='exp reproducibility')
    cli.add_argument('--use-amp', action='store_true', help='auto mixed precision training')
    cli.add_argument('--output-dir', type=str, help='output directoy')
    cli.add_argument('--summary-dir', type=str, help='tensorboard summry')
    cli.add_argument('--test-only', action='store_true', default=False,)

    # priority 1: free-form YAML overrides
    cli.add_argument('-u', '--update', nargs='+', help='update yaml config')

    # env: printing and launcher-related settings
    cli.add_argument('--print-method', type=str, default='builtin', help='print method')
    cli.add_argument('--print-rank', type=int, default=0, help='print rank id')
    cli.add_argument('--local-rank', type=int, help='local rank id')

    main(cli.parse_args())