first commit

This commit is contained in:
陈赣
2026-06-03 12:43:14 +08:00
commit ba76cfae28
608 changed files with 120791 additions and 0 deletions

52
third_party/trt_yolov8/CMakeLists.txt vendored Executable file
View File

@@ -0,0 +1,52 @@
cmake_minimum_required(VERSION 3.10)
project(trt_yolov8 VERSION 1.0)
set(CMAKE_CXX_STANDARD 17)
# Default to a Debug build, but respect a build type already chosen by the
# user (-DCMAKE_BUILD_TYPE=...) or by a parent project instead of overriding it.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Debug)
endif()
# -g: debug symbols, -fPIC: position-independent code for the shared library,
# -w: suppress all compiler warnings, -pthread: enable POSIX threads.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fPIC -w -fdiagnostics-color=always -pthread")
# save all libs to the same directory
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/libs)
# TensorRT required
# TensorRT installation prefix; override with -DTENSORRT_ROOT=<path> when it is
# installed somewhere else (the default matches the previously hard-coded path).
set(TENSORRT_ROOT "/usr/local/tensorRT" CACHE PATH "TensorRT installation prefix")
include_directories("${TENSORRT_ROOT}/include")
link_directories("${TENSORRT_ROOT}/lib")
# Use the default nvcc location unless a CUDA compiler was already selected
# (e.g. -DCMAKE_CUDA_COMPILER=<path> or by a parent project).
if(NOT DEFINED CMAKE_CUDA_COMPILER)
    set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
endif()
enable_language(CUDA)
# CUDA required
find_package(CUDA REQUIRED)
message(STATUS "CUDA library status:")
message(STATUS " version: ${CUDA_VERSION}")
message(STATUS " libraries: ${CUDA_LIBRARIES}")
message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
include_directories(${CUDA_INCLUDE_DIRS})
# OpenCV required
# REQUIRED makes configuration stop with a clear message when OpenCV is missing,
# instead of failing later while compiling sources that include OpenCV headers.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
# build for trt_yolov8
# Collect the C++ sources in this directory and src/, and the CUDA sources in src/ and plugin/.
# NOTE: file(GLOB) is evaluated at configure time; re-run cmake after adding or removing sources.
file(GLOB SRCS "*.cpp" "src/*.cpp" "src/*.cu" "plugin/*.cu")
# Build everything into one shared library named after the project.
add_library(${PROJECT_NAME} SHARED ${SRCS})
# Link against TensorRT (nvinfer), the CUDA runtime (cudart) and OpenCV.
target_link_libraries(${PROJECT_NAME} nvinfer cudart ${OpenCV_LIBS})
# build samples for trt_yolov8
# Samples are only built when VP_BUILD_FROM is not defined
# (presumably a standalone build rather than one driven by a parent project - TODO confirm).
if(NOT DEFINED VP_BUILD_FROM)
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/samples) # save all exe to 'samples'
    # Each sample is a single source file samples/<name>.cpp linked against the library.
    foreach(sample_name
            trt_yolov8_det_test
            trt_yolov8_pose_test
            trt_yolov8_seg_test
            trt_yolov8_cls_test
            trt_yolov8_wts_2_engine)
        add_executable(${sample_name} "samples/${sample_name}.cpp")
        target_link_libraries(${sample_name} ${PROJECT_NAME})
    endforeach()
endif()

68
third_party/trt_yolov8/README.md vendored Executable file
View File

@@ -0,0 +1,68 @@
# trt_yolov8 #
Detection, classification, segmentation and pose estimation based on `yolov8`. The code comes from [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8), with some modifications.
## How to generate plan file(trt/engine) from pytorch models ?
```
.pt --> .wts --> .trt/.engine
```
1. download the default `.pt` models of `yolov8/n/s/m/...(det/cls/seg/pose)` from [github](https://github.com/ultralytics/assets/releases) (or use models you trained yourself)
2. convert `.pt` to `.wts` using `samples/gen_wts.py`
3. convert `.wts` to `.engine` using `samples/trt_yolov8_wts_2_engine.cpp` (**change classes number before compiling**)
### .pt -> .wts ###
`python3 samples/gen_wts.py [-w] [.pt] [-o] [.wts] [-t] [detect/seg/cls]`
```
python3 samples/gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
python3 samples/gen_wts.py -w yolov8n-seg.pt -o yolov8n-seg.wts -t seg
python3 samples/gen_wts.py -w yolov8n-pose.pt -o yolov8n-pose.wts
python3 samples/gen_wts.py -w yolov8n-cls.pt -o yolov8n-cls.wts -t cls
python3 samples/gen_wts.py -w yolov8s.pt -o yolov8s.wts -t detect
python3 samples/gen_wts.py -w yolov8s-seg.pt -o yolov8s-seg.wts -t seg
python3 samples/gen_wts.py -w yolov8s-pose.pt -o yolov8s-pose.wts
python3 samples/gen_wts.py -w yolov8s-cls.pt -o yolov8s-cls.wts -t cls
...
```
### .wts -> .engine ###
`./trt_yolov8_wts_2_engine [-det/-seg/-pose/-cls] [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]`
```
./build/samples/trt_yolov8_wts_2_engine -det yolov8n.wts yolov8n.engine n
./build/samples/trt_yolov8_wts_2_engine -det yolov8s.wts yolov8s.engine s
./build/samples/trt_yolov8_wts_2_engine -seg yolov8n-seg.wts yolov8n-seg.engine n
./build/samples/trt_yolov8_wts_2_engine -seg yolov8s-seg.wts yolov8s-seg.engine s
./build/samples/trt_yolov8_wts_2_engine -pose yolov8n-pose.wts yolov8n-pose.engine n
./build/samples/trt_yolov8_wts_2_engine -pose yolov8s-pose.wts yolov8s-pose.engine s
./build/samples/trt_yolov8_wts_2_engine -cls yolov8n-cls.wts yolov8n-cls.engine n
./build/samples/trt_yolov8_wts_2_engine -cls yolov8s-cls.wts yolov8s-cls.engine s
...
```
> CUDA 11.1 + TensorRT 8.5 for this repository (tested)
## How to build trt_yolov8 ?
trt_yolov8 can be built separately.
> change the value of `kNumClass/kClsNumClass` in `include/config.h` to your classes number (80 by default for coco dataset and 1000 by default for imagenet dataset) before building.
0. set the right library path and include path for TensorRT in `CMakeLists.txt`
1. `mkdir build && cd build`
2. `cmake ..`
3. `make -j8`
All lib files are saved to `build/libs` and all samples are saved to `build/samples`. Please refer to videopipe for how to run the samples for trt_yolov8.
## Sample screenshot (yolov8s/det/seg/pose/cls trained by coco/imagenet dataset from github) ##
### detection ###
![](../../doc/3rdparty/9.png)
### pose_estimation ###
![](../../doc/3rdparty/10.png)
### segmentation ###
![](../../doc/3rdparty/11.png)
### classification ###
![](../../doc/3rdparty/12.png)

30
third_party/trt_yolov8/include/block.h vendored Executable file
View File

@@ -0,0 +1,30 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"
// Declarations of the helpers used to assemble the TensorRT network. The definitions are not
// part of this header; the per-function notes below describe only what the signatures show.

// Loads weights from `file` and returns them keyed by name.
// NOTE(review): `file` is taken by value (one string copy per call).
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);
// Adds a block to `network` on top of `input` and returns its final element-wise layer.
// Presumably convolution + batch-norm + SiLU with `ch` output channels, kernel `k`, stride `s`,
// padding `p`, using the weights whose names start with `lname` - TODO confirm against the definition.
// NOTE(review): `weightMap` is passed by value here (the whole map is copied on every call),
// whereas C2 below takes it by reference.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
int ch, int k, int s, int p, std::string lname);
// Adds a "C2F" block to `network`; `c1`/`c2` are presumably input/output channel counts, `n` the
// number of inner repeats, `shortcut` enables residual connections, `e` an expansion ratio - TODO confirm.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
int c2, int n, bool shortcut, float e, std::string lname);
// Adds a "C2" block to `network`; same parameter list as C2F, but `weightMap` is taken by reference.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
int c2, int n, bool shortcut, float e, std::string lname);
// Adds an "SPPF" block to `network`; `k` is presumably the pooling kernel size - TODO confirm.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
int c2, int k, std::string lname);
// Adds a "DFL" block to `network` and returns the resulting shuffle layer.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);
// Adds the YOLO plugin layer to `network`, fed by the concatenation layers in `dets`.
// `px_arry` points to `px_arry_num` integers (presumably the per-head strides - TODO confirm);
// `is_segmentation` / `is_pose` select the task variant.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
int px_arry_num, bool is_segmentation, bool is_pose);

39
third_party/trt_yolov8/include/calibrator.h vendored Executable file
View File

@@ -0,0 +1,39 @@
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H
#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"
//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//! CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Overrides the four nvinfer1::IInt8EntropyCalibrator2 callbacks declared below.
//! The member functions are defined outside this header.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
//! \param batchsize        number of images per calibration batch
//! \param input_w          network input width
//! \param input_h          network input height
//! \param img_dir          directory holding the calibration images
//! \param calib_table_name name of the calibration table (cache) file
//! \param input_blob_name  name of the network input tensor
//! \param read_cache       whether an existing calibration cache may be used (defaults to true)
Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
virtual ~Int8EntropyCalibrator2();
//! Returns the calibration batch size.
int getBatchSize() const TRT_NOEXCEPT override;
//! Supplies the next calibration batch through `bindings`; `names`/`nbBindings` describe the inputs.
bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
//! Returns a pointer to cached calibration data and stores its size in `length`.
const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
//! Receives `length` bytes of calibration data in `cache` to be persisted.
void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
private:
// The fields below presumably mirror the constructor arguments of the same name - confirm in the .cpp.
int batchsize_;
int input_w_;
int input_h_;
int img_idx_; // presumably the index of the next image to feed - TODO confirm
std::string img_dir_;
std::vector<std::string> img_files_; // presumably the file names found in img_dir_ - TODO confirm
size_t input_count_; // presumably the number of elements in one input batch - TODO confirm
std::string calib_table_name_;
// NOTE(review): stored as a raw pointer (unlike img_dir_/calib_table_name_, which are copied into
// std::string) - the caller's string presumably has to outlive the calibrator; confirm.
const char* input_blob_name_;
bool read_cache_;
void* device_input_; // presumably a device-side input buffer - TODO confirm
std::vector<char> calib_cache_;
};
#endif // ENTROPY_CALIBRATOR_H

25
third_party/trt_yolov8/include/config.h vendored Executable file
View File

@@ -0,0 +1,25 @@
// Build-time configuration shared by the trt_yolov8 sources.
//
// The include guard makes the header safe to include more than once in a
// translation unit (directly and via types.h); without it every constant
// below would be redefined on the second inclusion.
#ifndef TRT_YOLOV8_CONFIG_H_
#define TRT_YOLOV8_CONFIG_H_

// Inference precision: exactly one of the following is expected to be defined.
#define USE_FP16
//#define USE_FP32
//#define USE_INT8

// Names of the engine's input and output tensors.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";

// Detection / segmentation / pose settings.
const static int kNumClass = 80;
const static int kNumberOfPoints = 17;  // number of keypoints total
const static int kBatchSize = 1;
const static int kGpuId = 0;
const static int kInputH = 640;
const static int kInputW = 640;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;

// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

#endif  // TRT_YOLOV8_CONFIG_H_

18
third_party/trt_yolov8/include/cuda_utils.h vendored Executable file
View File

@@ -0,0 +1,18 @@
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

// The macro below uses assert, std::abort and std::cerr, so include their
// headers here instead of relying on whatever the including file pulled in.
#include <cassert>
#include <cstdlib>
#include <iostream>

#ifndef CUDA_CHECK
// Evaluates the CUDA runtime call `callstr` exactly once and terminates the
// program if it did not return cudaSuccess, after printing the numeric error
// code, its description and the source location to std::cerr.
//
// assert(0) keeps the familiar assertion failure in debug builds; std::abort()
// guarantees that a failed call is not silently ignored when assertions are
// compiled out (NDEBUG).
//
// The macro still expands to a plain braced block, so existing call sites
// (with or without a trailing semicolon) keep compiling.
#define CUDA_CHECK(callstr)                                                           \
    {                                                                                 \
        cudaError_t error_code = (callstr);                                           \
        if (error_code != cudaSuccess) {                                              \
            std::cerr << "CUDA error " << error_code << " ("                          \
                      << cudaGetErrorString(error_code) << ") at " << __FILE__ << ":" \
                      << __LINE__ << std::endl;                                       \
            assert(0);                                                                \
            std::abort();                                                             \
        }                                                                             \
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_

504
third_party/trt_yolov8/include/logging.h vendored Executable file
View File

@@ -0,0 +1,504 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H
#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"
using Severity = nvinfer1::ILogger::Severity;
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its content to an output stream, preceded by a
//!        local-time timestamp ("[MM/DD/YYYY-HH:MM:SS] ") and a fixed prefix.
//!
//! Output happens whenever the buffer is synchronized (e.g. via std::flush / std::endl on the
//! owning stream) and, for any content still pending, when the buffer is destroyed.
//! Nothing is written while logging is disabled (see setShouldLog()).
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination stream for the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog whether messages are forwarded to \p stream at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Creates a buffer with the same destination, prefix and logging flag as \p other.
    //! Every member is carried over: leaving mShouldLog uninitialized would make putOutput()
    //! read an indeterminate value, and an empty prefix would drop the severity tag.
    //! Content still pending in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer() override
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the two differ there is unflushed content left, so log it before the buffer goes away
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: inserts the buffer contents into the destination stream,
    //! resets the buffer and flushes the stream.
    //! \return always 0 (success)
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[timestamp] <prefix><buffer contents>" to the destination stream, then empties the
    //! buffer and flushes the stream. Does nothing while logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a private stream: this keeps it on the same stream as the
            // message itself (mOutput may be std::cerr) and avoids permanently changing the fill
            // character of a shared stream such as std::cout.
            std::ostringstream stamp;
            const std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                stamp << '[' << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables forwarding of messages to the destination stream.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; //!< destination stream
    std::string mPrefix;   //!< text written in front of every message
    bool mShouldLog;       //!< messages are only forwarded while this is true
};
//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! It only owns the buffer; LogStreamConsumer lists this class as its first base so that
//! mBuffer is constructed before its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
public:
//! Forwards all three arguments unchanged to the LogStreamConsumerBuffer constructor.
LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
: mBuffer(stream, prefix, shouldLog)
{
}
protected:
LogStreamConsumerBuffer mBuffer; //!< buffer used by the derived stream
};
//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//! Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
//! \brief Creates a LogStreamConsumer which logs messages with level severity.
//! Reportable severity determines if the messages are severe enough to be logged.
//! A message is logged when severity <= reportableSeverity.
LogStreamConsumer(Severity reportableSeverity, Severity severity)
: LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
, std::ostream(&mBuffer) // links the stream buffer with the stream
, mShouldLog(severity <= reportableSeverity)
, mSeverity(severity)
{
}
//! \brief Creates a consumer with the same severity and logging flag as \p other.
//! A fresh buffer is constructed from those two values; the buffer object of \p other is not reused.
LogStreamConsumer(LogStreamConsumer&& other)
: LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
, std::ostream(&mBuffer) // links the stream buffer with the stream
, mShouldLog(other.mShouldLog)
, mSeverity(other.mSeverity)
{
}
//! \brief Re-evaluates whether this consumer logs, given a new reportable severity,
//! and propagates the result to the buffer.
void setReportableSeverity(Severity reportableSeverity)
{
mShouldLog = mSeverity <= reportableSeverity;
mBuffer.setShouldLog(mShouldLog);
}
private:
//! \brief Selects the destination stream: std::cout for kINFO and numerically higher
//! severities, std::cerr for everything below.
static std::ostream& severityOstream(Severity severity)
{
return severity >= Severity::kINFO ? std::cout : std::cerr;
}
//! \brief Returns the bracketed one-letter tag written in front of messages of the given severity.
static std::string severityPrefix(Severity severity)
{
switch (severity)
{
case Severity::kINTERNAL_ERROR: return "[F] ";
case Severity::kERROR: return "[E] ";
case Severity::kWARNING: return "[W] ";
case Severity::kINFO: return "[I] ";
case Severity::kVERBOSE: return "[V] ";
default: assert(0); return "";
}
}
bool mShouldLog;    //!< whether messages written to this stream are emitted
Severity mSeverity; //!< severity of the messages written through this stream
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger
{
public:
//!
//! \brief Creates a logger with the given reportable severity (kWARNING by default)
//!
Logger(Severity severity = Severity::kWARNING)
: mReportableSeverity(severity)
{
}
//!
//! \enum TestResult
//! \brief Represents the state of a given test
//!
enum class TestResult
{
kRUNNING, //!< The test is running
kPASSED, //!< The test passed
kFAILED, //!< The test failed
kWAIVED //!< The test was waived
};
//!
//! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
//! \return The nvinfer1::ILogger associated with this Logger
//!
//! TODO Once all samples are updated to use this method to register the logger with TensorRT,
//! we can eliminate the inheritance of Logger from ILogger
//!
nvinfer1::ILogger& getTRTLogger()
{
return *this;
}
//!
//! \brief Implementation of the nvinfer1::ILogger::log() virtual method
//!
//! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
//! inheritance from nvinfer1::ILogger
//!
//! The message is written as "[TRT] <msg>" followed by std::endl through a temporary LogStreamConsumer,
//! which decides from the severity whether anything is emitted.
//! NOTE(review): msg is passed to std::string without a null check.
//!
void log(Severity severity, const char* msg) TRT_NOEXCEPT override
{
LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
}
//!
//! \brief Method for controlling the verbosity of logging output
//!
//! \param severity The logger will only emit messages that have severity of this level or higher.
//!
void setReportableSeverity(Severity severity)
{
mReportableSeverity = severity;
}
//!
//! \brief Opaque handle that holds logging information for a particular test
//!
//! This object is an opaque handle to information used by the Logger to print test results.
//! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
//! with Logger::reportTest{Start,End}().
//!
class TestAtom
{
public:
TestAtom(TestAtom&&) = default;
private:
friend class Logger;
// Only Logger (a friend) can create a TestAtom; see Logger::defineTest().
TestAtom(bool started, const std::string& name, const std::string& cmdline)
: mStarted(started)
, mName(name)
, mCmdline(cmdline)
{
}
bool mStarted; // set to true by Logger::reportTestStart()
std::string mName; // test name printed in result lines
std::string mCmdline; // command line printed in result lines
};
//!
//! \brief Define a test for logging
//!
//! \param[in] name The name of the test. This should be a string starting with
//! "TensorRT" and containing dot-separated strings containing
//! the characters [A-Za-z0-9_].
//! For example, "TensorRT.sample_googlenet"
//! \param[in] cmdline The command line used to reproduce the test
//
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
//!
static TestAtom defineTest(const std::string& name, const std::string& cmdline)
{
return TestAtom(false, name, cmdline);
}
//!
//! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
//! as input
//!
//! \param[in] name The name of the test
//! \param[in] argc The number of command-line arguments
//! \param[in] argv The array of command-line arguments (given as C strings)
//!
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
{
auto cmdline = genCmdlineString(argc, argv);
return defineTest(name, cmdline);
}
//!
//! \brief Report that a test has started.
//!
//! \pre reportTestStart() has not been called yet for the given testAtom
//!
//! \param[in] testAtom The handle to the test that has started
//!
//! NOTE(review): the RUNNING line is printed before the precondition is asserted.
//!
static void reportTestStart(TestAtom& testAtom)
{
reportTestResult(testAtom, TestResult::kRUNNING);
assert(!testAtom.mStarted);
testAtom.mStarted = true;
}
//!
//! \brief Report that a test has ended.
//!
//! \pre reportTestStart() has been called for the given testAtom
//!
//! \param[in] testAtom The handle to the test that has ended
//! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
//! TestResult::kFAILED, TestResult::kWAIVED
//!
static void reportTestEnd(const TestAtom& testAtom, TestResult result)
{
assert(result != TestResult::kRUNNING);
assert(testAtom.mStarted);
reportTestResult(testAtom, result);
}
//!
//! \brief Report the test as passed
//! \return EXIT_SUCCESS
//!
static int reportPass(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kPASSED);
return EXIT_SUCCESS;
}
//!
//! \brief Report the test as failed
//! \return EXIT_FAILURE
//!
static int reportFail(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kFAILED);
return EXIT_FAILURE;
}
//!
//! \brief Report the test as waived
//! \return EXIT_SUCCESS
//!
static int reportWaive(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kWAIVED);
return EXIT_SUCCESS;
}
//!
//! \brief Report the test as passed or failed depending on \p pass
//! \return EXIT_SUCCESS when \p pass is true, EXIT_FAILURE otherwise
//!
static int reportTest(const TestAtom& testAtom, bool pass)
{
return pass ? reportPass(testAtom) : reportFail(testAtom);
}
//!
//! \brief Returns the reportable severity currently configured for this logger
//!
Severity getReportableSeverity() const
{
return mReportableSeverity;
}
private:
//!
//! \brief returns an appropriate string for prefixing a log message with the given severity
//!
static const char* severityPrefix(Severity severity)
{
switch (severity)
{
case Severity::kINTERNAL_ERROR: return "[F] ";
case Severity::kERROR: return "[E] ";
case Severity::kWARNING: return "[W] ";
case Severity::kINFO: return "[I] ";
case Severity::kVERBOSE: return "[V] ";
default: assert(0); return "";
}
}
//!
//! \brief returns an appropriate string for prefixing a test result message with the given result
//!
static const char* testResultString(TestResult result)
{
switch (result)
{
case TestResult::kRUNNING: return "RUNNING";
case TestResult::kPASSED: return "PASSED";
case TestResult::kFAILED: return "FAILED";
case TestResult::kWAIVED: return "WAIVED";
default: assert(0); return "";
}
}
//!
//! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
//!
static std::ostream& severityOstream(Severity severity)
{
return severity >= Severity::kINFO ? std::cout : std::cerr;
}
//!
//! \brief method that implements logging test results
//!
//! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the stream chosen for kINFO.
//!
static void reportTestResult(const TestAtom& testAtom, TestResult result)
{
severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
<< testAtom.mCmdline << std::endl;
}
//!
//! \brief generate a command line string from the given (argc, argv) values
//!
//! The arguments are joined with single spaces, in order.
//!
static std::string genCmdlineString(int argc, char const* const* argv)
{
std::stringstream ss;
for (int i = 0; i < argc; i++)
{
if (i > 0)
ss << " ";
ss << argv[i];
}
return ss.str();
}
Severity mReportableSeverity; //!< threshold handed to every LogStreamConsumer created by log()
};
namespace
{

//!
//! \brief Builds a stream for messages of severity kVERBOSE, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//! LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Builds a stream for messages of severity kINFO, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//! LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Builds a stream for messages of severity kWARNING, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//! LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Builds a stream for messages of severity kERROR, filtered by the logger's reportable severity
//!
//! Example usage:
//!
//! LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Builds a stream for messages of severity kINTERNAL_ERROR ("fatal" severity),
//!        filtered by the logger's reportable severity
//!
//! Example usage:
//!
//! LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

} // anonymous namespace
#endif // TENSORRT_LOGGING_H

29
third_party/trt_yolov8/include/macros.h vendored Executable file
View File

@@ -0,0 +1,29 @@
// Portability macros for the trt_yolov8 sources: symbol export/import (API) and
// TensorRT-version-dependent qualifiers (TRT_NOEXCEPT, TRT_CONST_ENQUEUE).
// NOTE(review): identifiers containing a double underscore, such as __MACROS_H, are reserved
// for the implementation; consider renaming the guard once it is confirmed that no other
// header depends on this exact name.
#ifndef __MACROS_H
#define __MACROS_H
#include "NvInfer.h"
// API: marks symbols as exported when this library itself is being built (API_EXPORTS defined)
// and as imported (MSVC) or unadorned (other compilers) for its consumers.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else
#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif // API_EXPORTS
// With TensorRT 8 or newer the macros expand to `noexcept` / `const`; with older versions they
// expand to nothing, so the same override declarations can be written once for both cases.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif
#endif // __MACROS_H

31
third_party/trt_yolov8/include/model.h vendored Executable file
View File

@@ -0,0 +1,31 @@
#pragma once
#include <assert.h>
#include <string>
#include "NvInfer.h"
// Engine builders, one per model variant. All of them share the same leading parameters:
//   builder / config - TensorRT builder objects supplied by the caller
//   dt               - TensorRT data type passed by the caller
//   wts_path         - path of the .wts weights file
//   gd / gw          - presumably the depth and width multipliers of the model size - TODO confirm
//   max_channels     - presumably the channel cap of the model size (absent for Cls) - TODO confirm
// gd, gw and max_channels are taken by non-const reference.
// Each function returns a nvinfer1::IHostMemory*, presumably the serialized engine - confirm
// ownership/lifetime against the definitions, which are not part of this header.

// Detection model.
nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
// Detection model, "P6" variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
// Detection model, "P2" variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
// Classification model (no max_channels parameter).
nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw);
// Segmentation model.
nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
// Pose-estimation model.
nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
// Pose-estimation model, "P6" variant.
nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

34
third_party/trt_yolov8/include/postprocess.h vendored Executable file
View File

@@ -0,0 +1,34 @@
#pragma once
#include <opencv2/opencv.hpp>
// The declarations below use std::string, std::unordered_map and std::vector directly,
// so include them here instead of relying on transitive includes.
#include <string>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "types.h"

// Post-processing entry points. The definitions are not part of this header; the notes
// below describe only what the signatures show.

// Geometry helpers operating on an image and a 4-float box (see Detection::bbox in types.h);
// the landmark variant additionally takes 51 keypoint floats.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);
cv::Mat scale_mask(cv::Mat mask, cv::Mat img);
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[51]);

// Non-maximum suppression on raw network output; results are appended to / stored in `res`
// (`batch_res` for the batched variant). nms_thresh defaults to 0.5.
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

// Drawing helpers: one vector of detections per image of the batch.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

// Conversion of the host-side decode buffer (`decode_ptr_host`, `bbox_element` floats per box)
// into Detection objects.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);

// Decode and NMS variants that take a CUDA stream (presumably executed on the GPU - TODO confirm).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Draws detections together with their masks; `labels_map` maps class ids to label text.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);

16
third_party/trt_yolov8/include/preprocess.h vendored Executable file
View File

@@ -0,0 +1,16 @@
#pragma once
#include <opencv2/opencv.hpp>
// uint8_t and std::vector are used directly below, so include their headers here
// instead of relying on transitive includes.
#include <cstdint>
#include <map>
#include <vector>
#include "NvInfer.h"
#include "types.h"

// Pre-processing entry points. The definitions are not part of this header.

// Set-up and tear-down of the pre-processing resources; `max_image_size` is presumably the
// largest source image (in pixels) that will be passed in later - TODO confirm.
void cuda_preprocess_init(int max_image_size);
void cuda_preprocess_destroy();

// Converts one source image (`src`, `src_width` x `src_height`) into the network input buffer
// `dst` of size `dst_width` x `dst_height`, using the given CUDA stream.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);

// Same conversion for every image of `img_batch`.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);

24
third_party/trt_yolov8/include/types.h vendored Executable file
View File

@@ -0,0 +1,24 @@
#pragma once
#include <string>
#include "config.h"
// One detected object. The struct consists only of 4-byte fields and is aligned like a float.
// NOTE(review): the layout is presumably shared with code that reads/writes it as a flat float
// buffer - do not reorder or resize fields without checking those users.
struct alignas(float) Detection {
//center_x center_y w h
float bbox[4];
float conf; // bbox_conf * cls_conf
int class_id;
float mask[32]; // presumably mask coefficients for segmentation models - TODO confirm
float keypoints[51]; // 17*3 keypoints
};
// Result of a classification model: class index and its confidence.
struct Classification {
int class_id;
float conf;
};
// Six floats, presumably the coefficients of a 2x3 affine transform - TODO confirm.
struct AffineMatrix {
float value[6];
};
// Number of floats per decoded box: sizeof(AffineMatrix) / sizeof(float) is 6, plus 1 gives 7.
const int bbox_element =
sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag

86
third_party/trt_yolov8/include/utils.h vendored Executable file
View File

@@ -0,0 +1,86 @@
#pragma once
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
// Letterboxes `img` into an input_w x input_h canvas: the image is resized with its aspect
// ratio preserved (bilinear interpolation) and centered on a grey (128, 128, 128) background.
// Returns the new canvas; `img` itself is not modified.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);
    // The smaller scale factor decides which dimension fills the canvas completely.
    const bool width_limited = scale_h > scale_w;

    const int new_w = width_limited ? input_w : static_cast<int>(scale_h * img.cols);
    const int new_h = width_limited ? static_cast<int>(scale_w * img.rows) : input_h;
    // Offsets that center the resized image along the dimension that is not filled.
    const int off_x = width_limited ? 0 : (input_w - new_w) / 2;
    const int off_y = width_limited ? (input_h - new_h) / 2 : 0;

    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    const cv::Rect target(off_x, off_y, resized.cols, resized.rows);
    resized.copyTo(canvas(target));
    return canvas;
}
// Appends the name of every entry of directory `p_dir_name` to `file_names`, skipping the
// special entries "." and "..". Only the bare entry names are stored, not full paths.
// Returns 0 on success, or -1 when the directory cannot be opened (file_names is then untouched).
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const char* entry_name = entry->d_name;
        const bool is_self_or_parent = strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0;
        if (is_self_or_parent) {
            continue;
        }
        file_names.emplace_back(entry_name);
    }

    closedir(dir_handle);
    return 0;
}
// Returns a copy of `str` with leading and trailing whitespace removed (despite the name,
// both ends are trimmed). Whitespace means space, tab, newline, vertical tab, form feed and
// carriage return, so lines read from files with Windows line endings lose their trailing
// '\r' as well. A string consisting only of whitespace yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    constexpr char kWhitespace[] = " \t\n\v\f\r";
    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace (or empty input): there is no content to keep.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}
// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed-point notation with `n` digits after the decimal point
// (2 by default) and returns the text.
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatter;
    formatter.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatter.precision(n);
    formatter << a_value;
    return formatter.str();
}
// Reads a labels file with one label per line and stores each line, stripped by
// trim_leading_whitespace(), in `labels_map` under its zero-based line number.
// Existing entries with the same key are overwritten; other entries are left alone.
// Returns 0 on success, or -1 when the file cannot be opened (labels_map is then untouched),
// so that a missing or unreadable labels file is no longer reported as success.
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        ++index;
    }
    // The stream closes the file when it goes out of scope.
    return 0;
}

334
third_party/trt_yolov8/plugin/yololayer.cu vendored Executable file
View File

@@ -0,0 +1,334 @@
#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "../include/cuda_utils.h"
#include "../include/types.h"
#include "yololayer.h"
namespace Tn {
// Helpers for (de)serializing plain values into a raw byte buffer.
//
// Both functions copy the object representation with memcpy instead of dereferencing a
// reinterpret_cast'ed pointer: the cursor may point to an address that is not suitably
// aligned for T (for example after a bool has been written), and a typed access through such
// a pointer is undefined behavior. T is expected to be trivially copyable.

// Copies `val` to the position `buffer` points at and advances `buffer` by sizeof(T).
template <typename T>
void write(char*& buffer, const T& val) {
    memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

// Copies sizeof(T) bytes from the position `buffer` points at into `val` and advances
// `buffer` by sizeof(T).
template <typename T>
void read(const char*& buffer, T& val) {
    memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}
}  // namespace Tn
// Logistic sigmoid, 1 / (1 + e^(-x)), callable from device code.
__device__ float sigmoid(float x) {
    const float denominator = 1.0f + exp(-x);
    return 1.0f / denominator;
}
namespace nvinfer1 {
// Creates a plugin from explicit settings.
// `strides` must point to `stridesLength` integers; they are copied into an array owned by
// the plugin (released in the destructor), so the caller's array need not outlive it.
// NOTE(review): mThreadCount is written by serialize() but is not assigned here - presumably
// it has a default member initializer in yololayer.h; confirm.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides,
                                 int stridesLength) {
    // Task selection.
    is_segmentation_ = is_segmentation;
    is_pose_ = is_pose;

    // Model configuration.
    mClassCount = classCount;
    mNumberofpoints = numberofpoints;
    mConfthreshkeypoints = confthreshkeypoints;
    mYoloV8NetWidth = netWidth;
    mYoloV8netHeight = netHeight;
    mMaxOutObject = maxOut;

    // Private copy of the stride table.
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    for (int idx = 0; idx < stridesLength; ++idx) {
        mStrides[idx] = strides[idx];
    }
}
// Release the stride table owned by this plugin.
YoloLayerPlugin::~YoloLayerPlugin() {
    delete[] mStrides;  // delete[] on a null pointer is a no-op
    mStrides = nullptr;
}
// Rebuild a plugin from the byte stream produced by serialize().
// Fields are consumed in exactly the order serialize() writes them; the final
// assert checks that precisely `length` bytes were consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    const char* const begin = static_cast<const char*>(data);
    const char* cursor = begin;

    Tn::read(cursor, mClassCount);
    Tn::read(cursor, mNumberofpoints);
    Tn::read(cursor, mConfthreshkeypoints);
    Tn::read(cursor, mThreadCount);
    Tn::read(cursor, mYoloV8NetWidth);
    Tn::read(cursor, mYoloV8netHeight);
    Tn::read(cursor, mMaxOutObject);
    Tn::read(cursor, mStridesLength);

    // The stride table is stored inline, right after its element count.
    mStrides = new int[mStridesLength];
    for (int s = 0; s < mStridesLength; ++s) {
        Tn::read(cursor, mStrides[s]);
    }

    Tn::read(cursor, is_segmentation_);
    Tn::read(cursor, is_pose_);
    assert(cursor == begin + length);
}
// Write the plugin state into `buffer` as a flat byte stream.
// The field order here defines the on-disk layout and must stay in sync with
// the deserialising constructor and with getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    char* const begin = static_cast<char*>(buffer);
    char* cursor = begin;

    Tn::write(cursor, mClassCount);
    Tn::write(cursor, mNumberofpoints);
    Tn::write(cursor, mConfthreshkeypoints);
    Tn::write(cursor, mThreadCount);
    Tn::write(cursor, mYoloV8NetWidth);
    Tn::write(cursor, mYoloV8netHeight);
    Tn::write(cursor, mMaxOutObject);
    Tn::write(cursor, mStridesLength);

    for (int s = 0; s < mStridesLength; ++s) {
        Tn::write(cursor, mStrides[s]);
    }

    Tn::write(cursor, is_segmentation_);
    Tn::write(cursor, is_pose_);
    assert(cursor == begin + getSerializationSize());
}
// Number of bytes serialize() writes: every scalar member plus the stride table.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    const size_t scalarBytes = sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) +
                               sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) +
                               sizeof(mMaxOutObject) + sizeof(mStridesLength);
    const size_t strideBytes = sizeof(int) * mStridesLength;
    const size_t flagBytes = sizeof(is_segmentation_) + sizeof(is_pose_);
    return scalarBytes + strideBytes + flagBytes;
}
// Nothing is set up here; the function always returns 0.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
return 0;
}
// Shape of the single output: (1 + floats needed for mMaxOutObject Detection
// records, 1, 1). The extra leading element matches the counter slot that
// forwardGpu() reserves at the start of each image's output region.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    const int detectionFloats = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(detectionFloats + 1, 1, 1);
}
// Stores the pointer itself, not a copy of the characters: the caller's string
// must stay alive for as long as this plugin may hand the pointer back out.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
mPluginNamespace = pluginNamespace;
}
// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
return mPluginNamespace;
}
// The output is always reported as FP32, regardless of the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return nvinfer1::DataType::kFLOAT;
}
// Always answers false: no output is declared as broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
int nbInputs) const TRT_NOEXCEPT {
return false;
}
// Always answers false: no input is accepted as broadcast across the batch.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
return false;
}
// Intentionally empty: the plugin does not react to the tensor descriptions.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {
}
// Intentionally empty: none of the offered context resources are used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
}
// Intentionally empty: attachToContext() acquires nothing, so nothing is released.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
// Type name of the plugin; the same string is returned by YoloPluginCreator::getPluginName().
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
return "YoloLayer_TRT";
}
// Version string of the plugin; the same string is returned by YoloPluginCreator::getPluginVersion().
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
return "1";
}
// Self-deletes the plugin; the object must have been allocated with `new`
// and must not be used after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
delete this;
}
// Create a heap-allocated copy built from this plugin's constructor parameters
// and carrying the same namespace pointer. The stride table is deep-copied by
// the constructor. Note that mThreadCount is not passed on; the copy starts
// with the default value.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    auto* copy = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth,
                                     mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, mStrides,
                                     mStridesLength);
    copy->setPluginNamespace(mPluginNamespace);
    return copy;
}
// Run the decoding kernels for one batch on `stream`.
// inputs  : one device pointer per detection scale (read as float arrays).
// outputs : outputs[0] receives the detection buffer (see forwardGpu()).
// Always returns 0.
//
// The qualifiers now match the declaration in yololayer.h
// (`const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs`). Previously
// the macro sat on `inputs` instead, which is only the same type when
// TRT_CONST_ENQUEUE expands to `const`.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu(reinterpret_cast<const float* const*>(inputs), static_cast<float*>(outputs[0]), stream,
               mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}
// Device-side logistic function using the single-precision expf().
__device__ float Logist(float data) {
    const float denominator = 1.0f + expf(-data);
    return 1.0f / denominator;
}
// Decode one detection scale. Each thread handles one grid cell of one image
// (idx -> batchIdx, elemIdx) and, if the cell's best class score passes the
// threshold, appends one Detection record to that image's output region.
//
// input        : per image, info_len planes of total_grid floats each; value c
//                of cell e is read at curInput[e + c * total_grid].
// output       : per image, outputElem floats: [0] is a counter incremented
//                atomically, followed by the Detection records.
// numElements  : total number of threads with work (cells * images).
// maxoutobject : maximum number of records stored per image.
// nk           : number of keypoints per detection (pose only).
// confkeypoints: keypoints scoring below this are written as -1.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
bool is_segmentation, bool is_pose) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx >= numElements)
return;
const int N_kpts = nk;
int total_grid = grid_h * grid_w;
// Planes per cell: 4 box values, class scores, then optional 32 mask
// coefficients and optional (x, y, conf) triples per keypoint.
int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0);
int batchIdx = idx / total_grid;
int elemIdx = idx % total_grid;
const float* curInput = input + batchIdx * total_grid * info_len;
int outputIdx = batchIdx * outputElem;
// Pick the class with the highest sigmoid-activated score.
int class_id = 0;
float max_cls_prob = 0.0;
for (int i = 4; i < 4 + classes; i++) {
float p = Logist(curInput[elemIdx + i * total_grid]);
if (p > max_cls_prob) {
max_cls_prob = p;
class_id = i - 4;
}
}
// Hard-coded pre-filter on the best class score.
if (max_cls_prob < 0.1)
return;
// Reserve a slot. The counter keeps growing even when the slot is rejected
// below, so it can end up larger than maxoutobject.
int count = (int)atomicAdd(output + outputIdx, 1);
if (count >= maxoutobject)
return;
// Records start right after the counter float.
char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
Detection* det = (Detection*)(data);
int row = elemIdx / grid_w;
int col = elemIdx % grid_w;
det->conf = max_cls_prob;
det->class_id = class_id;
// The four box planes are offsets from the cell centre (in grid units);
// bbox is stored as left, top, right, bottom scaled by the stride.
det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
if (is_segmentation) {
// Copy the 32 mask values unchanged.
for (int k = 0; k < 32; ++k) {
det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid];
}
}
if (is_pose) {
for (int kpt = 0; kpt < N_kpts; kpt++) {
int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid;
int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid;
int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 2) * total_grid;
float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);
float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;
bool is_within_bbox =
kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];
// Keypoints that are low-confidence or fall outside the box are marked
// with -1 in all three slots.
if (kpt_confidence < confkeypoints || !is_within_bbox) {
det->keypoints[kpt * 3] = -1;
det->keypoints[kpt * 3 + 1] = -1;
det->keypoints[kpt * 3 + 2] = -1;
} else {
det->keypoints[kpt * 3] = kpt_x;
det->keypoints[kpt * 3 + 1] = kpt_y;
det->keypoints[kpt * 3 + 2] = kpt_confidence;
}
}
}
}
// Launch one CalDetection kernel per detection scale on `stream`.
//
// inputs  : inputs[i] is the device tensor of scale i (stride mStrides[i]).
// output  : device buffer holding, per image, one counter float followed by
//           mMaxOutObject Detection records; the counter is zeroed here.
// The two size parameters are the network input height and width (they shadow
// the members of the same name).
//
// Changes compared with the previous version:
//  * the block size is a local value, so the member mThreadCount is no longer
//    shrunk permanently by a small grid (and can never become 0);
//  * scales that yield no cells are skipped instead of launching with a
//    zero-sized configuration;
//  * the variable-length array and the redundant, unchecked memset are gone.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the per-image detection counters.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV8netHeight / stride;
        const int grid_w = mYoloV8NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;
        if (numElem <= 0) {
            continue;
        }

        // One thread per grid cell and image; never more threads per block
        // than there is work.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;
        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w,
                                                      stride, mClassCount, mNumberofpoints, mConfthreshkeypoints,
                                                      outputElem, is_segmentation_, is_pose_);
    }
}
// Definitions of the creator's static members: the field collection handed out
// by getFieldNames() and the vector whose storage it points to.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
// Publish an empty attribute list: the shared vector is cleared and the static
// field collection is pointed at it.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}
// Name under which the creator is looked up; equals YoloLayerPlugin::getPluginType().
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
return "YoloLayer_TRT";
}
// Version string of the creator; equals YoloLayerPlugin::getPluginVersion().
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
return "1";
}
// Returns the static field collection, which the constructor leaves empty.
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
return &mFC;
}
// Build a YoloLayerPlugin from a single int32 field named "combinedInfo".
//
// Field layout (ints):
//   [0] class count          [1] number of keypoints   [2] keypoint conf threshold
//   [3] input width          [4] input height          [5] max output objects
//   [6] is_segmentation      [7] is_pose               [8..] one stride per scale
//
// Returns nullptr when the field collection does not have that shape. The
// checks used to be asserts only, so in builds with NDEBUG a short field led
// to a negative array size inside this noexcept function.
//
// NOTE(review): the keypoint threshold travels as an int and is converted to
// float here, so a fractional value would already have been truncated by the
// producer of the field - confirm against the code that fills "combinedInfo".
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    const int netinfo_count = 8;

    const bool has_single_field = fc != nullptr && fc->nbFields == 1 && fc->fields != nullptr;
    const bool field_ok = has_single_field && fc->fields[0].name != nullptr &&
                          strcmp(fc->fields[0].name, "combinedInfo") == 0 && fc->fields[0].data != nullptr &&
                          fc->fields[0].length >= netinfo_count;
    assert(field_ok && "createPlugin expects exactly one 'combinedInfo' field with at least 8 ints");
    if (!field_ok) {
        return nullptr;
    }

    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
    const int class_count = combinedInfo[0];
    const int numberofpoints = combinedInfo[1];
    const float confthreshkeypoints = combinedInfo[2];
    const int input_w = combinedInfo[3];
    const int input_h = combinedInfo[4];
    const int max_output_object_count = combinedInfo[5];
    const bool is_segmentation = combinedInfo[6];
    const bool is_pose = combinedInfo[7];

    // Everything after the fixed header is the stride table.
    const int* px_arry = combinedInfo + netinfo_count;
    const int px_arry_length = fc->fields[0].length - netinfo_count;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
// Recreate a plugin from serialized bytes and give it this creator's namespace.
// The returned object is heap-allocated; it is expected to be released through
// YoloLayerPlugin::destroy() when the network is destroyed.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    auto* plugin = new YoloLayerPlugin(serialData, serialLength);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}
} // namespace nvinfer1

109
third_party/trt_yolov8/plugin/yololayer.h vendored Executable file
View File

@@ -0,0 +1,109 @@
#pragma once
#include <string>
#include <vector>
#include "NvInfer.h"
#include "../include/macros.h"
namespace nvinfer1 {
// TensorRT plugin that turns the raw YOLOv8 head tensors (one per stride) into
// a flat buffer of Detection records on the GPU.
//
// Changes compared with the previous declaration:
//  * mPluginNamespace and mStrides have defined initial values;
//  * copying is disabled, because the class owns mStrides through a raw
//    pointer and a member-wise copy would free it twice (use clone());
//  * getOutputDataType is marked `override` like its siblings.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Build from explicit parameters; `strides` (stridesLength ints) is copied.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength);
    // Build from the byte stream written by serialize().
    YoloLayerPlugin(const void* data, size_t length);
    ~YoloLayerPlugin();

    YoloLayerPlugin(const YoloLayerPlugin&) = delete;
    YoloLayerPlugin& operator=(const YoloLayerPlugin&) = delete;

    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
    int initialize() TRT_NOEXCEPT override;
    virtual void terminate() TRT_NOEXCEPT override {}
    // No scratch memory is requested.
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;
    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
    // Only linear FP32 tensors are accepted, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }
    const char* getPluginType() const TRT_NOEXCEPT override;
    const char* getPluginVersion() const TRT_NOEXCEPT override;
    void destroy() TRT_NOEXCEPT override;
    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
    const char* getPluginNamespace() const TRT_NOEXCEPT override;
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;
    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;
    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;
    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Launches the decoding kernels; the two size parameters are the network
    // input height and width.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);

    int mThreadCount = 256;               // upper bound for threads per CUDA block
    const char* mPluginNamespace = "";    // not owned; see setPluginNamespace()
    int mClassCount;
    int mNumberofpoints;                  // keypoints per detection (pose models)
    float mConfthreshkeypoints;           // keypoints below this confidence are discarded
    int mYoloV8NetWidth;
    int mYoloV8netHeight;
    int mMaxOutObject;                    // capacity of the output buffer, in detections per image
    bool is_segmentation_;
    bool is_pose_;
    int* mStrides = nullptr;              // owned array of mStridesLength strides
    int mStridesLength;
};
// Factory registered with TensorRT for the "YoloLayer_TRT" plugin: creates
// plugins from a PluginFieldCollection or from serialized bytes.
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();
    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;
    const char* getPluginVersion() const TRT_NOEXCEPT override;
    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    // Build a plugin from the fields in `fc`.
    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
    // Build a plugin from bytes previously produced by YoloLayerPlugin::serialize().
    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    // The namespace is copied into an owned std::string.
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
        mNamespace = libNamespace;
    }
    const char* getPluginNamespace() const TRT_NOEXCEPT override {
        return mNamespace.c_str();
    }

   private:
    std::string mNamespace;
    // Shared by all creator instances; defined in the implementation file.
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};
// Registers YoloPluginCreator with TensorRT's plugin registry.
// NOTE(review): because this sits in a header, the macro is expanded in every
// translation unit that includes it - confirm that this is intended.
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
} // namespace nvinfer1

View File

@@ -0,0 +1,11 @@
from ultralytics import YOLO

# NOTE: trt_yolov8 does not need the ONNX format at all; this script is only
# used to visualize YOLOv8 networks on https://netron.app.

# Pretrained checkpoint to export.
WEIGHTS_FILE = "../../../../vp_data/models/trt/others/yolov8n-seg.pt"

# Load the model from the checkpoint.
model = YOLO(WEIGHTS_FILE)

# Export to ONNX; the result of export() is kept in `path`.
path = model.export(format="onnx")

57
third_party/trt_yolov8/samples/gen_wts.py vendored Executable file
View File

@@ -0,0 +1,57 @@
import sys
import argparse
import os
import struct
import torch
def parse_args():
    """Parse the command line and resolve the output path.

    Returns a tuple ``(weights_path, wts_path, model_type)``.

    * ``-w/--weights`` must name an existing file, otherwise the script exits
      with ``'Invalid input file'``.
    * Without ``-o/--output`` the output is the weights path with its
      extension replaced by ``.wts``.
    * If ``-o`` names an existing directory, the output is written into that
      directory under the weights file's base name plus ``.wts``.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    if not os.path.isfile(opts.weights):
        raise SystemExit('Invalid input file')

    if not opts.output:
        # Default: next to the input file, same stem.
        stem, _ = os.path.splitext(opts.weights)
        opts.output = stem + '.wts'
    elif os.path.isdir(opts.output):
        # A directory was given: derive the file name from the input.
        stem, _ = os.path.splitext(os.path.basename(opts.weights))
        opts.output = os.path.join(opts.output, stem + '.wts')

    return opts.weights, opts.output, opts.type
pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')
print(f'Loading {pt_file}')

# Everything runs on the CPU.
device = 'cpu'

# NOTE: torch.load unpickles the checkpoint - only use it on files you trust.
model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32

if m_type in ['detect', 'seg']:
    # Drop the `anchors` attribute from the last module of the model.
    head = model.model[-1]
    anchor_grid = head.anchors * head.stride[..., None, None]
    delattr(head, 'anchors')

model.to(device).eval()

# Output format: first line is the number of tensors; then one line per tensor:
# "<name> <count> " followed by " <hex>" for each value, where <hex> is the
# big-endian IEEE-754 float32 encoding of the value.
with open(wts_file, 'w') as f:
    state = model.state_dict()
    f.write('{}\n'.format(len(state.keys())))
    for k, v in state.items():
        values = v.reshape(-1).cpu().numpy()
        hex_words = ''.join(' ' + struct.pack('>f', float(x)).hex() for x in values)
        f.write('{} {} {}\n'.format(k, len(values), hex_words))

View File

@@ -0,0 +1,36 @@
#include "../trt_yolov8_classifier.h"
int main() {
trt_yolov8::trt_yolov8_classifier detector("./vp_data/models/trt/others/yolov8s-cls_v8.5.engine");
auto image1 = cv::imread("./vp_data/test_images/vehicle_cls/1.jpg");
auto image2 = cv::imread("./vp_data/test_images/vehicle_cls/5.jpg");
std::unordered_map<int, std::string> labels_map;
read_labels("./vp_data/models/imagenet_1000labels1.txt", labels_map);
std::vector<std::vector<Classification>> classifications;
std::vector<cv::Mat> images = {image1, image2};
detector.classify(images, classifications, 5); // top3 by default
for (int i = 0; i < classifications.size(); ++i) {
auto& classification = classifications[i];
auto& image = images[i];
for (int j = 0; j < classification.size(); ++j) {
std::cout << "(top" << j + 1 << ") class_id:" << classification[j].class_id << " conf:" << classification[j].conf << std::endl;
}
std::cout << std::endl;
// draw top1's label on image
cv::putText(image, "top1: " + labels_map.at(classification[0].class_id), cv::Point(10, 10), 1.5, 1, cv::Scalar(0, 0, 255));
}
cv::imshow("cls1", image1);
cv::imshow("cls2", image2);
cv::waitKey(0);
return 0;
}

View File

@@ -0,0 +1,25 @@
#include "../trt_yolov8_detector.h"
// Detection sample: runs the detector on a looping test video and shows the
// frames with bounding boxes. Press ESC in the window to quit.
// Returns -1 if the video cannot be opened.
int main() {
    trt_yolov8::trt_yolov8_detector detector("./vp_data/models/trt/others/yolov8s_v8.5.engine");

    cv::VideoCapture cap("./vp_data/test_video/face2.mp4");
    if (!cap.isOpened()) {
        // Without this check a missing file made the loop below spin forever.
        std::cerr << "failed to open test video!" << std::endl;
        return -1;
    }

    cv::Mat frame;
    while (true) {
        if (!cap.read(frame)) {
            // End of stream: rewind and play again.
            cap.set(cv::CAP_PROP_POS_FRAMES, 0);
            continue;
        }

        std::vector<std::vector<Detection>> detections;
        std::vector<cv::Mat> frames = {frame};
        detector.detect(frames, detections);
        draw_bbox(frames, detections);

        cv::imshow("detect", frame);
        if (cv::waitKey(40) == 27) {  // ESC
            break;
        }
    }
    return 0;
}

View File

@@ -0,0 +1,25 @@
#include "../trt_yolov8_pose_detector.h"
// Pose sample: runs the pose detector on a looping test video and shows the
// frames with boxes, keypoints and skeleton lines. Press ESC to quit.
// Returns -1 if the video cannot be opened.
int main() {
    trt_yolov8::trt_yolov8_pose_detector detector("./vp_data/models/trt/others/yolov8s-pose_v8.5.engine");

    cv::VideoCapture cap("./vp_data/test_video/face2.mp4");
    if (!cap.isOpened()) {
        // Without this check a missing file made the loop below spin forever.
        std::cerr << "failed to open test video!" << std::endl;
        return -1;
    }

    cv::Mat frame;
    while (true) {
        if (!cap.read(frame)) {
            // End of stream: rewind and play again.
            cap.set(cv::CAP_PROP_POS_FRAMES, 0);
            continue;
        }

        std::vector<std::vector<Detection>> detections;
        std::vector<cv::Mat> frames = {frame};
        detector.detect(frames, detections);
        draw_bbox_keypoints_line(frames, detections);

        cv::imshow("pose", frame);
        if (cv::waitKey(40) == 27) {  // ESC
            break;
        }
    }
    return 0;
}

View File

@@ -0,0 +1,29 @@
#include "../trt_yolov8_seg_detector.h"
int main() {
trt_yolov8::trt_yolov8_seg_detector detector("./vp_data/models/trt/others/yolov8s-seg_v8.5.engine");
cv::VideoCapture cap("./vp_data/test_video/face2.mp4");
std::unordered_map<int, std::string> labels_map;
read_labels("./vp_data/models/coco_80classes.txt", labels_map);
cv::Mat frame;
while (true) {
if (!cap.read(frame)) {
cap.set(cv::CAP_PROP_POS_FRAMES, 0);
continue;
}
std::vector<std::vector<Detection>> detections;
std::vector<std::vector<cv::Mat>> masks;
std::vector<cv::Mat> frames = {frame};
detector.detect(frames, detections, masks);
draw_mask_bbox(frame, detections[0], masks[0], labels_map);
cv::imshow("seg", frame);
cv::waitKey(40);
}
return 0;
}

View File

@@ -0,0 +1,45 @@
#include "../trt_yolov8_detector.h"
#include "../trt_yolov8_pose_detector.h"
#include "../trt_yolov8_seg_detector.h"
#include "../trt_yolov8_classifier.h"
// Convert a .wts weight file into a TensorRT engine.
//
// run command:
//   ./trt_yolov8_wts_2_engine [-det/-seg/-pose/-cls] [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]
//
// Returns 0 after a conversion was started for a known task type and -1 when
// the arguments are unusable. Previously a wrong argument count only printed
// the usage text and then still read argv[1..4].
int main(int argc, char** argv) {
    const auto print_usage = []() {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./trt_yolov8_wts_2_engine [-det/-seg/-pose/-cls] [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]" << std::endl;
    };

    if (argc != 5) {
        print_usage();
        return -1;
    }

    const std::string task_type = std::string(argv[1]);
    const std::string wts_name = std::string(argv[2]);
    const std::string engine_name = std::string(argv[3]);
    const std::string sub_type = std::string(argv[4]);

    if (task_type == "-det") {
        trt_yolov8::trt_yolov8_detector detector;
        detector.wts_2_engine(wts_name, engine_name, sub_type);
    } else if (task_type == "-seg") {
        trt_yolov8::trt_yolov8_seg_detector detector;
        detector.wts_2_engine(wts_name, engine_name, sub_type);
    } else if (task_type == "-pose") {
        trt_yolov8::trt_yolov8_pose_detector detector;
        detector.wts_2_engine(wts_name, engine_name, sub_type);
    } else if (task_type == "-cls") {
        trt_yolov8::trt_yolov8_classifier classifier;
        classifier.wts_2_engine(wts_name, engine_name, sub_type);
    } else {
        print_usage();
        return -1;
    }
    return 0;
}

268
third_party/trt_yolov8/src/block.cpp vendored Executable file
View File

@@ -0,0 +1,268 @@
#include "../include/block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "../include/config.h"
#include "../plugin/yololayer.h"
// Load a .wts text file into a name -> Weights map.
//
// File format: the first token is the number of tensors; each tensor follows
// as "<name> <count>" and then <count> hexadecimal 32-bit words.
// Every tensor's values are stored in a malloc'ed buffer referenced by the
// returned Weights; this function never frees those buffers.
//
// Fixes compared with the previous version:
//  * the buffer is sized with the element size (it used sizeof of the pointer,
//    which over-allocates on platforms with pointers wider than 4 bytes);
//  * `count` and `size` start at 0, so a failed read cannot leave them
//    uninitialised when asserts are compiled out;
//  * the loop stops on a non-positive count instead of counting down past zero.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count-- > 0) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;
        std::string name;
        input >> name >> std::dec >> size;

        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0; x < size; x++) {
            input >> std::hex >> val[x];
        }

        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}
// Add an inference-time BatchNorm as a per-channel scale layer:
//   y = x * gamma / sqrt(var + eps) + (beta - mean * gamma / sqrt(var + eps))
//
// The weights are looked up in `weightMap` under "<lname>.weight", ".bias",
// ".running_mean" and ".running_var". The computed scale/shift/power arrays
// are malloc'ed and registered in `weightMap` under "<lname>.scale", ".shift"
// and ".power".
//
// The map is now taken by reference: this file-local helper used to copy the
// whole map on every call, and the three entries registered here were
// discarded together with that copy.
static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                             std::map<std::string, nvinfer1::Weights>& weightMap,
                                             nvinfer1::ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    assert((len <= 0 || (scval && shval && pval)) && "Out of memory while building batch norm weights.");

    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;  // power of 1 leaves the scaled value unchanged
    }

    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(output);
    return output;
}
// Conv (k x k, stride s, padding p, no bias) -> BatchNorm -> SiLU.
// SiLU is built as x * sigmoid(x) from an activation layer and an element-wise
// product. Weights are taken from "<lname>.conv.weight" and "<lname>.bn.*".
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, int p, std::string lname) {
    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    auto* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);

    // SiLU(x) = x * sigmoid(x)
    auto* gate = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    auto* silu =
            network->addElementWise(*bn->getOutput(0), *gate->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(silu);
    return silu;
}
// Two stacked 3x3 convBnSiLU blocks ("<lname>.cv1", "<lname>.cv2").
// When `shortcut` is set and c1 == c2, the block input is added to the result
// and the element-wise sum layer is returned; otherwise the second conv block
// is returned. The parameter `e` is not used.
nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) {
    auto* first = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1");
    auto* second = convBnSiLU(network, weightMap, *first->getOutput(0), c2, 3, 1, 1, lname + ".cv2");

    const bool addResidual = shortcut && c1 == c2;
    if (!addResidual) {
        return second;
    }
    return network->addElementWise(input, *second->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
}
// C2f block: a 1x1 conv to 2*c_ channels is split in two halves along the
// first dimension; `n` bottlenecks are chained starting from the second half,
// and every bottleneck output is concatenated onto the running concatenation
// of both halves. A final 1x1 conv maps the result to c2 channels.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname) {
    int c_ = (float)c2 * e;

    auto* cv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* cv1Out = cv1->getOutput(0);
    const nvinfer1::Dims d = cv1Out->getDimensions();

    const nvinfer1::Dims3 halfShape{d.d[0] / 2, d.d[1], d.d[2]};
    const nvinfer1::Dims3 unitStep{1, 1, 1};
    auto* firstHalf = network->addSlice(*cv1Out, nvinfer1::Dims3{0, 0, 0}, halfShape, unitStep);
    auto* secondHalf = network->addSlice(*cv1Out, nvinfer1::Dims3{d.d[0] / 2, 0, 0}, halfShape, unitStep);

    nvinfer1::ITensor* halves[] = {firstHalf->getOutput(0), secondHalf->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(halves, 2);

    nvinfer1::ITensor* chain = secondHalf->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto* block = bottleneck(network, weightMap, *chain, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
        chain = block->getOutput(0);
        // Grow the concatenation by this bottleneck's output.
        nvinfer1::ITensor* grown[] = {cat->getOutput(0), block->getOutput(0)};
        cat = network->addConcatenation(grown, 2);
    }

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}
// C2 block: a 1x1 conv to 2*hidden channels is split in two halves along the
// first dimension; the first half runs through `n` chained bottlenecks and is
// then concatenated with the untouched second half. A final 1x1 conv maps the
// result to c2 channels.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    assert(network != nullptr);
    const int hidden = static_cast<int>(c2 * e);

    // cv1 produces both branches at once.
    auto* cv1 = convBnSiLU(network, weightMap, input, 2 * hidden, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* cv1Out = cv1->getOutput(0);

    const nvinfer1::Dims shape = cv1Out->getDimensions();
    const nvinfer1::Dims3 halfShape{shape.d[0] / 2, shape.d[1], shape.d[2]};
    const nvinfer1::Dims3 unitStep{1, 1, 1};
    auto* firstHalf = network->addSlice(*cv1Out, nvinfer1::Dims3{0, 0, 0}, halfShape, unitStep);
    auto* secondHalf = network->addSlice(*cv1Out, nvinfer1::Dims3{shape.d[0] / 2, 0, 0}, halfShape, unitStep);

    // Chain the bottlenecks: each one consumes the previous one's output.
    nvinfer1::ITensor* chain = firstHalf->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* block = bottleneck(network, weightMap, *chain, hidden, hidden, shortcut, 1.0,
                                 lname + ".m." + std::to_string(i));
        chain = block->getOutput(0);
    }

    nvinfer1::ITensor* branches[2] = {chain, secondHalf->getOutput(0)};
    auto* cat = network->addConcatenation(branches, 2);

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}
// SPPF block: a 1x1 conv to c1/2 channels, then three chained k x k max-pools
// (stride 1, padding k/2); the conv output and the three pool outputs are
// concatenated and mapped to c2 channels by a final 1x1 conv.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    const int c_ = c1 / 2;
    auto* cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1");

    // Adds one max-pool on top of `in` and configures stride and padding.
    auto addMaxPool = [&](nvinfer1::ITensor& in) {
        nvinfer1::IPoolingLayer* pool =
                network->addPoolingNd(in, nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
        pool->setStrideNd(nvinfer1::DimsHW{1, 1});
        pool->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
        return pool;
    };

    nvinfer1::IPoolingLayer* pool1 = addMaxPool(*cv1->getOutput(0));
    nvinfer1::IPoolingLayer* pool2 = addMaxPool(*pool1->getOutput(0));
    nvinfer1::IPoolingLayer* pool3 = addMaxPool(*pool2->getOutput(0));

    nvinfer1::ITensor* pyramid[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0),
                                    pool3->getOutput(0)};
    auto* cat = network->addConcatenation(pyramid, 4);

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
}
// DFL head: reshape the input to (4, 16, grid), swap the first two dimensions,
// apply softmax, reduce with a 1x1 single-output conv whose weights are
// weightMap[lname] (no bias), and reshape the result to (4, grid).
// The parameters `ch` and `k` are not used.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    auto* toBins = network->addShuffle(input);
    toBins->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid});
    toBins->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});

    auto* probs = network->addSoftMax(*toBins->getOutput(0));

    nvinfer1::Weights noBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    auto* reduce =
            network->addConvolutionNd(*probs->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], noBias);
    reduce->setStrideNd(nvinfer1::DimsHW{s, s});
    reduce->setPaddingNd(nvinfer1::DimsHW{p, p});

    auto* flatten = network->addShuffle(*reduce->getOutput(0));
    flatten->setReshapeDimensions(nvinfer1::Dims2{4, grid});
    return flatten;
}
// Create the "YoloLayer_TRT" plugin and add it to the network on top of the
// given detection heads.
//
// dets        : one concatenation layer per detection scale; output 0 of each
//               becomes an input of the plugin.
// px_arry     : px_arry_num ints appended after the fixed header (the plugin
//               creator reads them as the per-scale strides).
// The plugin receives a single int32 field "combinedInfo" with 8 header values
// followed by px_arry.
//
// Compared with the previous version, the creator, the plugin object and the
// resulting layer are asserted to be non-null before use, and the comments
// describe the 8-element header (they used to say 5).
//
// NOTE(review): kConfThreshKeypoints is stored in an int slot; if that
// constant is fractional it is truncated here - confirm against its definition.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    assert(creator && "YoloLayer_TRT plugin creator is not registered");

    const int netinfo_count = 8;                          // size of the fixed header
    const int total_count = netinfo_count + px_arry_num;  // header plus px_arry

    std::vector<int> combinedInfo(total_count);
    combinedInfo[0] = kNumClass;
    combinedInfo[1] = kNumberOfPoints;
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = kInputW;
    combinedInfo[4] = kInputH;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    // px_arry follows directly after the header.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Wrap the array in the single field the creator expects.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create YoloLayer_TRT plugin");

    // Output 0 of every detection head becomes a plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    assert(yoloLayer && "failed to add YoloLayer_TRT plugin to the network");
    return yoloLayer;
}

80
third_party/trt_yolov8/src/calibrator.cpp vendored Executable file
View File

@@ -0,0 +1,80 @@
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "../include/calibrator.h"
#include "../include/cuda_utils.h"
#include "../include/utils.h"
// Set up an INT8 calibrator: stores the parameters, allocates a device buffer
// for one batch of 3-channel float images and lists the files in `img_dir`.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    // Number of floats in one batch: 3 channels * width * height * batch size.
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));

    // The return value of read_files_in_dir is not checked here.
    read_files_in_dir(img_dir, img_files_);
}
// Releases the device-side batch buffer that the constructor allocated with cudaMalloc.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
CUDA_CHECK(cudaFree(device_input_));
}
// Returns the batch size given to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
return batchsize_;
}
// Loads the next `batchsize_` images, preprocesses them into one normalized RGB blob,
// uploads the blob to the device buffer and publishes that buffer as binding 0.
// Returns false when fewer than a full batch of files is left or when an image cannot be
// read. `nbBindings` is not used.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    const int batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> batch_imgs;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << " " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        batch_imgs.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    // Advance the cursor only after the whole batch was loaded successfully.
    img_idx_ = batch_end;

    // Scale to [0, 1] and swap R/B (swapRB = true); no cropping.
    cv::Mat blob = cv::dnn::blobFromImages(batch_imgs, 1.0 / 255.0, cv::Size(input_w_, input_h_),
                                           cv::Scalar(0, 0, 0), true, false);
    const auto blob_bytes = input_count_ * sizeof(float);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), blob_bytes, cudaMemcpyHostToDevice));

    // The first binding must be the input blob this calibrator was configured for.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}
// Loads the calibration table from `calib_table_name_` into `calib_cache_`.
// `length` receives the number of bytes read. Returns a pointer to the cached bytes, or
// nullptr when caching is disabled, the file cannot be read, or the file is empty.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();

    std::ifstream table(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the stream iterator would otherwise skip them.
    table >> std::noskipws;
    const bool usable = read_cache_ && table.good();
    if (usable)
    {
        std::istream_iterator<char> first(table);
        std::istream_iterator<char> last;
        std::copy(first, last, std::back_inserter(calib_cache_));
    }

    length = calib_cache_.size();
    if (length == 0)
    {
        return nullptr;
    }
    return calib_cache_.data();
}
// Writes `length` bytes of calibration data to `calib_table_name_`, replacing any
// existing file content.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;

    const char* bytes = static_cast<const char*>(cache);
    std::ofstream table(calib_table_name_, std::ios::binary);
    table.write(bytes, length);
}

1853
third_party/trt_yolov8/src/model.cpp vendored Executable file

File diff suppressed because it is too large Load Diff

259
third_party/trt_yolov8/src/postprocess.cpp vendored Executable file
View File

@@ -0,0 +1,259 @@
#include "../include/postprocess.h"
#include "../include/utils.h"
// Maps a box given in network-input coordinates (left, top, right, bottom) back to the
// coordinates of `img`, undoing the scale and the centred padding along the padded axis.
// Returns the box as (x, y, width, height) with rounded values; it is not clipped to the image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    float left, top, right, bottom;
    if (r_h > r_w) {
        // Width is the limiting side: padding was added above and below.
        const auto pad_y = (kInputH - r_w * img.rows) / 2;
        left = bbox[0] / r_w;
        right = bbox[2] / r_w;
        top = (bbox[1] - pad_y) / r_w;
        bottom = (bbox[3] - pad_y) / r_w;
    } else {
        // Height is the limiting side: padding was added left and right.
        const auto pad_x = (kInputW - r_h * img.cols) / 2;
        left = (bbox[0] - pad_x) / r_h;
        right = (bbox[2] - pad_x) / r_h;
        top = bbox[1] / r_h;
        bottom = bbox[3] / r_h;
    }
    return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}
// Maps a box and its 17 keypoints from network-input coordinates back to `img`
// coordinates. `lmk` holds 17 triples (x, y, score) and is rewritten IN PLACE: x and y
// are converted, the score is left untouched. Box edges are truncated to int.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[51]) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    int left, right, top, bottom;
    if (r_h > r_w) {
        // Padding sits above and below the image content.
        left = bbox[0] / r_w;
        right = bbox[2] / r_w;
        top = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
        bottom = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
        for (int k = 0; k < 17; ++k) {
            float* kp = lmk + 3 * k;
            kp[0] /= r_w;
            kp[1] = (kp[1] - (kInputH - r_w * img.rows) / 2) / r_w;
        }
    } else {
        // Padding sits left and right of the image content.
        left = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
        right = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
        top = bbox[1] / r_h;
        bottom = bbox[3] / r_h;
        for (int k = 0; k < 17; ++k) {
            float* kp = lmk + 3 * k;
            kp[0] = (kp[0] - (kInputW - r_h * img.cols) / 2) / r_h;
            kp[1] /= r_h;
        }
    }
    return cv::Rect(left, top, right - left, bottom - top);
}
// Intersection-over-union of two boxes given as (left, top, right, bottom).
// Returns 0 when the boxes do not overlap, and also when the union area is not a
// positive number (e.g. two coincident zero-area boxes), so the result is never NaN
// from a 0/0 division.
static float iou(float lbox[4], float rbox[4]) {
    const float left = (std::max)(lbox[0], rbox[0]);
    const float right = (std::min)(lbox[2], rbox[2]);
    const float top = (std::max)(lbox[1], rbox[1]);
    const float bottom = (std::min)(lbox[3], rbox[3]);
    if (top > bottom || left > right)
        return 0.0f;

    const float interBoxS = (right - left) * (bottom - top);
    const float unionBoxS =
            (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS;
    // Written as a negated comparison so that a NaN union is rejected as well.
    if (!(unionBoxS > 0.0f))
        return 0.0f;
    return interBoxS / unionBoxS;
}
// Sort predicate: higher confidence first; equal confidences are ordered by ascending
// bbox[0].
static bool cmp(const Detection& a, const Detection& b) {
    return (a.conf != b.conf) ? (a.conf > b.conf) : (a.bbox[0] < b.bbox[0]);
}
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
int det_size = sizeof(Detection) / sizeof(float);
std::map<float, std::vector<Detection>> m;
for (int i = 0; i < output[0]; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh)
continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0)
m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
for (size_t m = 0; m < dets.size(); ++m) {
auto& item = dets[m];
res.push_back(item);
for (size_t n = m + 1; n < dets.size(); ++n) {
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
dets.erase(dets.begin() + n);
--n;
}
}
}
}
}
// Runs nms() once per batch item. Item `b` reads its records from
// `output + b * output_size`; results are appended to res_batch[b].
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* item_output = output + b * output_size;
        nms(res_batch[b], item_output, conf_thresh, nms_thresh);
    }
}
// Collects the detections that survived GPU NMS.
// `decode_ptr_host` starts with one count slot followed by `count` rows of
// `bbox_element` floats laid out as: left, top, right, bottom, confidence, class, keep.
// Only rows whose keep flag equals 1 are appended to `res`. `img` is not used.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        const float* row = decode_ptr_host + 1 + i * bbox_element;
        const int keep_flag = row[6];
        if (keep_flag != 1) {
            continue;
        }
        det.bbox[0] = row[0];
        det.bbox[1] = row[1];
        det.bbox[2] = row[2];
        det.bbox[3] = row[3];
        det.conf = row[4];
        det.class_id = row[5];
        res.push_back(det);
    }
}
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
int bbox_element, const std::vector<cv::Mat>& img_batch) {
res_batch.resize(batch_size);
int count = static_cast<int>(*decode_ptr_host);
count = std::min(count, kMaxNumOutputBbox);
for (int i = 0; i < batch_size; i++) {
auto& img = const_cast<cv::Mat&>(img_batch[i]);
process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
}
}
// Draws each detection's box and numeric class id onto the matching image.
// res_batch[i] is drawn on img_batch[i]; the images are modified in place.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar text_color(0xFF, 0xFF, 0xFF);

    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& detections = res_batch[i];
        // Header copy only: it shares pixel data with img_batch[i].
        cv::Mat canvas = img_batch[i];
        for (auto& det : detections) {
            const cv::Rect box = get_rect(canvas, det.bbox);
            cv::rectangle(canvas, box, box_color, 2);
            cv::putText(canvas, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, text_color, 2);
        }
    }
}
// Draws boxes, class ids, keypoints and skeleton lines for pose detections.
// Side effect: the keypoints stored in res_batch are converted to image coordinates in
// place (via get_rect_adapt_landmark). Keypoints and bones are drawn only when the
// involved keypoint scores exceed 0.5.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Pairs of keypoint indices that are connected by a line.
    const std::vector<std::pair<int, int>> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
    const cv::Scalar box_color(0x27, 0xC1, 0x36);
    const cv::Scalar joint_color(0, 0x27, 0xC1);

    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& detections = res_batch[i];
        cv::Mat canvas = img_batch[i];
        for (auto& det : detections) {
            const cv::Rect box = get_rect_adapt_landmark(canvas, det.bbox, det.keypoints);
            cv::rectangle(canvas, box, box_color, 2);
            cv::putText(canvas, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);

            const float* kps = det.keypoints;
            // Integer pixel position of the keypoint whose triple starts at `base`.
            auto point_at = [kps](int base) { return cv::Point((int)kps[base], (int)kps[base + 1]); };

            for (int base = 0; base < 51; base += 3) {
                if (kps[base + 2] > 0.5) {
                    cv::circle(canvas, point_at(base), 3, joint_color, -1);
                }
            }
            for (const auto& bone : skeleton_pairs) {
                const int from = bone.first * 3;
                const int to = bone.second * 3;
                if (kps[from + 2] > 0.5 && kps[to + 2] > 0.5) {
                    cv::line(canvas, point_at(from), point_at(to), joint_color, 2);
                }
            }
        }
    }
}
// Cuts the non-padded region out of a network-sized mask and resizes it to the size of
// `img`. The region is centred along the padded axis.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float r_w = kInputW / (img.cols * 1.0);
    const float r_h = kInputH / (img.rows * 1.0);

    cv::Rect content;
    if (r_h > r_w) {
        // Full width used; content height is truncated to int.
        const int h = r_w * img.rows;
        content = cv::Rect(0, (kInputH - h) / 2, kInputW, h);
    } else {
        // Full height used; content width is truncated to int.
        const int w = r_h * img.cols;
        content = cv::Rect((kInputW - w) / 2, 0, w, kInputH);
    }

    cv::Mat scaled;
    cv::resize(mask(content), scaled, img.size());
    return scaled;
}
// Overlays instance masks, boxes and "<label> <confidence>" captions on `img`.
//   dets       - detections; dets[i] is paired with masks[i]
//   masks      - network-sized float masks; a pixel is painted where the resized mask
//                value is above 0.5
//   labels_map - class id -> label text (a missing id yields an empty label)
// Detections without a matching mask entry are skipped. Mask blending is limited to the
// part of the box that lies inside the image, so boxes that extend past the border
// cannot cause out-of-range pixel access.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    const cv::Rect img_bounds(0, 0, img.cols, img.rows);

    for (size_t i = 0; i < dets.size() && i < masks.size(); i++) {
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
        cv::Rect r = get_rect(img, dets[i].bbox);

        // Blend only where the box overlaps the image; an empty intersection skips the loops.
        const cv::Rect roi = r & img_bounds;
        for (int y = roi.y; y < roi.y + roi.height; y++) {
            for (int x = roi.x; x < roi.x + roi.width; x++) {
                if (img_mask.at<float>(y, x) <= 0.5)
                    continue;
                // 50/50 blend of the existing pixel and the class colour.
                cv::Vec3b& px = img.at<cv::Vec3b>(y, x);
                px[0] = px[0] / 2 + bgr[0] / 2;
                px[1] = px[1] / 2 + bgr[1] / 2;
                px[2] = px[2] / 2 + bgr[2] / 2;
            }
        }
        cv::rectangle(img, r, bgr, 2);

        // Filled caption background sized from the rendered text, then the caption itself.
        const std::string label =
                labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf);
        cv::Size textSize = cv::getTextSize(label, cv::FONT_HERSHEY_PLAIN, 1.2, 2, nullptr);
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);
        cv::putText(img, label, cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}

84
third_party/trt_yolov8/src/postprocess.cu vendored Executable file
View File

@@ -0,0 +1,84 @@
//
// Created by lindsay on 23-7-17.
//
#include "../include/types.h"
#include "../include/postprocess.h"
// Copies the confident candidates from the network output into the compact result array.
//   predict  - predict[0] is the candidate count, followed by packed Detection records
//   parray   - parray[0] is an atomically incremented result counter, followed by rows of
//              bbox_element floats: left, top, right, bottom, confidence, class, keep
//   max_objects - number of result rows available in parray
// One thread handles one candidate. The confidence test runs BEFORE a result slot is
// reserved, so rejected candidates neither consume a slot nor increase the counter.
// The counter may still end up larger than max_objects; readers must clamp it.
// NOTE(review): `num_bboxes` is not used to bound `position`; the read range relies on
// predict[0] being within the predict buffer - confirm with the producer of `predict`.
static __global__ void
decode_kernel(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count) return;

    float *pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    float confidence = pitem[4];
    if (confidence < confidence_threshold) return;

    // Reserve the next free row; give up when the result array is already full.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects) return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float *pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}
// Intersection-over-union of box a and box b, each given as left, top, right, bottom.
// Returns 0 when the boxes do not overlap.
static __device__ float
box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) {
    // Overlap extent along each axis, clamped at zero for disjoint boxes.
    const float overlap_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
    const float overlap_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);
    const float c_area = overlap_w * overlap_h;
    if (c_area == 0.0f) {
        return 0.0f;
    }

    const float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return c_area / (a_area + b_area - c_area);
}
// Per-class NMS over the rows written by decode_kernel.
//   bboxes      - bboxes[0] is the row counter, followed by rows of bbox_element floats:
//                 left, top, right, bottom, confidence, class, keep
//   max_objects - number of rows the array can hold
//   threshold   - IoU above which the lower-ranked box is dropped
// One thread owns one row and clears that row's keep flag (row[6] = 0) when a
// same-class box with higher confidence (ties broken by lower index) overlaps it by
// more than `threshold`.
static __global__ void nms_kernel(float *bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // The counter can exceed the number of rows actually stored, so clamp it to the
    // array capacity before using it as a loop bound.
    int count = min((int)bboxes[0], max_objects);
    if (position >= count) return;

    float *pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float *pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5]) continue;

        if (pitem[4] >= pcurrent[4]) {
            // Equal confidence: the row with the lower index wins.
            if (pitem[4] == pcurrent[4] && i < position) continue;

            float iou = box_iou(
                    pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                    pitem[0], pitem[1], pitem[2], pitem[3]
            );
            if (iou > threshold) {
                pcurrent[6] = 0;
                return;
            }
        }
    }
}
// Launches decode_kernel on `stream` with 256 threads per block and enough blocks to
// cover `num_bboxes` threads. The launch is asynchronous and its result is not checked here.
void cuda_decode(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects,
                 cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel<<<num_blocks, threads_per_block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray,
                                                                 max_objects);
}
// Launches nms_kernel on `stream` with one thread per possible result row.
// Uses at most 256 threads per block. Does nothing when `max_objects` is not positive,
// which would otherwise produce a zero block size and a 0/0 grid computation.
void cuda_nms(float *parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    if (max_objects <= 0) {
        return;
    }
    const int block = max_objects < 256 ? max_objects : 256;
    const int grid = (max_objects + block - 1) / block;  // ceil(max_objects / block)
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

155
third_party/trt_yolov8/src/preprocess.cu vendored Executable file
View File

@@ -0,0 +1,155 @@
#include "../include/preprocess.h"
#include "../include/cuda_utils.h"
// File-wide staging buffers shared by every cuda_preprocess() call.
// Allocated by cuda_preprocess_init() and released by cuda_preprocess_destroy().
// Host-side staging buffer (allocated with cudaMallocHost).
static uint8_t *img_buffer_host = nullptr;
// Device-side copy of the source image (allocated with cudaMalloc).
static uint8_t *img_buffer_device = nullptr;
// Affine warp with bilinear interpolation from an interleaved 3-channel 8-bit image to a
// planar float image, including channel swap and scaling to [0, 1].
//   src, src_line_size     - source pixels and the byte length of one source row
//   src_width, src_height  - source size in pixels
//   dst                    - output, three consecutive planes of dst_width * dst_height floats
//   const_value_st         - value used for samples that fall outside the source
//   d2s                    - 2x3 affine matrix mapping destination to source coordinates
//   edge                   - number of destination pixels to process
// Each thread computes one destination pixel.
__global__ void
warpaffine_kernel(uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width,
int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
// Unpack the two rows of the destination-to-source matrix.
float m_x1 = d2s.value[0];
float m_y1 = d2s.value[1];
float m_z1 = d2s.value[2];
float m_x2 = d2s.value[3];
float m_y2 = d2s.value[4];
float m_z2 = d2s.value[5];
// Destination pixel handled by this thread.
int dx = position % dst_width;
int dy = position / dst_width;
// Source position for this destination pixel (with a 0.5 offset added).
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
float c0, c1, c2;
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
// Entirely outside the source: use the fill value for all channels.
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
} else {
// The four neighbouring source pixels around (src_x, src_y).
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
// Bilinear weights from the fractional parts.
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// Neighbours default to the fill value and are replaced only when inside the source.
uint8_t *v1 = const_value;
uint8_t *v2 = const_value;
uint8_t *v3 = const_value;
uint8_t *v4 = const_value;
if (y_low >= 0) {
if (x_low >= 0)
v1 = src + y_low * src_line_size + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * src_line_size + x_high * 3;
}
if (y_high < src_height) {
if (x_low >= 0)
v3 = src + y_high * src_line_size + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * src_line_size + x_high * 3;
}
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
}
// Swap the first and third channel (BGR to RGB).
float t = c2;
c2 = c0;
c0 = t;
// Scale from [0, 255] to [0, 1].
c0 = c0 / 255.0f;
c1 = c1 / 255.0f;
c2 = c2 / 255.0f;
// Write planar output: one full plane per channel instead of interleaved pixels.
int area = dst_width * dst_height;
float *pdst_c0 = dst + dy * dst_width + dx;
float *pdst_c1 = pdst_c0 + area;
float *pdst_c2 = pdst_c1 + area;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
// Letterbox-style preprocessing of one image on the GPU: the source is scaled uniformly
// to fit dst_width x dst_height, centred, padded with 128 and written to `dst` by
// warpaffine_kernel on `stream`.
// The source is read as src_height rows of src_width * 3 bytes with no row padding.
// NOTE(review): the copy into the shared staging buffers is not checked against the size
// passed to cuda_preprocess_init(), and the buffers must have been allocated before this
// call - confirm callers respect both.
// NOTE(review): the staging buffers are shared across calls, so the stream presumably has
// to be synchronized before the next call overwrites them - confirm against callers.
void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height,
cudaStream_t stream) {
int img_size = src_width * src_height * 3;
// Stage the image in the host buffer (allocated with cudaMallocHost).
memcpy(img_buffer_host, src, img_size);
// Upload the staged image to the device buffer on `stream`.
CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
// s2d maps source to destination coordinates; d2s is its inverse, used by the kernel.
AffineMatrix s2d, d2s;
// Uniform scale that makes the whole source fit inside the destination.
float scale = std::min(dst_height / (float) src_height, dst_width / (float) src_width);
s2d.value[0] = scale;
s2d.value[1] = 0;
// Translation that centres the scaled source horizontally.
s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
s2d.value[3] = 0;
s2d.value[4] = scale;
// Translation that centres the scaled source vertically.
s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
// Wrap both matrices as 2x3 cv::Mat headers over the value arrays and invert s2d into d2s.
cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
// One thread per destination pixel, 256 threads per block.
int jobs = dst_height * dst_width;
int threads = 256;
int blocks = ceil(jobs / (float) threads);
warpaffine_kernel<<<blocks, threads, 0, stream>>>(
img_buffer_device, src_width * 3, src_width,
src_height, dst, dst_width,
dst_height, 128, d2s, jobs);
}
// Preprocesses every image of the batch into consecutive slots of `dst`; each slot holds
// 3 * dst_width * dst_height floats. The stream is synchronized after each image.
void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch,
                           float *dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int slot_floats = dst_width * dst_height * 3;
    for (size_t idx = 0; idx < img_batch.size(); ++idx) {
        cv::Mat &image = img_batch[idx];
        float *slot = dst + slot_floats * idx;
        cuda_preprocess(image.ptr(), image.cols, image.rows, slot, dst_width, dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}
// Allocates the shared staging buffers used by cuda_preprocess(): one host buffer
// (cudaMallocHost) and one device buffer, each of max_image_size * 3 bytes.
// NOTE(review): buffers from an earlier call are not released here - confirm this is
// only called once per cuda_preprocess_destroy().
void cuda_preprocess_init(int max_image_size) {
    const int buffer_bytes = max_image_size * 3;
    // Host staging buffer.
    CUDA_CHECK(cudaMallocHost((void **) &img_buffer_host, buffer_bytes));
    // Device copy of the source image.
    CUDA_CHECK(cudaMalloc((void **) &img_buffer_device, buffer_bytes));
}
// Releases the shared staging buffers allocated by cuda_preprocess_init().
// The pointers are reset afterwards, so calling this again (or calling it without a
// prior init) is harmless instead of freeing the same memory twice.
void cuda_preprocess_destroy() {
    if (img_buffer_device != nullptr) {
        CUDA_CHECK(cudaFree(img_buffer_device));
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        CUDA_CHECK(cudaFreeHost(img_buffer_host));
        img_buffer_host = nullptr;
    }
}

View File

@@ -0,0 +1,236 @@
#include "trt_yolov8_classifier.h"
namespace trt_yolov8 {
using namespace nvinfer1;
// Prepares a batch of images for classification.
// Each image is centre-cropped to a square, resized to dst_width x dst_height, converted
// from BGR to RGB, scaled to [0, 1] and written to `output` as planar CHW floats.
// Image `b` occupies output[b * 3 * dst_height * dst_width ...].
// The input images are never modified: every step writes to its own destination Mat
// rather than working in place on a view of the caller's pixels.
void trt_yolov8_classifier::batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width, int dst_height) {
    const size_t plane = static_cast<size_t>(dst_height) * dst_width;

    for (size_t b = 0; b < imgs.size(); b++) {
        const cv::Mat& src = imgs[b];
        const int side = std::min(src.rows, src.cols);
        const cv::Rect crop((src.cols - side) / 2, (src.rows - side) / 2, side, side);

        cv::Mat resized;
        cv::Mat rgb;
        cv::Mat normalized;
        cv::resize(src(crop), resized, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
        rgb.convertTo(normalized, CV_32F, 1 / 255.0);

        std::vector<cv::Mat> channels(3);
        cv::split(normalized, channels);

        // CHW layout: copy each channel plane row by row.
        float* sample = output + b * 3 * plane;
        for (int c = 0; c < 3; ++c) {
            float* dst_plane = sample + c * plane;
            for (int row = 0; row < dst_height; ++row) {
                const float* src_row = channels[c].ptr<float>(row);
                std::copy(src_row, src_row + dst_width, dst_plane + static_cast<size_t>(row) * dst_width);
            }
        }
    }
}
// Softmax over the first `n` values of `prob`.
// The largest input is subtracted before exponentiation, which leaves the result
// mathematically unchanged but keeps expf from overflowing on large inputs.
// Returns an empty vector when n <= 0.
std::vector<float> trt_yolov8_classifier::softmax(float *prob, int n) {
    std::vector<float> res;
    if (n <= 0) {
        return res;
    }
    res.reserve(n);

    const float max_value = *std::max_element(prob, prob + n);
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        const float t = expf(prob[i] - max_value);
        res.push_back(t);
        sum += t;
    }
    // sum >= 1 here because the maximum element contributes expf(0) == 1.
    for (float& value : res) {
        value /= sum;
    }
    return res;
}
// Returns the indices of the `k` largest values of `vec`, best first.
// `k` is clamped to [0, vec.size()], so a negative or oversized `k` is safe.
// Only the leading `k` positions are ordered (partial sort); the relative order of
// equal values is unspecified.
std::vector<int> trt_yolov8_classifier::topk(const std::vector<float>& vec, int k) {
    std::vector<size_t> vec_index(vec.size());
    std::iota(vec_index.begin(), vec_index.end(), 0);

    const int k_num = std::max(0, std::min<int>(vec.size(), k));
    std::partial_sort(vec_index.begin(), vec_index.begin() + k_num, vec_index.end(),
                      [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; });

    std::vector<int> topk_index;
    topk_index.reserve(k_num);
    for (int i = 0; i < k_num; ++i) {
        topk_index.push_back(static_cast<int>(vec_index[i]));
    }
    return topk_index;
}
// Allocates the four buffers needed for one classification batch:
// device input, device output, host input and host output.
// Asserts that the engine has exactly two bindings with the input at index 0 and the
// output at index 1. The caller owns all four buffers and must release them
// (cudaFree for the device buffers, delete[] for the host buffers).
void trt_yolov8_classifier::prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);

    // Binding indices are looked up by tensor name and checked against the layout the
    // rest of this class relies on.
    const int in_idx = engine->getBindingIndex(kInputTensorName);
    const int out_idx = engine->getBindingIndex(kOutputTensorName);
    assert(in_idx == 0);
    assert(out_idx == 1);

    const auto input_floats = kBatchSize * 3 * kClsInputH * kClsInputW;
    const auto output_floats = kBatchSize * kOutputSize;

    // Device side.
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, input_floats * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, output_floats * sizeof(float)));

    // Host side.
    *cpu_input_buffer = new float[input_floats];
    *output_buffer_host = new float[output_floats];
}
// Runs one batch through the engine: uploads `input` to buffers[0], enqueues inference,
// downloads buffers[1] into `output`, then waits for the stream to finish.
// A failed enqueue is reported on stderr; a failed copy or synchronization goes through
// CUDA_CHECK like every other CUDA call in this function.
void trt_yolov8_classifier::infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        std::cerr << "trt_yolov8_classifier::infer: enqueue failed" << std::endl;
    }
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
void trt_yolov8_classifier::serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
// Create model to populate the network, then set the outputs and create an engine
IHostMemory *serialized_engine = nullptr;
//engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw);
assert(serialized_engine);
// Save engine to file
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cerr << "Could not open plan output file" << std::endl;
assert(false);
}
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
// Close everything down
delete serialized_engine;
delete config;
delete builder;
}
// Loads a serialized plan file and creates the runtime, engine and execution context
// from it. The three objects are returned through the pointer arguments; each step is
// guarded by an assert.
void trt_yolov8_classifier::deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
    std::ifstream plan_file(engine_name, std::ios::binary);
    if (!plan_file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // Determine the file size, then read the whole plan into memory.
    plan_file.seekg(0, plan_file.end);
    const size_t plan_size = plan_file.tellg();
    plan_file.seekg(0, plan_file.beg);
    char* plan_bytes = new char[plan_size];
    assert(plan_bytes);
    plan_file.read(plan_bytes, plan_size);
    plan_file.close();

    // runtime -> engine -> context
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);

    // The in-memory copy of the plan is released once the context exists.
    delete[] plan_bytes;
}
// With a non-empty `model_path`: selects the GPU, loads the engine from that plan file
// and creates the CUDA stream. With an empty path nothing is loaded.
trt_yolov8_classifier::trt_yolov8_classifier(std::string model_path) {
    if (!model_path.empty()) {
        cudaSetDevice(kGpuId);
        deserialize_engine(model_path, &runtime, &engine, &context);
        CUDA_CHECK(cudaStreamCreate(&stream));
    }
}
// Releases the stream and the TensorRT objects.
// The stream is created only after an engine has been loaded (the constructor returns
// early for an empty model path), so it is destroyed only when an engine exists.
// Deleting the null TensorRT pointers of an engine-less object is a no-op.
trt_yolov8_classifier::~trt_yolov8_classifier() {
    if (engine != nullptr) {
        cudaStreamDestroy(stream);
    }
    delete context;
    delete engine;
    delete runtime;
}
// Classifies `images` in batches of kBatchSize.
//   classifications - receives one entry per image (appended in input order); each entry
//                     holds the `top_k` best classes with their softmax scores
// Does nothing for an empty input. When no engine is loaded (object created with an
// empty model path) an error is printed and nothing is appended.
void trt_yolov8_classifier::classify(std::vector<cv::Mat> images, std::vector<std::vector<Classification>>& classifications, int top_k) {
    if (images.empty()) {
        return;
    }
    if (engine == nullptr || context == nullptr) {
        std::cerr << "trt_yolov8_classifier::classify: no engine loaded" << std::endl;
        return;
    }

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* cpu_input_buffer = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);

    classifications.reserve(classifications.size() + images.size());

    // batch predict
    for (size_t i = 0; i < images.size(); i += kBatchSize) {
        // Get a batch of images (the last batch may be shorter)
        std::vector<cv::Mat> img_batch;
        for (size_t j = i; j < i + kBatchSize && j < images.size(); j++) {
            img_batch.push_back(images[j]);
        }

        // Preprocess
        batch_preprocess(img_batch, cpu_input_buffer);

        // Run inference; a full batch is always executed, only the first
        // img_batch.size() results are read back below.
        infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);

        // Postprocess and get top-k result
        for (size_t b = 0; b < img_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            auto res = softmax(p, kOutputSize);
            auto topk_idx = topk(res, top_k);
            std::vector<Classification> classification;
            for (auto idx : topk_idx) {
                classification.push_back(Classification {idx, res[idx]});
            }
            classifications.push_back(classification);
        }
    }

    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] cpu_input_buffer;
    delete[] output_buffer_host;
}
// Converts a weights file into a serialized classification engine.
// The first character of `sub_type` (n/s/m/l/x) selects the depth (gd) and width (gw)
// multipliers. A trailing '2' or '6' is accepted but has no effect here, because only
// gd and gw are passed on to the engine builder.
// Returns false for an unsupported sub type, true after the engine has been written.
bool trt_yolov8_classifier::wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type) {
    float gd = 0.0f;
    float gw = 0.0f;

    switch (sub_type[0]) {
        case 'n':  // yolov8n
            gd = 0.33;
            gw = 0.25;
            break;
        case 's':  // yolov8s
            gd = 0.33;
            gw = 0.50;
            break;
        case 'm':  // yolov8m
            gd = 0.67;
            gw = 0.75;
            break;
        case 'l':  // yolov8l
            gd = 1.0;
            gw = 1.0;
            break;
        case 'x':  // yolov8x
            gd = 1.0;
            gw = 1.25;
            break;
        default:
            return false;  // not support
    }

    serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
    return true;
}
}

View File

@@ -0,0 +1,46 @@
#pragma once
#include <fstream>
#include <iostream>
#include <cmath>
#include <numeric>
#include <opencv2/opencv.hpp>
#include "include/cuda_utils.h"
#include "include/logging.h"
#include "include/model.h"
#include "include/postprocess.h"
#include "include/preprocess.h"
#include "include/utils.h"
namespace trt_yolov8 {
using namespace nvinfer1;
// Image classifier backed by a serialized TensorRT YOLOv8-cls engine.
// Construct with a plan file path to classify images, or with the default (empty) path
// to use only wts_2_engine().
// NOTE(review): the class owns raw TensorRT/CUDA handles but is still copyable; copying
// an instance would release the handles twice - confirm no caller copies it.
class trt_yolov8_classifier
{
private:
    // Centre-crop, resize, BGR->RGB, scale to [0, 1] and pack as planar CHW floats.
    void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width = 224, int dst_height = 224);
    // Softmax over the first n values of prob.
    std::vector<float> softmax(float *prob, int n);
    // Indices of the k largest values of vec, best first.
    std::vector<int> topk(const std::vector<float>& vec, int k);
    // Allocates device and host input/output buffers for one batch.
    void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host);
    // Upload, run and download one batch, then wait for the stream.
    void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize);
    // Builds an engine from a weights file and writes the plan to disk.
    void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name);
    // Loads a plan file and creates runtime, engine and context.
    void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context);

    Logger gLogger;
    // Number of values the model outputs per image.
    const static int kOutputSize = kClsNumClass;

    // All handles start out null; they are set only when a model path is given.
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    cudaStream_t stream = nullptr;

public:
    trt_yolov8_classifier(std::string model_path = "");
    ~trt_yolov8_classifier();

    // classify
    void classify(std::vector<cv::Mat> images, std::vector<std::vector<Classification>>& classifications, int top_k = 3);

    // serialize wts to plan file for image classify
    // sub_type: [ n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 ]
    bool wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type);
};
}

216
third_party/trt_yolov8/trt_yolov8_detector.cpp vendored Executable file
View File

@@ -0,0 +1,216 @@
#include "trt_yolov8_detector.h"
namespace trt_yolov8 {
using namespace nvinfer1;
void trt_yolov8_detector::serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
float& gw, int& max_channels) {
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
IHostMemory* serialized_engine = nullptr;
if (is_p == 6) {
serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
} else if (is_p == 2) {
serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
} else {
serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
}
assert(serialized_engine);
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cout << "could not open plan output file" << std::endl;
assert(false);
}
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
delete serialized_engine;
delete config;
delete builder;
}
// Reads a serialized plan file into memory and creates the TensorRT runtime, engine and
// execution context from it. Results are returned through the pointer arguments; every
// step is checked with an assert.
void trt_yolov8_detector::deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                                             IExecutionContext** context) {
    std::ifstream plan_file(engine_name, std::ios::binary);
    if (!plan_file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }

    // File size first, then the whole content in one read.
    plan_file.seekg(0, plan_file.end);
    const size_t plan_size = plan_file.tellg();
    plan_file.seekg(0, plan_file.beg);
    char* plan_bytes = new char[plan_size];
    assert(plan_bytes);
    plan_file.read(plan_bytes, plan_size);
    plan_file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(plan_bytes, plan_size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);

    delete[] plan_bytes;
}
// Allocates the buffers for one detection batch.
// Always allocates the device input and device output buffers. Depending on
// `cuda_post_process`:
//   "c" - allocates the host output buffer (CPU post-processing)
//   "g" - allocates the host and device decode buffers (GPU post-processing); this mode
//         terminates the process when kBatchSize > 1
// Any other value allocates nothing further. The caller owns all returned buffers.
void trt_yolov8_detector::prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                                         float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                                         std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);

    // The input must be binding 0 and the output binding 1.
    const int in_idx = engine->getBindingIndex(kInputTensorName);
    const int out_idx = engine->getBindingIndex(kOutputTensorName);
    assert(in_idx == 0);
    assert(out_idx == 1);

    // Device buffers for the network input and output.
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        return;
    }
    if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // One counter slot plus kMaxNumOutputBbox rows of bbox_element floats.
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Enqueues inference for one batch and fetches the results, then waits for the stream.
//   "c" mode - copies the raw network output (buffers[1]) to `output`
//   "g" mode - clears the decode buffer, runs GPU decode and GPU NMS on the network
//              output and copies the decode buffer to `decode_ptr_host`
// Any other mode only runs inference and synchronizes.
void trt_yolov8_detector::infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
                                float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // All work below is queued on `stream`; nothing blocks until the final synchronize.
    context.enqueue(batchsize, buffers, stream, nullptr);

    const bool cpu_post = (cuda_post_process == "c");
    const bool gpu_post = !cpu_post && (cuda_post_process == "g");

    if (cpu_post) {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
    } else if (gpu_post) {
        // Reset the counter and all keep flags before decoding.
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Runs detection on `images` in batches of kBatchSize and appends one detection list per
// image to `detections`, in input order.
// Does nothing for an empty input. When no engine is loaded an error is printed and
// nothing is appended.
// NOTE(review): the guard assumes `engine` and `context` are initialized to nullptr in
// the class declaration - confirm in trt_yolov8_detector.h.
void trt_yolov8_detector::detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections) {
    if (images.empty()) {
        return;
    }
    if (engine == nullptr || context == nullptr) {
        std::cerr << "trt_yolov8_detector::detect: no engine loaded" << std::endl;
        return;
    }

    // Prepare cpu and gpu buffers; which of them get allocated depends on the
    // post-processing mode, the rest stay null.
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < images.size(); i += kBatchSize) {
        // Get a batch of images (the last batch may be shorter)
        std::vector<cv::Mat> img_batch;
        for (size_t j = i; j < i + kBatchSize && j < images.size(); j++) {
            img_batch.push_back(images[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);

        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS on the host
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }

        // push back to return
        detections.insert(detections.end(), res_batch.begin(), res_batch.end());
    }

    // Buffers that were never allocated are still null here.
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
}
/**
 * Builds a detector. With a non-empty `model_path` the serialized engine is loaded, a CUDA
 * stream is created and the preprocessing buffers are initialized. With an empty path no
 * engine is loaded and none of the GPU resources are created.
 *
 * @param model_path  Path of the serialized engine file, or empty to skip loading.
 */
trt_yolov8_detector::trt_yolov8_detector(std::string model_path) {
    // Give the members that the early return would otherwise leave indeterminate a defined
    // value; the destructor reads `stream` unconditionally.
    stream = nullptr;
    model_bboxes = 0;

    if (model_path.empty()) {
        return;
    }

    cudaSetDevice(kGpuId);

    // Deserialize the engine from file
    deserialize_engine(model_path, &runtime, &engine, &context);
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);

    // First dimension of binding 1 (the output binding) is kept for cuda_decode.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
}
/**
 * Releases the stream, the preprocessing buffers and the TensorRT objects.
 *
 * The constructor only creates the stream and calls cuda_preprocess_init after it has
 * deserialized an engine, so the matching teardown is skipped when no runtime exists.
 */
trt_yolov8_detector::~trt_yolov8_detector() {
    // `runtime` stays nullptr when the constructor returned early on an empty model path;
    // in that case `stream` was never created and must not be destroyed.
    if (runtime != nullptr) {
        // Release stream and buffers
        cudaStreamDestroy(stream);
        cuda_preprocess_destroy();
    }

    // Destroy the engine (deleting nullptr is a no-op)
    delete context;
    delete engine;
    delete runtime;
}
/**
 * Converts a .wts weight file into a serialized engine file.
 *
 * The first character of `sub_type` selects the model scale (n/s/m/l/x); an optional
 * second character '6' or '2' selects the P6 / P2 variant.
 *
 * @param wts_name     Path of the input weight file.
 * @param engine_name  Path of the engine file to write.
 * @param sub_type     Model sub type, e.g. "n", "s6", "x2".
 * @return false when the scale letter is not recognized, true once serialize_engine
 *         has been called.
 */
bool trt_yolov8_detector::wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type) {
    float gd = 0.0f;
    float gw = 0.0f;
    int max_channels = 0;

    // Depth multiple, width multiple and channel cap per model scale.
    switch (sub_type[0]) {
        case 'n':  // yolov8n
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
            break;
        case 's':  // yolov8s
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
            break;
        case 'm':  // yolov8m
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
            break;
        case 'l':  // yolov8l
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
            break;
        case 'x':  // yolov8x
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
            break;
        default:
            return false;  // not support
    }

    // Only an exactly two-character sub type carries a P6 / P2 suffix.
    int is_p = 0;
    if (sub_type.size() == 2) {
        const char suffix = sub_type[1];
        if (suffix == '6') {
            is_p = 6;
        } else if (suffix == '2') {
            is_p = 2;
        }
    }

    serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
    return true;
}
}

View File

@@ -0,0 +1,51 @@
#pragma once
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "include/cuda_utils.h"
#include "include/logging.h"
#include "include/model.h"
#include "include/postprocess.h"
#include "include/preprocess.h"
#include "include/utils.h"
namespace trt_yolov8 {
using namespace nvinfer1;
// YOLOv8 object detector backed by a TensorRT engine.
//
// NOTE(review): the destructor deletes context/engine/runtime and no copy operations are
// declared, so copying an instance would release the same objects twice — avoid copies.
class trt_yolov8_detector
{
private:
    // Builds an engine from a .wts file and writes it to `engine_name`.
    void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                        float& gw, int& max_channels);
    // Loads `engine_name` and fills the runtime / engine / context out-parameters.
    void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                            IExecutionContext** context);
    // Allocates the device and host buffers needed for the given post-process mode.
    void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                        float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                        std::string cuda_post_process);
    // Runs the network on `stream` and copies the results back to the host.
    void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
            float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process);

    // Number of floats in the network output for a single image.
    const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
    Logger gLogger;
    // Initialized so a detector constructed without a model never exposes indeterminate
    // values (the destructor reads `stream`).
    cudaStream_t stream = nullptr;
    int model_bboxes = 0;

    // Deserialize the engine from file
    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* context = nullptr;

    // Post-process mode: "c" = CPU NMS, "g" = GPU decode + NMS.
    std::string cuda_post_process = "c";
public:
    // An empty model_path skips engine loading.
    trt_yolov8_detector(std::string model_path = "");
    ~trt_yolov8_detector();

    // detect
    void detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections);

    // serialize wts to plan file for target detect
    // sub_type: [ n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 ]
    bool wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type);
};
}

View File

@@ -0,0 +1,217 @@
#include "trt_yolov8_pose_detector.h"
namespace trt_yolov8 {
using namespace nvinfer1;
/**
 * Builds a pose engine from a .wts file and writes the serialized plan to `engine_name`.
 *
 * @param wts_name      Path of the input weight file.
 * @param engine_name   Path of the plan file to write.
 * @param is_p          6 builds the P6 variant, 2 is not supported, anything else builds
 *                      the default variant.
 * @param sub_type      Not used by this function.
 * @param gd            Depth multiple forwarded to the builder function.
 * @param gw            Width multiple forwarded to the builder function.
 * @param max_channels  Channel cap forwarded to the builder function.
 *
 * Failures still trip an assert in builds that keep asserts enabled; when asserts are
 * compiled out the function now cleans up and returns instead of dereferencing a null
 * engine or writing through a stream that failed to open.
 */
void trt_yolov8_pose_detector::serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                                                float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;

    if (is_p == 6) {
        serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (is_p == 2) {
        std::cout << "p2 is not supported right now" << std::endl;
    } else {
        serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }

    if (serialized_engine == nullptr) {
        // Covers both the unsupported p2 request and a failed build.
        std::cerr << "failed to build the serialized engine" << std::endl;
        delete config;
        delete builder;
        assert(false && "serialized engine is null");
        return;
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        delete serialized_engine;
        delete config;
        delete builder;
        assert(false);
        return;
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}
void trt_yolov8_pose_detector::deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
IExecutionContext** context) {
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
assert(false);
}
size_t size = 0;
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
char* serialized_engine = new char[size];
assert(serialized_engine);
file.read(serialized_engine, size);
file.close();
*runtime = createInferRuntime(gLogger);
assert(*runtime);
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
assert(*engine);
*context = (*engine)->createExecutionContext();
assert(*context);
delete[] serialized_engine;
}
/**
 * Allocates the buffers one detect() call needs.
 *
 * The device input and output buffers are always allocated. In "c" mode the host output
 * buffer is allocated as well; in "g" mode the host/device decode buffers are allocated
 * instead. Pointers that are not needed for the active mode are left untouched.
 *
 * @param engine                Engine whose bindings are validated (exactly 2, input at
 *                              index 0 and output at index 1).
 * @param input_buffer_device   Out: device input buffer.
 * @param output_buffer_device  Out: device output buffer.
 * @param output_buffer_host    Out: host output buffer ("c" mode).
 * @param decode_ptr_host       Out: host decode buffer ("g" mode).
 * @param decode_ptr_device     Out: device decode buffer ("g" mode).
 * @param cuda_post_process     "c" or "g".
 */
void trt_yolov8_pose_detector::prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                                              float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                                              std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: terminate with a non-zero status so callers and
            // scripts do not mistake it for success.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
/**
 * Enqueues one network execution on `stream`, queues the copy of the results back to the
 * host, and blocks until the stream has drained.
 *
 * @param context            Execution context the network is enqueued on.
 * @param stream             CUDA stream all work is queued on; synchronized before returning.
 * @param buffers            Device bindings handed to enqueue; buffers[1] is read as the output.
 * @param output             Host destination of the raw output (only written in "c" mode).
 * @param batchsize          Batch size passed to enqueue; also scales the "c" mode host copy.
 * @param decode_ptr_host    Host destination of the decoded boxes (only written in "g" mode).
 * @param decode_ptr_device  Device scratch buffer for cuda_decode / cuda_nms ("g" mode only).
 * @param model_bboxes       Forwarded unchanged to cuda_decode.
 * @param cuda_post_process  "c" copies the raw output to `output`; "g" runs cuda_decode and
 *                           cuda_nms and copies their result to `decode_ptr_host`; any other
 *                           value only runs the network and synchronizes.
 */
void trt_yolov8_pose_detector::infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
                                     float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // enqueue() signals failure through its return value; report it instead of silently
    // post-processing whatever the output buffers happen to contain.
    if (!context.enqueue(batchsize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }

    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
    } else if (cuda_post_process == "g") {
        // Layout used by the decode buffer on both sides: one leading float plus
        // kMaxNumOutputBbox records of bbox_element floats.
        const size_t decode_bytes = sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element);
        CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, decode_bytes, stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, decode_bytes, cudaMemcpyDeviceToHost, stream));
    }

    // Everything above is asynchronous; wait here so the host buffers are valid on return.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
/**
 * Builds a pose detector. With a non-empty `model_path` the serialized engine is loaded, a
 * CUDA stream is created and the preprocessing buffers are initialized. With an empty path
 * no engine is loaded and none of the GPU resources are created.
 *
 * @param model_path  Path of the serialized engine file, or empty to skip loading.
 */
trt_yolov8_pose_detector::trt_yolov8_pose_detector(std::string model_path) {
    // Give the members that the early return would otherwise leave indeterminate a defined
    // value; the destructor reads `stream` unconditionally.
    stream = nullptr;
    model_bboxes = 0;

    if (model_path.empty()) {
        return;
    }

    cudaSetDevice(kGpuId);

    // Deserialize the engine from file
    deserialize_engine(model_path, &runtime, &engine, &context);
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);

    // First dimension of binding 1 (the output binding) is kept for cuda_decode.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
}
/**
 * Releases the stream, the preprocessing buffers and the TensorRT objects.
 *
 * The constructor only creates the stream and calls cuda_preprocess_init after it has
 * deserialized an engine, so the matching teardown is skipped when no runtime exists.
 */
trt_yolov8_pose_detector::~trt_yolov8_pose_detector() {
    // `runtime` stays nullptr when the constructor returned early on an empty model path;
    // in that case `stream` was never created and must not be destroyed.
    if (runtime != nullptr) {
        // Release stream and buffers
        cudaStreamDestroy(stream);
        cuda_preprocess_destroy();
    }

    // Destroy the engine (deleting nullptr is a no-op)
    delete context;
    delete engine;
    delete runtime;
}
/**
 * Runs pose detection over `images` in chunks of kBatchSize and appends the per-batch
 * results produced by batch_nms to `detections`.
 *
 * Only the "c" (CPU) post-process mode produces results; in "g" mode a message is logged
 * and nothing is appended. Device and host buffers are allocated on entry and released
 * before returning. Does nothing (besides logging) when no engine has been loaded, and
 * returns immediately for an empty input.
 *
 * @param images      Input images.
 * @param detections  Output; results are appended, existing content is kept.
 */
void trt_yolov8_pose_detector::detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections) {
    // A detector built with an empty model path has neither engine nor context; the code
    // below would dereference both.
    if (engine == nullptr || context == nullptr) {
        std::cerr << "trt_yolov8_pose_detector: no engine loaded, detect() skipped" << std::endl;
        return;
    }
    if (images.empty()) {
        return;  // nothing to process, avoid the allocate/free round trip
    }

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < images.size(); i += kBatchSize) {
        // Get a batch of images (the last batch may hold fewer than kBatchSize)
        std::vector<cv::Mat> img_batch;
        img_batch.reserve(kBatchSize);
        for (size_t j = i; j < i + kBatchSize && j < images.size(); j++) {
            img_batch.push_back(images[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

        // Run inference; the full kBatchSize is always enqueued, only img_batch.size()
        // results are consumed below.
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);

        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // todo pose in gpu
            std::cerr << "pose_postprocess is not support in gpu right now" << std::endl;
        }

        // push back to return
        detections.insert(detections.end(), res_batch.begin(), res_batch.end());
    }

    // Pointers that were not allocated for the active mode are still nullptr here.
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
}
/**
 * Converts a .wts weight file into a serialized pose engine file.
 *
 * The first character of `sub_type` selects the model scale (n/s/m/l/x); an optional
 * second character '6' selects the P6 variant. The P2 variant ('2' suffix) cannot be
 * built by serialize_engine and is rejected here.
 *
 * @param wts_name     Path of the input weight file.
 * @param engine_name  Path of the engine file to write.
 * @param sub_type     Model sub type, e.g. "n", "s6".
 * @return false when the scale letter is not recognized or a P2 variant is requested,
 *         true once serialize_engine has been called.
 */
bool trt_yolov8_pose_detector::wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type) {
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (sub_type[0] == 'n') {  // yolov8n
        gd = 0.33;
        gw = 0.25;
        max_channels = 1024;
    } else if (sub_type[0] == 's') {  // yolov8s
        gd = 0.33;
        gw = 0.50;
        max_channels = 1024;
    } else if (sub_type[0] == 'm') {  // yolov8m
        gd = 0.67;
        gw = 0.75;
        max_channels = 576;
    } else if (sub_type[0] == 'l') {  // yolov8l
        gd = 1.0;
        gw = 1.0;
        max_channels = 512;
    } else if (sub_type[0] == 'x') {  // yolov8x
        gd = 1.0;
        gw = 1.25;
        max_channels = 640;
    } else {
        return false;  // not support
    }

    if (sub_type.size() == 2 && sub_type[1] == '6') {  // yolov8n6/yolov8s6/yolov8m6/yolov8l6/yolov8x6
        is_p = 6;
    } else if (sub_type.size() == 2 && sub_type[1] == '2') {  // yolov8n2/yolov8s2/yolov8m2/yolov8l2/yolov8x2
        // serialize_engine has no P2 builder for pose models; report the request as
        // unsupported instead of forwarding it and then claiming success.
        std::cerr << "p2 is not supported right now" << std::endl;
        return false;
    }

    serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
    return true;
}
}

View File

@@ -0,0 +1,52 @@
#pragma once
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "include/cuda_utils.h"
#include "include/logging.h"
#include "include/model.h"
#include "include/postprocess.h"
#include "include/preprocess.h"
#include "include/utils.h"
namespace trt_yolov8 {
using namespace nvinfer1;
// YOLOv8 pose estimator backed by a TensorRT engine.
//
// NOTE(review): the destructor deletes context/engine/runtime and no copy operations are
// declared, so copying an instance would release the same objects twice — avoid copies.
class trt_yolov8_pose_detector
{
private:
    // Builds an engine from a .wts file and writes it to `engine_name`.
    void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                        float& gw, int& max_channels);
    // Loads `engine_name` and fills the runtime / engine / context out-parameters.
    void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                            IExecutionContext** context);
    // Allocates the device and host buffers needed for the given post-process mode.
    void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                        float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                        std::string cuda_post_process);
    // Runs the network on `stream` and copies the results back to the host.
    void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
            float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process);

    Logger gLogger;
    // Number of floats in the network output for a single image; each record is a
    // Detection minus 32 floats.
    const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 32) / sizeof(float) + 1;
    // Initialized so a detector constructed without a model never exposes indeterminate
    // values (the destructor reads `stream`).
    cudaStream_t stream = nullptr;
    int model_bboxes = 0;

    // Deserialize the engine from file
    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* context = nullptr;

    // Post-process mode: "c" = CPU NMS, "g" = GPU path (not implemented for pose).
    std::string cuda_post_process = "c";
public:
    // An empty model_path skips engine loading.
    trt_yolov8_pose_detector(std::string model_path = "");
    ~trt_yolov8_pose_detector();

    // detect
    void detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections);

    // serialize wts to plan file for pose estimate
    // sub_type: [ n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 ]
    bool wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type);
};
}

View File

@@ -0,0 +1,272 @@
#include "trt_yolov8_seg_detector.h"
namespace trt_yolov8 {
using namespace nvinfer1;
/**
 * Clips a box to the network input area and scales it down by `scale`.
 *
 * The box is read as {left, top, width, height}. The clip limits follow kInputW / kInputH
 * so the result always fits the mask that process_mask sizes from the same constants; a
 * hard-coded 640 only matched that mask when the network input was exactly 640x640.
 *
 * @param bbox   Box as {left, top, width, height}.
 * @param scale  Divisor applied to all four edges after clipping.
 * @return The clipped, down-scaled rectangle with coordinates truncated to int.
 */
cv::Rect trt_yolov8_seg_detector::get_downscale_rect(float bbox[4], float scale) {
    const float max_right = static_cast<float>(kInputW);
    const float max_bottom = static_cast<float>(kInputH);

    float left = bbox[0];
    float top = bbox[1];
    float right = bbox[0] + bbox[2];
    float bottom = bbox[1] + bbox[3];

    // clip to the input area
    left = left < 0 ? 0 : left;
    top = top < 0 ? 0 : top;
    right = right > max_right ? max_right : right;
    bottom = bottom > max_bottom ? max_bottom : bottom;

    left /= scale;
    top /= scale;
    right /= scale;
    bottom /= scale;
    return cv::Rect(int(left), int(top), int(right - left), int(bottom - top));
}
std::vector<cv::Mat> trt_yolov8_seg_detector::process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {
std::vector<cv::Mat> masks;
for (size_t i = 0; i < dets.size(); i++) {
cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
auto r = get_downscale_rect(dets[i].bbox, 4);
for (int x = r.x; x < r.x + r.width; x++) {
for (int y = r.y; y < r.y + r.height; y++) {
float e = 0.0f;
for (int j = 0; j < 32; j++) {
e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
}
e = 1.0f / (1.0f + expf(-e));
mask_mat.at<float>(y, x) = e;
}
}
cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
masks.push_back(mask_mat);
}
return masks;
}
/**
 * Builds a segmentation engine from a .wts file and writes the serialized plan to
 * `engine_name`.
 *
 * @param wts_name      Path of the input weight file.
 * @param engine_name   Path of the plan file to write.
 * @param sub_type      Not used by this function.
 * @param gd            Depth multiple forwarded to the builder function.
 * @param gw            Width multiple forwarded to the builder function.
 * @param max_channels  Channel cap forwarded to the builder function.
 *
 * Failures still trip an assert in builds that keep asserts enabled; when asserts are
 * compiled out the function now cleans up and returns instead of dereferencing a null
 * engine or writing through a stream that failed to open.
 */
void trt_yolov8_seg_detector::serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw,
                                               int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine =
            buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);

    if (serialized_engine == nullptr) {
        std::cerr << "failed to build the serialized engine" << std::endl;
        delete config;
        delete builder;
        assert(false && "serialized engine is null");
        return;
    }

    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        delete serialized_engine;
        delete config;
        delete builder;
        assert(false);
        return;
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    delete serialized_engine;
    delete config;
    delete builder;
}
void trt_yolov8_seg_detector::deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
IExecutionContext** context) {
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
assert(false);
}
size_t size = 0;
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
char* serialized_engine = new char[size];
assert(serialized_engine);
file.read(serialized_engine, size);
file.close();
*runtime = createInferRuntime(gLogger);
assert(*runtime);
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
assert(*engine);
*context = (*engine)->createExecutionContext();
assert(*context);
delete[] serialized_engine;
}
/**
 * Allocates the buffers one detect() call needs.
 *
 * The device input, output and prototype ("proto") buffers are always allocated. In "c"
 * mode the two host output buffers are allocated as well; in "g" mode the host/device
 * decode buffers are allocated instead. Pointers that are not needed for the active mode
 * are left untouched.
 *
 * @param engine                    Engine whose bindings are validated (exactly 3: input
 *                                  at 0, output at 1, "proto" at 2).
 * @param input_buffer_device       Out: device input buffer.
 * @param output_buffer_device      Out: device detection output buffer.
 * @param output_seg_buffer_device  Out: device prototype output buffer.
 * @param output_buffer_host        Out: host detection output buffer ("c" mode).
 * @param output_seg_buffer_host    Out: host prototype output buffer ("c" mode).
 * @param decode_ptr_host           Out: host decode buffer ("g" mode).
 * @param decode_ptr_device         Out: device decode buffer ("g" mode).
 * @param cuda_post_process         "c" or "g".
 */
void trt_yolov8_seg_detector::prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                                             float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                                             float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
    assert(engine->getNbBindings() == 3);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    const int outputIndex_seg = engine->getBindingIndex("proto");
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    assert(outputIndex_seg == 2);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));

    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            // This is an error path: terminate with a non-zero status so callers and
            // scripts do not mistake it for success.
            exit(1);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
/**
 * Enqueues one network execution on `stream`, queues the copy of the results back to the
 * host, and blocks until the stream has drained.
 *
 * @param context            Execution context the network is enqueued on.
 * @param stream             CUDA stream all work is queued on; synchronized before returning.
 * @param buffers            Device bindings handed to enqueue; buffers[1] is read as the
 *                           detection output and buffers[2] as the prototype output.
 * @param output             Host destination of the detection output ("c" mode only).
 * @param output_seg         Host destination of the prototype output ("c" mode only).
 * @param batchsize          Batch size passed to enqueue; also scales the "c" mode host copies.
 * @param decode_ptr_host    Host destination of the decoded boxes ("g" mode only).
 * @param decode_ptr_device  Device scratch buffer for cuda_decode / cuda_nms ("g" mode only).
 * @param model_bboxes       Forwarded unchanged to cuda_decode.
 * @param cuda_post_process  "c" copies both raw outputs to the host; "g" runs cuda_decode and
 *                           cuda_nms and copies their result to `decode_ptr_host`; any other
 *                           value only runs the network and synchronizes.
 */
void trt_yolov8_seg_detector::infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
                                    int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
                                    std::string cuda_post_process) {
    // enqueue() signals failure through its return value; report it instead of silently
    // post-processing whatever the output buffers happen to contain.
    if (!context.enqueue(batchsize, buffers, stream, nullptr)) {
        std::cerr << "TensorRT enqueue failed" << std::endl;
    }

    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
    } else if (cuda_post_process == "g") {
        // Layout used by the decode buffer on both sides: one leading float plus
        // kMaxNumOutputBbox records of bbox_element floats.
        const size_t decode_bytes = sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element);
        CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, decode_bytes, stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, decode_bytes, cudaMemcpyDeviceToHost, stream));
    }

    // Everything above is asynchronous; wait here so the host buffers are valid on return.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
/**
 * Builds a segmentation detector. With a non-empty `model_path` the serialized engine is
 * loaded, a CUDA stream is created and the preprocessing buffers are initialized. With an
 * empty path no engine is loaded and none of the GPU resources are created.
 *
 * @param model_path  Path of the serialized engine file, or empty to skip loading.
 */
trt_yolov8_seg_detector::trt_yolov8_seg_detector(std::string model_path) {
    // Give the members that the early return would otherwise leave indeterminate a defined
    // value; the destructor reads `stream` unconditionally.
    model_bboxes = 0;
    stream = nullptr;

    if (model_path.empty()) {
        return;
    }

    cudaSetDevice(kGpuId);

    // Deserialize the engine from file
    deserialize_engine(model_path, &runtime, &engine, &context);
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);

    // First dimension of binding 1 (the detection output binding) is kept for cuda_decode.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
}
/**
 * Releases the stream, the preprocessing buffers and the TensorRT objects.
 *
 * The constructor only creates the stream and calls cuda_preprocess_init after it has
 * deserialized an engine, so the matching teardown is skipped when no runtime exists.
 */
trt_yolov8_seg_detector::~trt_yolov8_seg_detector() {
    // `runtime` stays nullptr when the constructor returned early on an empty model path;
    // in that case `stream` was never created and must not be destroyed.
    if (runtime != nullptr) {
        // Release stream and buffers
        cudaStreamDestroy(stream);
        cuda_preprocess_destroy();
    }

    // Destroy the engine (deleting nullptr is a no-op)
    delete context;
    delete engine;
    delete runtime;
}
/**
 * Runs segmentation over `images` in chunks of kBatchSize. For every batch the results of
 * batch_nms are appended to `detections`, and one vector of masks per image (built by
 * process_mask) is appended to `masks`.
 *
 * Only the "c" (CPU) post-process mode produces results; in "g" mode a message is logged
 * and nothing is appended. Device and host buffers are allocated on entry and released
 * before returning. Does nothing (besides logging) when no engine has been loaded, and
 * returns immediately for an empty input.
 *
 * @param images      Input images.
 * @param detections  Output; results are appended, existing content is kept.
 * @param masks       Output; one mask vector per processed image is appended.
 */
void trt_yolov8_seg_detector::detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections, std::vector<std::vector<cv::Mat>>& masks) {
    // A detector built with an empty model path has neither engine nor context; the code
    // below would dereference both.
    if (engine == nullptr || context == nullptr) {
        std::cerr << "trt_yolov8_seg_detector: no engine loaded, detect() skipped" << std::endl;
        return;
    }
    if (images.empty()) {
        return;  // nothing to process, avoid the allocate/free round trip
    }

    // Prepare cpu and gpu buffers
    float* device_buffers[3];
    float* output_buffer_host = nullptr;
    float* output_seg_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
                   &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);

    // batch predict
    for (size_t i = 0; i < images.size(); i += kBatchSize) {
        // Get a batch of images (the last batch may hold fewer than kBatchSize)
        std::vector<cv::Mat> img_batch;
        img_batch.reserve(kBatchSize);
        for (size_t j = i; j < i + kBatchSize && j < images.size(); j++) {
            img_batch.push_back(images[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

        // Run inference; the full kBatchSize is always enqueued, only img_batch.size()
        // results are consumed below.
        infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
              decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);

        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            // Each image owns a kOutputSegSize slice of the prototype buffer.
            for (size_t b = 0; b < img_batch.size(); b++) {
                auto& res = res_batch[b];
                auto mask = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
                masks.push_back(mask);
            }
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // todo seg in gpu
            std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
        }

        // push back to return
        detections.insert(detections.end(), res_batch.begin(), res_batch.end());
    }

    // Pointers that were not allocated for the active mode are still nullptr here.
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(device_buffers[2]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    delete[] output_seg_buffer_host;
}
/**
 * Converts a .wts weight file into a serialized segmentation engine file.
 *
 * Only the first character of `sub_type` (n/s/m/l/x) is evaluated. serialize_engine for
 * segmentation takes no P2/P6 selector, so a trailing '2' or '6' has no effect on the
 * engine that is built.
 *
 * @param wts_name     Path of the input weight file.
 * @param engine_name  Path of the engine file to write.
 * @param sub_type     Model sub type; its first character selects the scale.
 * @return false when the scale letter is not recognized, true once serialize_engine
 *         has been called.
 */
bool trt_yolov8_seg_detector::wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type) {
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;

    if (sub_type[0] == 'n') {  // yolov8n
        gd = 0.33;
        gw = 0.25;
        max_channels = 1024;
    } else if (sub_type[0] == 's') {  // yolov8s
        gd = 0.33;
        gw = 0.50;
        max_channels = 1024;
    } else if (sub_type[0] == 'm') {  // yolov8m
        gd = 0.67;
        gw = 0.75;
        max_channels = 576;
    } else if (sub_type[0] == 'l') {  // yolov8l
        gd = 1.0;
        gw = 1.0;
        max_channels = 512;
    } else if (sub_type[0] == 'x') {  // yolov8x
        gd = 1.0;
        gw = 1.25;
        max_channels = 640;
    } else {
        return false;  // not support
    }

    // The P2/P6 suffix was previously parsed into a local that nothing read; that dead
    // computation is gone, the engine built is the same as before.
    serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels);
    return true;
}
}

View File

@@ -0,0 +1,55 @@
#pragma once
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "include/cuda_utils.h"
#include "include/logging.h"
#include "include/model.h"
#include "include/postprocess.h"
#include "include/preprocess.h"
#include "include/utils.h"
namespace trt_yolov8 {
using namespace nvinfer1;
// YOLOv8 instance segmentation detector backed by a TensorRT engine.
//
// NOTE(review): the destructor deletes context/engine/runtime and no copy operations are
// declared, so copying an instance would release the same objects twice — avoid copies.
class trt_yolov8_seg_detector
{
private:
    // Clips a {left, top, width, height} box and scales it down by `scale`.
    cv::Rect get_downscale_rect(float bbox[4], float scale);
    // Builds one mask per detection from the prototype output.
    std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);
    // Builds an engine from a .wts file and writes it to `engine_name`.
    void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw,
                        int& max_channels);
    // Loads `engine_name` and fills the runtime / engine / context out-parameters.
    void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                            IExecutionContext** context);
    // Allocates the device and host buffers needed for the given post-process mode.
    void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                        float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                        float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process);
    // Runs the network on `stream` and copies the results back to the host.
    void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
            int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
            std::string cuda_post_process);

    Logger gLogger;
    // Number of floats in the detection output for a single image; each record is a
    // Detection minus 51 floats.
    const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1;
    // Number of floats in the prototype output for a single image: 32 channels at a
    // quarter of the input resolution.
    const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);

    // Post-process mode: "c" = CPU NMS, "g" = GPU path (not implemented for segmentation).
    std::string cuda_post_process = "c";
    // Initialized so a detector constructed without a model never exposes indeterminate
    // values (the destructor reads `stream`).
    int model_bboxes = 0;
    cudaStream_t stream = nullptr;

    // Deserialize the engine from file
    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* context = nullptr;
public:
    // An empty model_path skips engine loading.
    trt_yolov8_seg_detector(std::string model_path = "");
    ~trt_yolov8_seg_detector();

    // detect
    void detect(std::vector<cv::Mat> images, std::vector<std::vector<Detection>>& detections, std::vector<std::vector<cv::Mat>>& masks);

    // serialize wts to plan file for segment
    // sub_type: [ n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 ]
    bool wts_2_engine(std::string wts_name, std::string engine_name, std::string sub_type);
};
}