Files
2026-06-03 12:43:14 +08:00

335 lines
12 KiB
Plaintext
Executable File

#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "../include/cuda_utils.h"
#include "../include/types.h"
#include "yololayer.h"
namespace Tn {
template <typename T>
void write(char*& buffer, const T& val) {
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
void read(const char*& buffer, T& val) {
val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
}
} // namespace Tn
__device__ float sigmoid(float x) {
return 1.0f / (1.0f + exp(-x));
}
namespace nvinfer1 {
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides,
int stridesLength) {
mClassCount = classCount;
mNumberofpoints = numberofpoints;
mConfthreshkeypoints = confthreshkeypoints;
mYoloV8NetWidth = netWidth;
mYoloV8netHeight = netHeight;
mMaxOutObject = maxOut;
mStridesLength = stridesLength;
mStrides = new int[stridesLength];
memcpy(mStrides, strides, stridesLength * sizeof(int));
is_segmentation_ = is_segmentation;
is_pose_ = is_pose;
}
YoloLayerPlugin::~YoloLayerPlugin() {
if (mStrides != nullptr) {
delete[] mStrides;
mStrides = nullptr;
}
}
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
using namespace Tn;
const char *d = reinterpret_cast<const char*>(data), *a = d;
read(d, mClassCount);
read(d, mNumberofpoints);
read(d, mConfthreshkeypoints);
read(d, mThreadCount);
read(d, mYoloV8NetWidth);
read(d, mYoloV8netHeight);
read(d, mMaxOutObject);
read(d, mStridesLength);
mStrides = new int[mStridesLength];
for (int i = 0; i < mStridesLength; ++i) {
read(d, mStrides[i]);
}
read(d, is_segmentation_);
read(d, is_pose_);
assert(d == a + length);
}
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
using namespace Tn;
char *d = static_cast<char*>(buffer), *a = d;
write(d, mClassCount);
write(d, mNumberofpoints);
write(d, mConfthreshkeypoints);
write(d, mThreadCount);
write(d, mYoloV8NetWidth);
write(d, mYoloV8netHeight);
write(d, mMaxOutObject);
write(d, mStridesLength);
for (int i = 0; i < mStridesLength; ++i) {
write(d, mStrides[i]);
}
write(d, is_segmentation_);
write(d, is_pose_);
assert(d == a + getSerializationSize());
}
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) +
sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) +
sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_);
}
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
return 0;
}
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
int nbInputDims) TRT_NOEXCEPT {
int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
return nvinfer1::Dims3(total_size + 1, 1, 1);
}
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
mPluginNamespace = pluginNamespace;
}
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
return mPluginNamespace;
}
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return nvinfer1::DataType::kFLOAT;
}
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
int nbInputs) const TRT_NOEXCEPT {
return false;
}
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
return false;
}
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
return "YoloLayer_TRT";
}
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
return "1";
}
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
delete this;
}
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
YoloLayerPlugin* p =
new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight,
mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength);
p->setPluginNamespace(mPluginNamespace);
return p;
}
int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
return 0;
}
__device__ float Logist(float data) {
return 1.0f / (1.0f + expf(-data));
};
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
bool is_segmentation, bool is_pose) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx >= numElements)
return;
const int N_kpts = nk;
int total_grid = grid_h * grid_w;
int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0);
int batchIdx = idx / total_grid;
int elemIdx = idx % total_grid;
const float* curInput = input + batchIdx * total_grid * info_len;
int outputIdx = batchIdx * outputElem;
int class_id = 0;
float max_cls_prob = 0.0;
for (int i = 4; i < 4 + classes; i++) {
float p = Logist(curInput[elemIdx + i * total_grid]);
if (p > max_cls_prob) {
max_cls_prob = p;
class_id = i - 4;
}
}
if (max_cls_prob < 0.1)
return;
int count = (int)atomicAdd(output + outputIdx, 1);
if (count >= maxoutobject)
return;
char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
Detection* det = (Detection*)(data);
int row = elemIdx / grid_w;
int col = elemIdx % grid_w;
det->conf = max_cls_prob;
det->class_id = class_id;
det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
if (is_segmentation) {
for (int k = 0; k < 32; ++k) {
det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid];
}
}
if (is_pose) {
for (int kpt = 0; kpt < N_kpts; kpt++) {
int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid;
int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid;
int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 2) * total_grid;
float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);
float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;
bool is_within_bbox =
kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];
if (kpt_confidence < confkeypoints || !is_within_bbox) {
det->keypoints[kpt * 3] = -1;
det->keypoints[kpt * 3 + 1] = -1;
det->keypoints[kpt * 3 + 2] = -1;
} else {
det->keypoints[kpt * 3] = kpt_x;
det->keypoints[kpt * 3 + 1] = kpt_y;
det->keypoints[kpt * 3 + 2] = kpt_confidence;
}
}
}
}
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
int mYoloV8NetWidth, int batchSize) {
int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
cudaMemsetAsync(output, 0, sizeof(float), stream);
for (int idx = 0; idx < batchSize; ++idx) {
CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
}
int numElem = 0;
const int maxGrids = mStridesLength;
int grids[maxGrids][2];
for (int i = 0; i < maxGrids; ++i) {
grids[i][0] = mYoloV8netHeight / mStrides[i];
grids[i][1] = mYoloV8NetWidth / mStrides[i];
}
for (unsigned int i = 0; i < maxGrids; i++) {
int grid_h = grids[i][0];
int grid_w = grids[i][1];
int stride = mStrides[i];
numElem = grid_h * grid_w * batchSize;
if (numElem < mThreadCount)
mThreadCount = numElem;
CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(
inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints,
mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_);
}
}
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
YoloPluginCreator::YoloPluginCreator() {
mPluginAttributes.clear();
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
return "YoloLayer_TRT";
}
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
return "1";
}
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
return &mFC;
}
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
assert(fc->nbFields == 1);
assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
int netinfo_count = 8;
int class_count = combinedInfo[0];
int numberofpoints = combinedInfo[1];
float confthreshkeypoints = combinedInfo[2];
int input_w = combinedInfo[3];
int input_h = combinedInfo[4];
int max_output_object_count = combinedInfo[5];
bool is_segmentation = combinedInfo[6];
bool is_pose = combinedInfo[7];
const int* px_arry = combinedInfo + netinfo_count;
int px_arry_length = fc->fields[0].length - netinfo_count;
YoloLayerPlugin* obj =
new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call YoloLayerPlugin::destroy()
YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
} // namespace nvinfer1