#include #include #include #include #include "../include/cuda_utils.h" #include "../include/types.h" #include "yololayer.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn __device__ float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength) { mClassCount = classCount; mNumberofpoints = numberofpoints; mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; mStridesLength = stridesLength; mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; is_pose_ = is_pose; } YoloLayerPlugin::~YoloLayerPlugin() { if (mStrides != nullptr) { delete[] mStrides; mStrides = nullptr; } } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mNumberofpoints); read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); read(d, mStridesLength); mStrides = new int[mStridesLength]; for (int i = 0; i < mStridesLength; ++i) { read(d, mStrides[i]); } read(d, is_segmentation_); read(d, is_pose_); assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mNumberofpoints); write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); write(d, mStridesLength); for (int i = 0; i < mStridesLength; ++i) { write(d, mStrides[i]); } write(d, is_segmentation_); write(d, is_pose_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, bool is_segmentation, bool is_pose) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; const int N_kpts = nk; int total_grid = grid_h * grid_w; int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0); int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; int outputIdx = batchIdx * outputElem; int class_id = 0; float max_cls_prob = 0.0; for (int i = 4; i < 4 + classes; i++) { float p = Logist(curInput[elemIdx + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 4; } } if (max_cls_prob < 0.1) return; int count = (int)atomicAdd(output + outputIdx, 1); if (count >= maxoutobject) return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); int row = elemIdx / grid_w; int col = elemIdx % grid_w; det->conf = max_cls_prob; det->class_id = class_id; det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; if (is_segmentation) { for (int k = 0; k < 32; ++k) { det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid]; } } if (is_pose) { for (int kpt = 0; kpt < N_kpts; kpt++) { int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid; int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid; int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 2) * total_grid; float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; bool is_within_bbox = kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; if (kpt_confidence < confkeypoints || !is_within_bbox) { det->keypoints[kpt * 3] = -1; det->keypoints[kpt * 3 + 1] = -1; det->keypoints[kpt * 3 + 2] = -1; } else { det->keypoints[kpt * 3] = kpt_x; det->keypoints[kpt * 3 + 1] = kpt_y; det->keypoints[kpt * 3 + 2] = kpt_confidence; } } } } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; const int maxGrids = mStridesLength; int grids[maxGrids][2]; for (int i = 0; i < maxGrids; ++i) { grids[i][0] = mYoloV8netHeight / mStrides[i]; grids[i][1] = mYoloV8NetWidth / mStrides[i]; } for (unsigned int i = 0; i < maxGrids; i++) { int grid_h = grids[i][0]; int grid_w = grids[i][1]; int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_); } } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); int netinfo_count = 8; int class_count = combinedInfo[0]; int numberofpoints = combinedInfo[1]; float confthreshkeypoints = combinedInfo[2]; int input_w = combinedInfo[3]; int input_h = combinedInfo[4]; int max_output_object_count = combinedInfo[5]; bool is_segmentation = combinedInfo[6]; bool is_pose = combinedInfo[7]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1