first commit

2026-06-03 12:43:14 +08:00
commit ba76cfae28
608 changed files with 120791 additions and 0 deletions
--- a/nodes/infers/README.md
+++ b/nodes/infers/README.md
@@ -0,0 +1,5 @@
+# Summary
+
+1. If the inference node works on the whole frame, let it derived from `vp_primary_infer_node` class.
+2. If the inference node works on the small cropped images, let it derived from `vp_secondary_infer_node` class.
+3. We can define multi derived classes to handle different types of dl models (`detector/pose estimation/classification`). Also, if they work on the same type of target AND with the same logic (for example, hava the same `preprocess/postprocess`), we can use a unique class to load different deep learning models (like resnet18 and resnet50).
--- a/nodes/infers/vp_classifier_node.cpp
+++ b/nodes/infers/vp_classifier_node.cpp
@@ -0,0 +1,99 @@
+
+
+#include "vp_classifier_node.h"
+
+namespace vp_nodes {
+    
+    vp_classifier_node::vp_classifier_node(std::string node_name, 
+                                            std::string model_path, 
+                                            std::string model_config_path, 
+                                            std::string labels_path, 
+                                            int input_width, 
+                                            int input_height, 
+                                            int batch_size,
+                                            std::vector<int> p_class_ids_applied_to,
+                                            int min_width_applied_to,
+                                            int min_height_applied_to,
+                                            int crop_padding,
+                                            bool need_softmax,
+                                            float scale,
+                                            cv::Scalar mean,
+                                            cv::Scalar std,
+                                            bool swap_rb,
+                                            bool swap_chn):
+                                            vp_secondary_infer_node(node_name,
+                                                                    model_path,
+                                                                    model_config_path,
+                                                                    labels_path,
+                                                                    input_width, 
+                                                                    input_height,
+                                                                    batch_size,
+                                                                    p_class_ids_applied_to,
+                                                                    min_width_applied_to,
+                                                                    min_height_applied_to,
+                                                                    crop_padding,
+                                                                    scale,
+                                                                    mean,
+                                                                    std,
+                                                                    swap_rb,
+                                                                    swap_chn), need_softmax(need_softmax){
+        this->initialized();
+    }
+    
+    vp_classifier_node::~vp_classifier_node() {
+        deinitialized();
+    }
+
+    void vp_classifier_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        // make sure heads of output are not zero
+        assert(raw_outputs.size() > 0);
+        assert(frame_meta_with_batch.size() == 1);
+
+        // just one head of output
+        auto& output = raw_outputs[0];
+        assert(output.dims == 2);
+
+        auto count = output.rows;
+        auto index = 0;
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        cv::Point class_id_point;
+        int class_id;
+        double score;
+
+        for (int i = 0; i < count; i++) {
+            for (int j = index; j < frame_meta->targets.size(); j++)
+            {
+                if (!need_apply(frame_meta->targets[j]->primary_class_id, frame_meta->targets[j]->width, frame_meta->targets[j]->height)) {
+                    // continue as its primary_class_id is not in p_class_ids_applied_to
+                    continue;
+                }
+                
+                auto prob = output.row(i);
+                cv::minMaxLoc(prob, 0, &score, 0, &class_id_point);
+
+                if (need_softmax) {
+                    float maxProb = 0.0;
+                    float sum = 0.0;
+                    cv::Mat softmaxProb;
+                    maxProb = *std::max_element(prob.begin<float>(), prob.end<float>());
+                    cv::exp(prob - maxProb, softmaxProb);
+                    sum = (float)cv::sum(softmaxProb)[0];
+                    softmaxProb /= sum;
+                    minMaxLoc(softmaxProb.reshape(1, 1), 0, &score, 0, &class_id_point);
+                }
+                class_id = class_id_point.x;
+                auto label = (labels.size() < class_id + 1) ? "" : labels[class_id];
+
+                // update back to frame meta
+                frame_meta->targets[j]->secondary_class_ids.push_back(class_id);
+                frame_meta->targets[j]->secondary_scores.push_back(score);
+                frame_meta->targets[j]->secondary_labels.push_back(label);
+
+                // break as we found the right target!
+                index = j + 1;
+                break;
+            }
+        }
+    }
+}
--- a/nodes/infers/vp_classifier_node.h
+++ b/nodes/infers/vp_classifier_node.h
@@ -0,0 +1,38 @@
+
+
+#pragma once
+
+#include "../vp_secondary_infer_node.h"
+
+namespace vp_nodes {
+    // common classifier for image classification task.
+    // used for image classification, update secondary_class_ids/secondary_labels/secondary_scores of vp_frame_target.
+    class vp_classifier_node: public vp_secondary_infer_node
+    {
+    private:
+        // softmax logic applied on output or not
+        bool need_softmax;
+    protected:
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_classifier_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 128, 
+                            int input_height = 128, 
+                            int batch_size = 1,
+                            std::vector<int> p_class_ids_applied_to = std::vector<int>(),
+                            int min_width_applied_to = 0,
+                            int min_height_applied_to = 0,
+                            int crop_padding = 10,
+                            bool need_softmax = true,  // imagenet dataset
+                            float scale = 1 / 255.0,
+                            cv::Scalar mean = cv::Scalar(123.675, 116.28, 103.53),  // imagenet dataset
+                            cv::Scalar std = cv::Scalar(0.229, 0.224, 0.225),
+                            bool swap_rb = true,
+                            bool swap_chn = false);
+        ~vp_classifier_node();
+    };
+
+}
--- a/nodes/infers/vp_enet_seg_node.cpp
+++ b/nodes/infers/vp_enet_seg_node.cpp
@@ -0,0 +1,33 @@
+
+#include "vp_enet_seg_node.h"
+
+namespace vp_nodes {
+
+    vp_enet_seg_node::vp_enet_seg_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path, 
+                            std::string labels_path, 
+                            int input_width, 
+                            int input_height, 
+                            int batch_size,
+                            int class_id_offset,
+                            float scale,
+                            cv::Scalar mean,
+                            cv::Scalar std,
+                            bool swap_rb):
+                            vp_primary_infer_node(node_name, model_path, model_config_path, labels_path, input_width, input_height, batch_size, class_id_offset, scale, mean, std, swap_rb) {
+        this->initialized();
+    }
+
+    vp_enet_seg_node::~vp_enet_seg_node() {
+        deinitialized();
+    }
+
+    void vp_enet_seg_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto& mask = raw_outputs[0];
+
+        // save output as mask directly， this is the mask for the whole frame.
+        frame_meta->mask = mask.clone();
+    }
+}
--- a/nodes/infers/vp_enet_seg_node.h
+++ b/nodes/infers/vp_enet_seg_node.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+
+
+namespace vp_nodes {
+    // semantic segmentation based on ENet
+    // 
+    class vp_enet_seg_node: public vp_primary_infer_node
+    {
+    private:
+        /* data */
+    protected:
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_enet_seg_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 1024, 
+                            int input_height = 512, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float scale = 1 / 255.0,
+                            cv::Scalar mean = cv::Scalar(0),
+                            cv::Scalar std = cv::Scalar(1),
+                            bool swap_rb = true);
+        ~vp_enet_seg_node();
+    };
+}
--- a/nodes/infers/vp_face_swap_node.cpp
+++ b/nodes/infers/vp_face_swap_node.cpp
@@ -0,0 +1,528 @@
+
+#include "vp_face_swap_node.h"
+
+namespace vp_nodes {
+    
+    vp_face_swap_node::vp_face_swap_node(std::string node_name, 
+                                        std::string yunet_face_detect_model,
+                                        std::string buffalo_l_face_encoding_model,
+                                        std::string emap_file_for_embeddings,
+                                        std::string insightface_swap_model,
+                                        std::string swap_source_image,
+                                        int swap_source_face_index,
+                                        int min_face_w_h,
+                                        bool swap_on_osd,
+                                        bool act_as_primary_detector):
+                                        vp_primary_infer_node(node_name, ""),
+                                        act_as_primary_detector(act_as_primary_detector),
+                                        swap_on_osd(swap_on_osd),
+                                        min_face_w_h(min_face_w_h) {
+        /* init net*/
+        face_extract_net = cv::dnn::readNetFromONNX(yunet_face_detect_model);
+        face_encoding_net = cv::dnn::readNetFromONNX(buffalo_l_face_encoding_model);
+        face_swap_net = cv::dnn::readNetFromONNX(insightface_swap_model);
+        #ifdef VP_WITH_CUDA
+        face_extract_net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+        face_extract_net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+        face_encoding_net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+        face_encoding_net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+        face_swap_net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+        face_swap_net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+        #endif
+        init_source_face_embeddings(swap_source_image, swap_source_face_index, emap_file_for_embeddings);
+        this->initialized();
+    }
+    
+    vp_face_swap_node::~vp_face_swap_node() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_face_swap_node::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        if (frame_meta->face_targets.size() == 0) {
+            if (!act_as_primary_detector) {
+                return;
+            }
+            // extract faces and update back to frame meta
+            // to-do
+        }
+        
+        auto start_time = std::chrono::system_clock::now();
+        // iterate each face target
+        for (int i = 0; i < frame_meta->face_targets.size(); i++) {
+            if (frame_meta->face_targets[i]->width < min_face_w_h || frame_meta->face_targets[i]->height < min_face_w_h) {
+                continue;
+            }
+            
+            cv::Mat aligned_face, swapped_face;
+
+            //auto t = std::chrono::system_clock::now();
+            // align and crop first
+            auto warp_mat = align_crop(frame_meta->frame, frame_meta->face_targets[i]->key_points, aligned_face);
+            //auto T = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - t).count();
+
+            //std::cout << "1st T:" << T << std::endl;
+            //t = std::chrono::system_clock::now();
+            // swap
+            swap(aligned_face, swapped_face);
+            //T = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - t).count();
+            //std::cout << "2nd T:" << T << std::endl;
+
+            //t = std::chrono::system_clock::now();
+            // past back to frame or osd frame
+            if (swap_on_osd) {
+                if (frame_meta->osd_frame.empty()) {
+                    frame_meta->osd_frame = frame_meta->frame.clone();
+                }
+            }
+            auto& bg = swap_on_osd ? frame_meta->osd_frame : frame_meta->frame;
+            paste_back(bg, swapped_face, warp_mat);
+            //T = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - t).count();
+            //std::cout << "3rd T:" << T << std::endl;
+        }
+
+        //auto total_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time).count();
+        //std::cout << "total T:" << total_time << std::endl;
+    }
+
+    cv::Mat vp_face_swap_node::read_emap_mat_from_txt(const std::string& emap_file_for_embeddings, int rows, int cols) {
+        std::ifstream file(emap_file_for_embeddings);
+
+        cv::Mat matrix(rows, cols, CV_32F);
+        std::string line;
+        for (int i = 0; i < rows; ++i) {
+            std::getline(file, line);
+            std::istringstream iss(line);
+            for (int j = 0; j < cols; ++j) {
+                float value;
+                assert(iss >> value);
+                matrix.at<float>(i, j) = value;
+            }
+        }
+        file.close();
+        return matrix;
+    }
+
+    cv::Mat vp_face_swap_node::process_embeddings_using_emap(const cv::Mat& source_face_normed_embedding, const cv::Mat& emap) {
+        cv::Mat latent = source_face_normed_embedding.clone().reshape(1, 1);
+        cv::Mat result = latent * emap;
+        cv::normalize(result, result);
+        return result;
+    }
+
+    cv::Mat vp_face_swap_node::get_similarity_transform_matrix(float src[5][2]) {
+        using namespace cv;
+        //float dst[5][2] = { {38.2946f, 51.6963f}, {73.5318f, 51.5014f}, {56.0252f, 71.7366f}, {41.5493f, 92.3655f}, {70.7299f, 92.2041f} };  // for 112*112
+        float dst[5][2] = { {43.0f, 58.0f}, {85.0f, 58.0f}, {64.0f, 81.0f}, {47.0f, 105.0f}, {80.0f, 105.0f} };  // for 128*128
+        //float dst[5][2] = { {38.0f, 54.0f}, {90.0f, 54.0f}, {64.0f, 85.0f}, {47.0f, 109.0f}, {80.0f, 109.0f} };  // for 128*128, zoom out
+        float avg0 = (src[0][0] + src[1][0] + src[2][0] + src[3][0] + src[4][0]) / 5;
+        float avg1 = (src[0][1] + src[1][1] + src[2][1] + src[3][1] + src[4][1]) / 5;
+        //Compute mean of src and dst.
+        float src_mean[2] = { avg0, avg1 };
+        float dst_mean[2] = { 56.0262f, 71.9008f };
+        //Subtract mean from src and dst.
+        float src_demean[5][2];
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 5; j++)
+            {
+                src_demean[j][i] = src[j][i] - src_mean[i];
+            }
+        }
+        float dst_demean[5][2];
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 5; j++)
+            {
+                dst_demean[j][i] = dst[j][i] - dst_mean[i];
+            }
+        }
+        double A00 = 0.0, A01 = 0.0, A10 = 0.0, A11 = 0.0;
+        for (int i = 0; i < 5; i++)
+            A00 += dst_demean[i][0] * src_demean[i][0];
+        A00 = A00 / 5;
+        for (int i = 0; i < 5; i++)
+            A01 += dst_demean[i][0] * src_demean[i][1];
+        A01 = A01 / 5;
+        for (int i = 0; i < 5; i++)
+            A10 += dst_demean[i][1] * src_demean[i][0];
+        A10 = A10 / 5;
+        for (int i = 0; i < 5; i++)
+            A11 += dst_demean[i][1] * src_demean[i][1];
+        A11 = A11 / 5;
+        Mat A = (Mat_<double>(2, 2) << A00, A01, A10, A11);
+        double d[2] = { 1.0, 1.0 };
+        double detA = A00 * A11 - A01 * A10;
+        if (detA < 0)
+            d[1] = -1;
+        double T[3][3] = { {1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0} };
+        Mat s, u, vt, v;
+        SVD::compute(A, s, u, vt);
+        double smax = s.ptr<double>(0)[0]>s.ptr<double>(1)[0] ? s.ptr<double>(0)[0] : s.ptr<double>(1)[0];
+        double tol = smax * 2 * FLT_MIN;
+        int rank = 0;
+        if (s.ptr<double>(0)[0]>tol)
+            rank += 1;
+        if (s.ptr<double>(1)[0]>tol)
+            rank += 1;
+        double arr_u[2][2] = { {u.ptr<double>(0)[0], u.ptr<double>(0)[1]}, {u.ptr<double>(1)[0], u.ptr<double>(1)[1]} };
+        double arr_vt[2][2] = { {vt.ptr<double>(0)[0], vt.ptr<double>(0)[1]}, {vt.ptr<double>(1)[0], vt.ptr<double>(1)[1]} };
+        double det_u = arr_u[0][0] * arr_u[1][1] - arr_u[0][1] * arr_u[1][0];
+        double det_vt = arr_vt[0][0] * arr_vt[1][1] - arr_vt[0][1] * arr_vt[1][0];
+        if (rank == 1)
+        {
+            if ((det_u*det_vt) > 0)
+            {
+                Mat uvt = u*vt;
+                T[0][0] = uvt.ptr<double>(0)[0];
+                T[0][1] = uvt.ptr<double>(0)[1];
+                T[1][0] = uvt.ptr<double>(1)[0];
+                T[1][1] = uvt.ptr<double>(1)[1];
+            }
+            else
+            {
+                double temp = d[1];
+                d[1] = -1;
+                Mat D = (Mat_<double>(2, 2) << d[0], 0.0, 0.0, d[1]);
+                Mat Dvt = D*vt;
+                Mat uDvt = u*Dvt;
+                T[0][0] = uDvt.ptr<double>(0)[0];
+                T[0][1] = uDvt.ptr<double>(0)[1];
+                T[1][0] = uDvt.ptr<double>(1)[0];
+                T[1][1] = uDvt.ptr<double>(1)[1];
+                d[1] = temp;
+            }
+        }
+        else
+        {
+            Mat D = (Mat_<double>(2, 2) << d[0], 0.0, 0.0, d[1]);
+            Mat Dvt = D*vt;
+            Mat uDvt = u*Dvt;
+            T[0][0] = uDvt.ptr<double>(0)[0];
+            T[0][1] = uDvt.ptr<double>(0)[1];
+            T[1][0] = uDvt.ptr<double>(1)[0];
+            T[1][1] = uDvt.ptr<double>(1)[1];
+        }
+        double var1 = 0.0;
+        for (int i = 0; i < 5; i++)
+            var1 += src_demean[i][0] * src_demean[i][0];
+        var1 = var1 / 5;
+        double var2 = 0.0;
+        for (int i = 0; i < 5; i++)
+            var2 += src_demean[i][1] * src_demean[i][1];
+        var2 = var2 / 5;
+        double scale = 1.0 / (var1 + var2)* (s.ptr<double>(0)[0] * d[0] + s.ptr<double>(1)[0] * d[1]);
+        double TS[2];
+        TS[0] = T[0][0] * src_mean[0] + T[0][1] * src_mean[1];
+        TS[1] = T[1][0] * src_mean[0] + T[1][1] * src_mean[1];
+        T[0][2] = dst_mean[0] - scale*TS[0];
+        T[1][2] = dst_mean[1] - scale*TS[1];
+        T[0][0] *= scale;
+        T[0][1] *= scale;
+        T[1][0] *= scale;
+        T[1][1] *= scale;
+        Mat transform_mat = (Mat_<double>(2, 3) << T[0][0], T[0][1], T[0][2], T[1][0], T[1][1], T[1][2]);
+        return transform_mat;
+    }
+
+    cv::Mat vp_face_swap_node::align_crop(cv::Mat& src_img, std::vector<std::pair<int, int>>& kps, cv::Mat& aligned_image) {
+        float face_keypoints[5][2] = 
+            {{kps[0].first, kps[0].second}, 
+            {kps[1].first, kps[1].second}, 
+            {kps[2].first, kps[2].second},  
+            {kps[3].first, kps[3].second}, 
+            {kps[4].first, kps[4].second}};
+
+        cv::Mat warp_mat = get_similarity_transform_matrix(face_keypoints);
+        cv::warpAffine(src_img, aligned_image, warp_mat, cv::Size(128, 128), cv::INTER_LINEAR);
+        return warp_mat;
+    }
+    
+    void vp_face_swap_node::extract_faces(const cv::Mat& frame, std::vector<face_box>& face_boxes) {
+        auto blob_to_infer = cv::dnn::blobFromImage(frame);
+        face_extract_net.setInput(blob_to_infer);
+        const std::vector<std::string> out_names = {"loc", "conf", "iou"};
+        std::vector<cv::Mat> raw_outputs;
+        face_extract_net.forward(raw_outputs, out_names);
+
+        using namespace cv;
+        float scoreThreshold = 0.7;
+        float nmsThreshold = 0.5;
+        int topK = 50;
+        int inputW = frame.cols;
+        int inputH = frame.rows;
+
+        // 3 heads of output
+        assert(raw_outputs.size() == 3);
+
+        // Extract from output_blobs
+        Mat loc = raw_outputs[0];
+        Mat conf = raw_outputs[1];
+        Mat iou = raw_outputs[2];
+
+        // we need generate priors if input size changed or priors is not initialized
+        if (loc.rows != priors.size()) {
+            generatePriors(inputW, inputH);
+        }
+        
+        assert(loc.rows == priors.size());
+        assert(loc.rows == conf.rows);
+        assert(loc.rows == iou.rows);
+
+        // Decode from deltas and priors
+        const std::vector<float> variance = {0.1f, 0.2f};
+        float* loc_v = (float*)(loc.data);
+        float* conf_v = (float*)(conf.data);
+        float* iou_v = (float*)(iou.data);
+        Mat faces;
+        // (tl_x, tl_y, w, h, re_x, re_y, le_x, le_y, nt_x, nt_y, rcm_x, rcm_y, lcm_x, lcm_y, score)
+        // 'tl': top left point of the bounding box
+        // 're': right eye, 'le': left eye
+        // 'nt':  nose tip
+        // 'rcm': right corner of mouth, 'lcm': left corner of mouth
+        Mat face(1, 15, CV_32FC1);
+        for (size_t i = 0; i < priors.size(); ++i) {
+            // Get score
+            float clsScore = conf_v[i*2+1];
+            float iouScore = iou_v[i];
+            // Clamp
+            if (iouScore < 0.f) {
+                iouScore = 0.f;
+            }
+            else if (iouScore > 1.f) {
+                iouScore = 1.f;
+            }
+            float score = std::sqrt(clsScore * iouScore);
+            face.at<float>(0, 14) = score;
+
+            // Get bounding box
+            float cx = (priors[i].x + loc_v[i*14+0] * variance[0] * priors[i].width)  * inputW;
+            float cy = (priors[i].y + loc_v[i*14+1] * variance[0] * priors[i].height) * inputH;
+            float w  = priors[i].width  * exp(loc_v[i*14+2] * variance[0]) * inputW;
+            float h  = priors[i].height * exp(loc_v[i*14+3] * variance[1]) * inputH;
+            float x1 = cx - w / 2;
+            float y1 = cy - h / 2;
+            face.at<float>(0, 0) = x1;
+            face.at<float>(0, 1) = y1;
+            face.at<float>(0, 2) = w;
+            face.at<float>(0, 3) = h;
+
+            // Get landmarks
+            face.at<float>(0, 4) = (priors[i].x + loc_v[i*14+ 4] * variance[0] * priors[i].width)  * inputW;  // right eye, x
+            face.at<float>(0, 5) = (priors[i].y + loc_v[i*14+ 5] * variance[0] * priors[i].height) * inputH;  // right eye, y
+            face.at<float>(0, 6) = (priors[i].x + loc_v[i*14+ 6] * variance[0] * priors[i].width)  * inputW;  // left eye, x
+            face.at<float>(0, 7) = (priors[i].y + loc_v[i*14+ 7] * variance[0] * priors[i].height) * inputH;  // left eye, y
+            face.at<float>(0, 8) = (priors[i].x + loc_v[i*14+ 8] * variance[0] * priors[i].width)  * inputW;  // nose tip, x
+            face.at<float>(0, 9) = (priors[i].y + loc_v[i*14+ 9] * variance[0] * priors[i].height) * inputH;  // nose tip, y
+            face.at<float>(0, 10) = (priors[i].x + loc_v[i*14+10] * variance[0] * priors[i].width)  * inputW; // right corner of mouth, x
+            face.at<float>(0, 11) = (priors[i].y + loc_v[i*14+11] * variance[0] * priors[i].height) * inputH; // right corner of mouth, y
+            face.at<float>(0, 12) = (priors[i].x + loc_v[i*14+12] * variance[0] * priors[i].width)  * inputW; // left corner of mouth, x
+            face.at<float>(0, 13) = (priors[i].y + loc_v[i*14+13] * variance[0] * priors[i].height) * inputH; // left corner of mouth, y
+
+            faces.push_back(face);
+        }
+
+        if (faces.rows > 1) {
+            // Retrieve boxes and scores
+            std::vector<Rect2i> faceBoxes;
+            std::vector<float> faceScores;
+            for (int rIdx = 0; rIdx < faces.rows; rIdx++)
+            {
+                faceBoxes.push_back(Rect2i(int(faces.at<float>(rIdx, 0)),
+                                           int(faces.at<float>(rIdx, 1)),
+                                           int(faces.at<float>(rIdx, 2)),
+                                           int(faces.at<float>(rIdx, 3))));
+                faceScores.push_back(faces.at<float>(rIdx, 14));
+            }
+
+            std::vector<int> keepIdx;
+            dnn::NMSBoxes(faceBoxes, faceScores, scoreThreshold, nmsThreshold, keepIdx, 1.f, topK);
+
+            // Get NMS results
+            Mat nms_faces;
+            for (int idx: keepIdx)
+            {
+                nms_faces.push_back(faces.row(idx));
+            }
+
+            // insert face target back to frame meta
+            for (int i = 0; i < nms_faces.rows; i++) {
+                auto x = int(nms_faces.at<float>(i, 0));
+                auto y = int(nms_faces.at<float>(i, 1));
+                auto w = int(nms_faces.at<float>(i, 2));
+                auto h = int(nms_faces.at<float>(i, 3));
+                
+                // check value range
+                x = std::max(x, 0);
+                y = std::max(y, 0);
+                w = std::min(w, frame.cols - x);
+                h = std::min(h, frame.rows - y);
+
+                auto kp1 = std::pair<int, int>(int(nms_faces.at<float>(i, 4)), int(nms_faces.at<float>(i, 5)));
+                auto kp2 = std::pair<int, int>(int(nms_faces.at<float>(i, 6)), int(nms_faces.at<float>(i, 7)));
+                auto kp3 = std::pair<int, int>(int(nms_faces.at<float>(i, 8)), int(nms_faces.at<float>(i, 9)));
+                auto kp4 = std::pair<int, int>(int(nms_faces.at<float>(i, 10)), int(nms_faces.at<float>(i, 11)));
+                auto kp5 = std::pair<int, int>(int(nms_faces.at<float>(i, 12)), int(nms_faces.at<float>(i, 13)));
+                auto score = nms_faces.at<float>(i, 14);
+
+                face_box face;
+                face.x = x;
+                face.y = y;
+                face.width = w;
+                face.height = h;
+                face.score = score;
+                face.kps = std::vector<std::pair<int, int>>{kp1, kp2, kp3, kp4, kp5};
+                face_boxes.push_back(face);
+            }
+        }
+    }
+
+    void vp_face_swap_node::init_source_face_embeddings(std::string& swap_source_image, int swap_source_face_index, std::string& emap_file_for_embeddings) {
+        std::vector<face_box> source_faces;
+        auto source_mat = cv::imread(swap_source_image);
+
+        // extract faces
+        extract_faces(source_mat, source_faces);
+
+        assert(source_faces.size() > 0);
+        // sort from left to right
+        std::sort(source_faces.begin(), source_faces.end(), [](face_box a, face_box b){ return a.x < b.x;});
+
+        auto the_selected_face = (swap_source_face_index < 0 || swap_source_face_index >= source_faces.size()) ? source_faces[0] : source_faces[swap_source_face_index];
+        cv::Mat aligned_face;
+        auto warp_mat = align_crop(source_mat, the_selected_face.kps, aligned_face);
+        // for debug
+        cv::imwrite("selected_source_face.jpg", aligned_face);
+
+        // read emap from file
+        auto emap = read_emap_mat_from_txt(emap_file_for_embeddings);
+
+        // encoding for the selected face (infer for only 1 time)
+        cv::Mat source_blob = cv::dnn::blobFromImage(aligned_face, 1 / 127.5, cv::Size(112, 112), (127.5, 127.5, 127.5), true);
+        std::vector<cv::Mat> source_outputs;
+        face_encoding_net.setInput(source_blob);
+        face_encoding_net.forward(source_outputs, face_encoding_net.getUnconnectedOutLayersNames());
+
+        auto& source_output = source_outputs[0];
+        // process using emap
+        source_face_embeddings = process_embeddings_using_emap(source_output, emap);
+    }
+
+    void vp_face_swap_node::swap(cv::Mat& aligned_face, cv::Mat& swapped_face) {
+        cv::Mat target_blob = cv::dnn::blobFromImage(aligned_face, 1 / 255.0, cv::Size(128, 128), (0, 0, 0), true);
+        std::vector<cv::Mat> target_outputs;
+        face_swap_net.setInput(target_blob, "target");
+        face_swap_net.setInput(source_face_embeddings, "source");
+        face_swap_net.forward(target_outputs, face_swap_net.getUnconnectedOutLayersNames());
+
+        auto& output = target_outputs[0];
+        cv::Mat output_channel_last;
+        cv::transposeND(output, {0,2,3,1}, output_channel_last);
+
+        cv::Mat img_fake(output_channel_last.size[1], output_channel_last.size[2], CV_32FC3, output_channel_last.data);
+        cv::cvtColor(img_fake, swapped_face, cv::COLOR_RGB2BGR);
+    }
+
+    void vp_face_swap_node::paste_back(cv::Mat& bg, cv::Mat& swapped_face, const cv::Mat& transform_matrix) {
+        bg.convertTo(bg, CV_32FC3, 1.0 / 255);
+
+        cv::Mat IM;
+        cv::invertAffineTransform(transform_matrix, IM);
+        cv::Mat img_mask(swapped_face.rows, swapped_face.cols, CV_32FC1, 1);
+        cv::warpAffine(swapped_face, swapped_face, IM, bg.size());
+        cv::warpAffine(img_mask, img_mask, IM, bg.size());
+
+        // create mask
+        cv::threshold(img_mask, img_mask, 0, 1, cv::THRESH_BINARY);
+        cv::Point min_loc, max_loc;
+        double min_val, max_val;
+        cv::minMaxLoc(img_mask, &min_val, &max_val, &min_loc, &max_loc);
+
+        int mask_h = max_loc.y - min_loc.y;
+        int mask_w = max_loc.x - min_loc.x;
+
+        int mask_size = std::sqrt(mask_h * mask_w);
+        int k = std::max(mask_size / 10, 10);
+
+        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(k, k));
+        cv::erode(img_mask, img_mask, kernel);
+
+        k = std::max(mask_size / 20, 5);
+        cv::GaussianBlur(img_mask, img_mask, cv::Size(2 * k + 1, 2 * k + 1), 0);
+        cv::cvtColor(img_mask, img_mask, cv::COLOR_GRAY2BGR);
+
+        // merge swapped face and original background
+        cv::Mat ones(bg.rows, bg.cols, CV_32FC3, cv::Scalar(1, 1, 1));
+        bg = img_mask.mul(swapped_face) + (ones - img_mask).mul(bg);
+        bg.convertTo(bg, CV_8U, 255);
+    }
+
+    void vp_face_swap_node::generatePriors(int inputW, int inputH) {
+         using namespace cv;
+        // Calculate shapes of different scales according to the shape of input image
+        Size feature_map_2nd = {
+            int(int((inputW+1)/2)/2), int(int((inputH+1)/2)/2)
+        };
+        Size feature_map_3rd = {
+            int(feature_map_2nd.width/2), int(feature_map_2nd.height/2)
+        };
+        Size feature_map_4th = {
+            int(feature_map_3rd.width/2), int(feature_map_3rd.height/2)
+        };
+        Size feature_map_5th = {
+            int(feature_map_4th.width/2), int(feature_map_4th.height/2)
+        };
+        Size feature_map_6th = {
+            int(feature_map_5th.width/2), int(feature_map_5th.height/2)
+        };
+
+        std::vector<Size> feature_map_sizes;
+        feature_map_sizes.push_back(feature_map_3rd);
+        feature_map_sizes.push_back(feature_map_4th);
+        feature_map_sizes.push_back(feature_map_5th);
+        feature_map_sizes.push_back(feature_map_6th);
+
+        // Fixed params for generating priors
+        const std::vector<std::vector<float>> min_sizes = {
+            {10.0f,  16.0f,  24.0f},
+            {32.0f,  48.0f},
+            {64.0f,  96.0f},
+            {128.0f, 192.0f, 256.0f}
+        };
+        CV_Assert(min_sizes.size() == feature_map_sizes.size()); // just to keep vectors in sync
+        const std::vector<int> steps = { 8, 16, 32, 64 };
+
+        // Generate priors
+        priors.clear();
+        for (size_t i = 0; i < feature_map_sizes.size(); ++i)
+        {
+            Size feature_map_size = feature_map_sizes[i];
+            std::vector<float> min_size = min_sizes[i];
+
+            for (int _h = 0; _h < feature_map_size.height; ++_h)
+            {
+                for (int _w = 0; _w < feature_map_size.width; ++_w)
+                {
+                    for (size_t j = 0; j < min_size.size(); ++j)
+                    {
+                        float s_kx = min_size[j] / inputW;
+                        float s_ky = min_size[j] / inputH;
+
+                        float cx = (_w + 0.5f) * steps[i] / inputW;
+                        float cy = (_h + 0.5f) * steps[i] / inputH;
+
+                        Rect2f prior = { cx, cy, s_kx, s_ky };
+                        priors.push_back(prior);
+                    }
+                }
+            }
+        }
+    }
+
+    void vp_face_swap_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
--- a/nodes/infers/vp_face_swap_node.h
+++ b/nodes/infers/vp_face_swap_node.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+
+namespace vp_nodes {
+    // face swap node
+    // used to swap faces in videos/images using a specific face
+    class vp_face_swap_node: public vp_primary_infer_node
+    {   
+    private:
+        // inner temporary use
+        struct face_box {
+            int x;
+            int y;
+            int width;
+            int height;
+            float score;
+            std::vector<std::pair<int, int>> kps;
+        };
+
+        /* onnx network using opencv::dnn as backend */
+        cv::dnn::Net face_extract_net;
+        cv::dnn::Net face_encoding_net;
+        cv::dnn::Net face_swap_net;
+
+        cv::Mat read_emap_mat_from_txt(const std::string& emap_file_for_embeddings, int rows = 512, int cols = 512);
+        cv::Mat process_embeddings_using_emap(const cv::Mat& source_face_normed_embedding, const cv::Mat& emap);
+
+        cv::Mat get_similarity_transform_matrix(float src[5][2]);
+        cv::Mat align_crop(cv::Mat& _src_img, std::vector<std::pair<int, int>>& kps, cv::Mat& _aligned_img);
+        
+        void extract_faces(const cv::Mat& frame, std::vector<face_box>& faces);
+        void init_source_face_embeddings(std::string& swap_source_image, int swap_source_face_index, std::string& emap_file_for_embeddings);
+        void swap(cv::Mat& aligned_face, cv::Mat& swapped_face);
+        void paste_back(cv::Mat& bg, cv::Mat& swapped_face, const cv::Mat& transform_matrix);
+
+        cv::Mat source_face_embeddings;
+        bool act_as_primary_detector = false;
+        bool swap_on_osd = true;
+
+        int min_face_w_h = 50;
+
+        /* used for extract faces */
+        std::vector<cv::Rect2f> priors;
+        void generatePriors(int inputW, int inputH);
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_face_swap_node(std::string node_name, 
+                            std::string yunet_face_detect_model,
+                            std::string buffalo_l_face_encoding_model,
+                            std::string emap_file_for_embeddings,
+                            std::string insightface_swap_model,
+                            std::string swap_source_image,
+                            int swap_source_face_index = 0,
+                            int min_face_w_h = 50,
+                            bool swap_on_osd = true,
+                            bool act_as_primary_detector = false);
+        ~vp_face_swap_node();
+    };
+}
--- a/nodes/infers/vp_feature_encoder_node.cpp
+++ b/nodes/infers/vp_feature_encoder_node.cpp
@@ -0,0 +1,15 @@
+
+#include "vp_feature_encoder_node.h"
+
+
+namespace vp_nodes {
+    
+    vp_feature_encoder_node::vp_feature_encoder_node(std::string node_name, std::string model_path):
+                                                    vp_secondary_infer_node(node_name, model_path) {
+        this->initialized();
+    }
+    
+    vp_feature_encoder_node::~vp_feature_encoder_node() {
+        deinitialized();        
+    } 
+}
--- a/nodes/infers/vp_feature_encoder_node.h
+++ b/nodes/infers/vp_feature_encoder_node.h
@@ -0,0 +1,18 @@
+
+#pragma once
+
+#include "../vp_secondary_infer_node.h"
+
+namespace vp_nodes {
+    // common feature encoder for image feature extraction.
+    // used for feature extraction, update embeddings of vp_frame_target.
+    class vp_feature_encoder_node: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+    public:
+        vp_feature_encoder_node(std::string node_name, std::string model_path);
+        ~vp_feature_encoder_node();
+    };
+
+}
--- a/nodes/infers/vp_lane_detector_node.cpp
+++ b/nodes/infers/vp_lane_detector_node.cpp
@@ -0,0 +1,34 @@
+
+#include "vp_lane_detector_node.h"
+
+namespace vp_nodes {
+
+    vp_lane_detector_node::vp_lane_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path, 
+                            std::string labels_path, 
+                            int input_width, 
+                            int input_height, 
+                            int batch_size,
+                            int class_id_offset,
+                            float scale,
+                            cv::Scalar mean,
+                            cv::Scalar std,
+                            bool swap_rb,
+                            bool swap_chn):
+                            vp_primary_infer_node(node_name, model_path, model_config_path, labels_path, input_width, input_height, batch_size, class_id_offset, scale, mean, std, swap_rb, swap_chn) {
+        this->initialized();
+    }
+
+    vp_lane_detector_node::~vp_lane_detector_node() {
+        deinitialized();
+    }
+
+    void vp_lane_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto& mask = raw_outputs[0];
+
+        // save output as mask directly， this is the mask for the whole frame.
+        frame_meta->mask = mask.clone();
+    }
+}
--- a/nodes/infers/vp_lane_detector_node.h
+++ b/nodes/infers/vp_lane_detector_node.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+
+
+namespace vp_nodes {
+    // lane detect based on CenterNet
+    // 
+    class vp_lane_detector_node: public vp_primary_infer_node
+    {
+    private:
+        /* data */
+    protected:
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_lane_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 736, 
+                            int input_height = 416, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float scale = 1,
+                            cv::Scalar mean = cv::Scalar(0),
+                            cv::Scalar std = cv::Scalar(1),
+                            bool swap_rb = true,
+                            bool swap_chn = true);
+        ~vp_lane_detector_node();
+    };
+}
--- a/nodes/infers/vp_mask_rcnn_detector_node.cpp
+++ b/nodes/infers/vp_mask_rcnn_detector_node.cpp
@@ -0,0 +1,89 @@
+
+#include "vp_mask_rcnn_detector_node.h"
+
+
+namespace vp_nodes {
+    vp_mask_rcnn_detector_node::vp_mask_rcnn_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path, 
+                            std::string labels_path, 
+                            int input_width, 
+                            int input_height, 
+                            int batch_size,
+                            int class_id_offset,
+                            float score_threshold,
+                            float scale,
+                            cv::Scalar mean,
+                            cv::Scalar std,
+                            bool swap_rb):
+                            vp_primary_infer_node(node_name, model_path, model_config_path, labels_path, input_width, input_height, batch_size, class_id_offset, scale, mean, std, swap_rb),
+                            score_threshold(score_threshold) {
+        this->initialized();
+    }
+    
+    vp_mask_rcnn_detector_node::~vp_mask_rcnn_detector_node() {
+        deinitialized();
+    }
+
+
+    void vp_mask_rcnn_detector_node::infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) {
+        // blob_to_infer is a 4D matrix
+        // the first dim is number of batch, MUST be 1
+        assert(blob_to_infer.dims == 4);
+        assert(blob_to_infer.size[0] == 1);
+        assert(!net.empty());
+
+        net.setInput(blob_to_infer);
+        net.forward(raw_outputs, out_names);
+    }
+
+
+    void vp_mask_rcnn_detector_node::preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) {
+        // ignore preprocess logic in base class
+        cv::dnn::blobFromImages(mats_to_infer, blob_to_infer, 1.0, cv::Size(), cv::Scalar(), true, false);
+    }
+
+    void vp_mask_rcnn_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        auto& frame_meta = frame_meta_with_batch[0];
+        
+        auto outDetections = raw_outputs[0];
+        auto outMasks = raw_outputs[1];
+    
+        // Output size of masks is NxCxHxW where
+        // N - number of detected boxes
+        // C - number of classes (excluding background)
+        // HxW - segmentation shape
+        const int numDetections = outDetections.size[2];
+        const int numClasses = outMasks.size[1];
+    
+        auto outDetections_ = outDetections.reshape(1, outDetections.total() / 7);
+        auto& frame = frame_meta->frame;
+        for (int i = 0; i < numDetections; ++i) {
+            float score = outDetections_.at<float>(i, 2);
+            if (score > score_threshold) {
+                // Extract the bounding box
+                int classId = static_cast<int>(outDetections_.at<float>(i, 1));
+                int left = static_cast<int>(frame.cols * outDetections_.at<float>(i, 3));
+                int top = static_cast<int>(frame.rows * outDetections_.at<float>(i, 4));
+                int right = static_cast<int>(frame.cols * outDetections_.at<float>(i, 5));
+                int bottom = static_cast<int>(frame.rows * outDetections_.at<float>(i, 6));
+    
+                left = max(0, min(left, frame.cols - 1));
+                top = max(0, min(top, frame.rows - 1));
+                right = max(0, min(right, frame.cols - 1));
+                bottom = max(0, min(bottom, frame.rows - 1));
+
+                auto label = (labels.size() < classId + 1) ? "" : labels[classId];
+
+                // Extract the mask for the object
+                cv::Mat objectMask(outMasks.size[2], outMasks.size[3], CV_32F, outMasks.ptr<float>(i, classId));
+                
+                classId += class_id_offset;
+                auto target = std::make_shared<vp_objects::vp_frame_target>(left, top, right - left, bottom - top, classId, score, frame_meta->frame_index, frame_meta->channel_index, label);
+                target->mask = objectMask;
+
+                frame_meta->targets.push_back(target);
+            }
+        }
+    }
+}
--- a/nodes/infers/vp_mask_rcnn_detector_node.h
+++ b/nodes/infers/vp_mask_rcnn_detector_node.h
@@ -0,0 +1,41 @@
+
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+#include "../../objects/vp_frame_target.h"
+
+
+
+namespace vp_nodes {
+    // image segmentation based on Mask RCNN
+    // https://github.com/matterport/Mask_RCNN
+    class vp_mask_rcnn_detector_node: public vp_primary_infer_node
+    {
+    private:
+        /* data */
+        // names of output layers in mask_rcnn
+        const std::vector<std::string> out_names = {"detection_out_final", "detection_masks"};
+        float score_threshold = 0.5;
+    protected:
+        // override infer and preprocess as mask_rcnn has a different logic
+        virtual void infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) override;
+        virtual void preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) override;
+
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_mask_rcnn_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 416, 
+                            int input_height = 416, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float score_threshold = 0.5,
+                            float scale = 1 / 255.0,
+                            cv::Scalar mean = cv::Scalar(0),
+                            cv::Scalar std = cv::Scalar(1),
+                            bool swap_rb = true);
+        ~vp_mask_rcnn_detector_node();
+    };    
+}
--- a/nodes/infers/vp_mllm_analyser_node.cpp
+++ b/nodes/infers/vp_mllm_analyser_node.cpp
@@ -0,0 +1,34 @@
+#include "vp_mllm_analyser_node.h"
+
+#ifdef VP_WITH_LLM
+namespace vp_nodes {
+    vp_mllm_analyser_node::vp_mllm_analyser_node(std::string node_name,
+                              std::string model_name,
+                              std::string prompt,
+                              std::string api_base_url,
+                              std::string api_key,
+                              llmlib::LLMBackendType backend_type):
+                              vp_primary_infer_node(node_name, ""),
+                              llm_model_name(model_name),
+                              llm_prompt(prompt) {
+        cli = llmlib::LLMClient(api_base_url, api_key, backend_type);
+        this->initialized();
+    }
+
+    vp_mllm_analyser_node::~vp_mllm_analyser_node() {
+        deinitialized();
+    }
+
+    void vp_mllm_analyser_node::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        auto output = cli.simple_chat(llm_model_name, llm_prompt, {frame_meta->frame}, {});
+        frame_meta->description = output;
+    }
+
+    void vp_mllm_analyser_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_mllm_analyser_node.h
+++ b/nodes/infers/vp_mllm_analyser_node.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#ifdef VP_WITH_LLM
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/cpp_llmlib/llmlib.hpp"
+
+namespace vp_nodes {
+    // image(frame) analyser based on Multimodal Large Language Model
+    class vp_mllm_analyser_node: public vp_primary_infer_node 
+    {
+    private:
+        /* data */
+        llmlib::LLMClient cli;
+        std::string llm_prompt;
+        std::string llm_model_name;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_mllm_analyser_node(std::string node_name,
+                              std::string model_name,
+                              std::string prompt,
+                              std::string api_base_url,
+                              std::string api_key = "",
+                              llmlib::LLMBackendType backend_type = llmlib::LLMBackendType::Ollama);
+        ~vp_mllm_analyser_node();
+    };
+}
+#endif
--- a/nodes/infers/vp_openpose_detector_node.cpp
+++ b/nodes/infers/vp_openpose_detector_node.cpp
@@ -0,0 +1,107 @@
+#include <set>
+
+#include "vp_openpose_detector_node.h"
+
+namespace vp_nodes {
+        
+    vp_openpose_detector_node::vp_openpose_detector_node(std::string node_name, 
+                                                        std::string model_path, 
+                                                        std::string model_config_path, 
+                                                        std::string labels_path, 
+                                                        int input_width, 
+                                                        int input_height, 
+                                                        int batch_size,
+                                                        int class_id_offset,
+                                                        float score_threshold,
+                                                        vp_objects::vp_pose_type type,
+                                                        float scale,
+                                                        cv::Scalar mean,
+                                                        cv::Scalar std,
+                                                        bool swap_rb):
+                                                        vp_primary_infer_node(node_name, 
+                                                                            model_path, 
+                                                                            model_config_path, 
+                                                                            labels_path, 
+                                                                            input_width, 
+                                                                            input_height, 
+                                                                            batch_size, 
+                                                                            class_id_offset, 
+                                                                            scale, mean, 
+                                                                            std, 
+                                                                            swap_rb),
+                                                        score_threshold(score_threshold), type(type) {
+        this->initialized();
+    }
+    
+    vp_openpose_detector_node::~vp_openpose_detector_node() {
+        deinitialized();
+    }
+    
+    void vp_openpose_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        // make sure heads of output are not zero
+        assert(raw_outputs.size() > 0);
+        // as it just one head of output
+        auto& netOutputBlobs = raw_outputs[0];
+        // batch, number of heatmaps, h, w
+        assert(netOutputBlobs.dims == 4);
+
+        auto start1 = std::chrono::system_clock::now();
+
+        int netOutputBlob_dims[3] = {netOutputBlobs.size[1], netOutputBlobs.size[2], netOutputBlobs.size[3]};
+        // scan batch
+        for (int b = 0; b < netOutputBlobs.size[0]; b++) {
+            auto& frame_meta = frame_meta_with_batch[b];
+            //auto h_scale = frame_meta->frame.rows * 1.0 / netOutputBlobs.size[2];
+            //auto w_scale = frame_meta->frame.cols * 1.0 / netOutputBlobs.size[3];
+
+            cv::Mat netOutputBlob = cv::Mat(3, netOutputBlob_dims, CV_32F, const_cast<uchar*>(netOutputBlobs.ptr(b)));
+            
+            int keyPointId = 0;
+            std::vector<std::vector<KeyPoint>> detectedKeypoints;
+            std::vector<KeyPoint> keyPointsList;
+
+            auto start2 = std::chrono::system_clock::now();
+            std::vector<cv::Mat> netOutputParts;
+            splitNetOutputBlobToParts(netOutputBlob, cv::Size(frame_meta->frame.cols, frame_meta->frame.rows), netOutputParts);
+            
+            auto start3 = std::chrono::system_clock::now();
+            for(int i = 0; i < points_map.at(type); ++i) {
+                std::vector<KeyPoint> keyPoints;
+                getKeyPoints(netOutputParts[i], score_threshold, keyPoints);
+
+                for(int i = 0; i< keyPoints.size(); ++i, ++keyPointId) {
+                    keyPoints[i].id = keyPointId;
+                }
+
+                detectedKeypoints.push_back(keyPoints);
+                keyPointsList.insert(keyPointsList.end(), keyPoints.begin(), keyPoints.end());
+            }
+
+            std::vector<std::vector<ValidPair>> validPairs;
+            std::set<int> invalidPairs;
+            getValidPairs(netOutputParts, detectedKeypoints, validPairs, invalidPairs);
+
+            std::vector<std::vector<int>> personwiseKeypoints;
+            getPersonwiseKeypoints(validPairs, invalidPairs, personwiseKeypoints);
+
+            // insert pose targets back into frame meta
+            for(int n = 0; n < personwiseKeypoints.size(); ++n) {
+                std::vector<vp_objects::vp_pose_keypoint> kps;
+                for (int i = 0; i < personwiseKeypoints[n].size(); i++) {
+                    auto index = personwiseKeypoints[n][i];
+                    if (index != -1) {                
+                        auto& p = keyPointsList[index];
+                        kps.push_back(vp_objects::vp_pose_keypoint {i, p.point.x, p.point.y, p.probability});
+                    }
+                    else {
+                        // point not detected
+                        kps.push_back(vp_objects::vp_pose_keypoint {i, -1, -1, 0});
+                    }
+                }
+                
+                auto pose_target = std::make_shared<vp_objects::vp_frame_pose_target>(type, kps);
+                frame_meta->pose_targets.push_back(pose_target);
+            }
+        }
+    }
+}
--- a/nodes/infers/vp_openpose_detector_node.h
+++ b/nodes/infers/vp_openpose_detector_node.h
@@ -0,0 +1,293 @@
+
+#pragma once
+
+#include <set>
+#include <map>
+#include <cmath>
+#include <opencv2/imgproc.hpp>
+
+#include "../vp_primary_infer_node.h"
+#include "../../objects/vp_frame_pose_target.h"
+
+
+namespace vp_nodes {
+    // body keypoints detector using openpose
+    // https://github.com/CMU-Perceptual-Computing-Lab/openpose
+    class vp_openpose_detector_node: public vp_primary_infer_node
+    {
+    private:
+        float score_threshold;
+        // pose type (model type)
+        vp_objects::vp_pose_type type;
+
+        // map indexs for PAFs
+        const std::map<vp_objects::vp_pose_type, std::vector<std::pair<int,int>>> mapIdxes_map = {
+            {vp_objects::vp_pose_type::body_25, {{0,1}, {14,15}, {22,23}, {16,17}, {18,19}, {24,25}, {26,27}, {6,7}, {2,3}, {4,5}, {8,9}, {10,11}, {12,13}, {30,31}, {32,33}, {36,37}, {34,35}, 
+                                                {38,39}, {40,41} ,{42,43}, {44,45}, {46,47}, {48,49}, {50,51}}},
+            {vp_objects::vp_pose_type::coco, {{12,13}, {20,21}, {14,15}, {16,17}, {22,23}, {24,25}, {0,1}, 
+                                            {2,3}, {4,5}, {6,7}, {8,9}, {10,11}, {28,29}, {30,31}, {34,35}, {32,33}, {36,37}, {18,19}, {26,27}}},
+            {vp_objects::vp_pose_type::mpi_15, {{0,1}, {2,3}, {4,5}, {6,7}, {8,9}, {10,11}, {12,13}, {14,15}, {16,17}, {18,19}, {20,21}, {22,23}, {24,25}, {26,27}}},
+            {vp_objects::vp_pose_type::hand, std::vector<std::pair<int,int>>()},
+            {vp_objects::vp_pose_type::face, std::vector<std::pair<int,int>>()}
+        };
+
+        // pose pairs for PAFs
+        const std::map<vp_objects::vp_pose_type, std::vector<std::pair<int,int>>> posePairs_map = {
+            {vp_objects::vp_pose_type::body_25, {{1,8}, {1,2}, {1,5}, {2,3}, {3,4}, {5,6}, {6,7}, {8,9}, 
+                                                {9,10}, {10,11}, {8,12}, {12,13}, {13,14}, {1,0}, {0,15}, {15,17}, {0,16}, {16,18}, 
+                                                {14,19}, {19,20}, {14,21}, {11,22}, {22,23}, {11,24}}},
+            {vp_objects::vp_pose_type::coco, {{1,2}, {1,5}, {2,3}, {3,4}, {5,6}, {6,7},
+                                            {1,8}, {8,9}, {9,10}, {1,11}, {11,12}, {12,13},
+                                            {1,0}, {0,14}, {14,16}, {0,15}, {15,17}, {2,16},
+                                            {5,17}}},
+            {vp_objects::vp_pose_type::mpi_15, {{0,1}, {1,2}, {2,3}, {3,4}, {1,5}, {5,6}, {6,7}, {1,14}, {14,8}, {8,9}, {9,10}, {14,11}, {11,12}, {12,13}, {0, 2}, {0, 5}}},
+            {vp_objects::vp_pose_type::hand, std::vector<std::pair<int,int>>()},
+            {vp_objects::vp_pose_type::face, std::vector<std::pair<int,int>>()}
+        };
+
+        // points count for each type of pose model
+        const std::map<vp_objects::vp_pose_type, int> points_map = {
+            {vp_objects::vp_pose_type::body_25, 25},
+            {vp_objects::vp_pose_type::coco, 18},
+            {vp_objects::vp_pose_type::mpi_15, 15},
+            {vp_objects::vp_pose_type::hand, 0},
+            {vp_objects::vp_pose_type::face, 0}
+        };
+
+        // for postprocess purpose
+        struct ValidPair{
+            ValidPair(int aId,int bId,float score) {
+                this->aId = aId;
+                this->bId = bId;
+                this->score = score;
+            }
+
+            int aId;
+            int bId;
+            float score;
+        };
+        struct KeyPoint{
+            KeyPoint(cv::Point point, float probability) {
+                this->id = -1;
+                this->point = point;
+                this->probability = probability;
+            }
+
+            int id;
+            cv::Point point;
+            float probability;
+        };
+        // for postprocess purpose end
+
+        void getKeyPoints(cv::Mat& probMap, double threshold, std::vector<KeyPoint>& keyPoints) {
+            cv::Mat smoothProbMap;
+            cv::GaussianBlur(probMap, smoothProbMap, cv::Size( 3, 3 ), 0, 0);
+
+            cv::Mat maskedProbMap;
+            cv::threshold(smoothProbMap, maskedProbMap, threshold, 255, cv::THRESH_BINARY);
+
+            maskedProbMap.convertTo(maskedProbMap, CV_8U, 1);
+
+            std::vector<std::vector<cv::Point> > contours;
+            cv::findContours(maskedProbMap, contours, cv::RETR_TREE, cv::CHAIN_APPROX_SIMPLE);
+
+            for(int i = 0; i < contours.size();++i) {
+                cv::Mat blobMask = cv::Mat::zeros(smoothProbMap.rows, smoothProbMap.cols, smoothProbMap.type());
+                cv::fillConvexPoly(blobMask, contours[i], cv::Scalar(1));
+
+                double maxVal;
+                cv::Point maxLoc;
+
+                cv::minMaxLoc(smoothProbMap.mul(blobMask), 0, &maxVal, 0, &maxLoc);
+                keyPoints.push_back(KeyPoint(maxLoc, probMap.at<float>(maxLoc.y, maxLoc.x)));
+            }
+        }
+
+        void splitNetOutputBlobToParts(cv::Mat& netOutputBlob, const cv::Size& targetSize, std::vector<cv::Mat>& netOutputParts) {
+            int nParts = netOutputBlob.size[0];
+            int h = netOutputBlob.size[1];
+            int w = netOutputBlob.size[2];
+
+            for(int i = 0; i< nParts; ++i) {
+                cv::Mat part(h, w, CV_32F, netOutputBlob.ptr(i));
+
+                cv::Mat resizedPart;
+
+                cv::resize(part, resizedPart, targetSize);
+
+                netOutputParts.push_back(resizedPart);
+            }
+        }
+
+        void populateInterpPoints(const cv::Point& a,const cv::Point& b,int numPoints,std::vector<cv::Point>& interpCoords){
+            float xStep = ((float)(b.x - a.x)) / (float)(numPoints - 1);
+            float yStep = ((float)(b.y - a.y)) / (float)(numPoints - 1);
+            interpCoords.push_back(a);
+
+            for(int i = 1; i< numPoints - 1; ++i) {
+                interpCoords.push_back(cv::Point(a.x + xStep * i, a.y + yStep * i));
+            }
+            interpCoords.push_back(b);
+        }
+
+        void getValidPairs(const std::vector<cv::Mat>& netOutputParts,
+                        const std::vector<std::vector<KeyPoint>>& detectedKeypoints,
+                        std::vector<std::vector<ValidPair>>& validPairs,
+                        std::set<int>& invalidPairs) {
+
+            int nInterpSamples = 10;
+            float confTh = 0.7;
+
+            for(int k = 0; k < mapIdxes_map.at(type).size(); ++k) {
+                //A->B constitute a limb
+                cv::Mat pafA = netOutputParts[mapIdxes_map.at(type)[k].first + points_map.at(type) + 1];
+                cv::Mat pafB = netOutputParts[mapIdxes_map.at(type)[k].second + points_map.at(type) + 1];
+
+                //Find the keypoints for the first and second limb
+                const std::vector<KeyPoint>& candA = detectedKeypoints[posePairs_map.at(type)[k].first];
+                const std::vector<KeyPoint>& candB = detectedKeypoints[posePairs_map.at(type)[k].second];
+
+                int nA = candA.size();
+                int nB = candB.size();
+
+                /*
+                # If keypoints for the joint-pair is detected
+                # check every joint in candA with every joint in candB
+                # Calculate the distance vector between the two joints
+                # Find the PAF values at a set of interpolated points between the joints
+                # Use the above formula to compute a score to mark the connection valid
+                */
+
+                if(nA != 0 && nB != 0){
+                    std::vector<ValidPair> localValidPairs;
+
+                    for(int i = 0; i< nA; ++i) {
+                        int maxJ = -1;
+                        float maxScore = -1;
+                        bool found = false;
+
+                        for(int j = 0; j < nB; ++j) {
+                            std::pair<float, float> distance(candB[j].point.x - candA[i].point.x, candB[j].point.y - candA[i].point.y);
+
+                            float norm = std::sqrt(distance.first * distance.first + distance.second * distance.second);
+
+                            if(!norm) {
+                                continue;
+                            }
+
+                            distance.first /= norm;
+                            distance.second /= norm;
+
+                            //Find p(u)
+                            std::vector<cv::Point> interpCoords;
+                            populateInterpPoints(candA[i].point, candB[j].point, nInterpSamples, interpCoords);
+                            //Find L(p(u))
+                            std::vector<std::pair<float, float>> pafInterp;
+                            for(int l = 0; l < interpCoords.size();++l) {
+                                pafInterp.push_back(
+                                    std::pair<float,float>(
+                                        pafA.at<float>(interpCoords[l].y, interpCoords[l].x),
+                                        pafB.at<float>(interpCoords[l].y, interpCoords[l].x)
+                                    ));
+                            }
+
+                            std::vector<float> pafScores;
+                            float sumOfPafScores = 0;
+                            int numOverTh = 0;
+                            for(int l = 0; l< pafInterp.size(); ++l){
+                                float score = pafInterp[l].first * distance.first + pafInterp[l].second * distance.second;
+                                sumOfPafScores += score;
+                                if(score > score_threshold){
+                                    ++numOverTh;
+                                }
+
+                                pafScores.push_back(score);
+                            }
+
+                            float avgPafScore = sumOfPafScores / ((float)pafInterp.size());
+
+                            if(((float)numOverTh)/((float)nInterpSamples) > confTh){
+                                if(avgPafScore > maxScore) {
+                                    maxJ = j;
+                                    maxScore = avgPafScore;
+                                    found = true;
+                                }
+                            }
+                        }/* j */
+
+                        if(found) {
+                            localValidPairs.push_back(ValidPair(candA[i].id, candB[maxJ].id, maxScore));
+                        }
+
+                    }/* i */
+
+                    validPairs.push_back(localValidPairs);
+
+                } else {
+                    invalidPairs.insert(k);
+                    validPairs.push_back(std::vector<ValidPair>());
+                }
+            }/* k */
+        }
+
+        void getPersonwiseKeypoints(const std::vector<std::vector<ValidPair>>& validPairs,
+                                    const std::set<int>& invalidPairs,
+                                    std::vector<std::vector<int>>& personwiseKeypoints) {
+            for(int k = 0; k < mapIdxes_map.at(type).size(); ++k) {
+                if(invalidPairs.find(k) != invalidPairs.end()) {
+                    continue;
+                }
+
+                const std::vector<ValidPair>& localValidPairs(validPairs[k]);
+
+                int indexA(posePairs_map.at(type)[k].first);
+                int indexB(posePairs_map.at(type)[k].second);
+
+                for(int i = 0; i< localValidPairs.size(); ++i) {
+                    bool found = false;
+                    int personIdx = -1;
+
+                    for(int j = 0; !found && j < personwiseKeypoints.size();++j) {
+                        if(indexA < personwiseKeypoints[j].size() &&
+                            personwiseKeypoints[j][indexA] == localValidPairs[i].aId) {
+                            personIdx = j;
+                            found = true;
+                        }
+                    }/* j */
+
+                    if(found) {
+                        personwiseKeypoints[personIdx].at(indexB) = localValidPairs[i].bId;
+                    } 
+                    else if(k < points_map.at(type) - 1) {
+                        std::vector<int> lpkp(std::vector<int>(points_map.at(type), -1));
+
+                        lpkp.at(indexA) = localValidPairs[i].aId;
+                        lpkp.at(indexB) = localValidPairs[i].bId;
+
+                        personwiseKeypoints.push_back(lpkp);
+                    }
+
+                }/* i */
+            }/* k */
+        }
+
+    protected:
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_openpose_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 368, 
+                            int input_height = 368, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float score_threshold = 0.1,
+                            vp_objects::vp_pose_type type = vp_objects::vp_pose_type::coco,
+                            float scale = 1 / 255.0,
+                            cv::Scalar mean = cv::Scalar(0),
+                            cv::Scalar std = cv::Scalar(1),
+                            bool swap_rb = false);
+        ~vp_openpose_detector_node();
+    };
+}
--- a/nodes/infers/vp_ppocr_text_detector_node.cpp
+++ b/nodes/infers/vp_ppocr_text_detector_node.cpp
@@ -0,0 +1,66 @@
+
+#ifdef VP_WITH_PADDLE
+#include "vp_ppocr_text_detector_node.h"
+#include "../../objects/vp_frame_text_target.h"
+
+namespace vp_nodes {
+        
+    vp_ppocr_text_detector_node::vp_ppocr_text_detector_node(std::string node_name, 
+                                                            std::string det_model_dir, 
+                                                            std::string cls_model_dir, 
+                                                            std::string rec_model_dir, 
+                                                            std::string rec_char_dict_path):
+                                                            vp_primary_infer_node(node_name, "") {
+        // to make the code simpler, paddle_ocr has no more config other than model path
+        // we need modify source code at ../../third_party/paddle_ocr/ if we need tune the parameters 
+        ocr = std::make_shared<PaddleOCR::PPOCR>(det_model_dir, cls_model_dir, rec_model_dir, rec_char_dict_path);
+        this->initialized();
+    }
+    
+    vp_ppocr_text_detector_node::~vp_ppocr_text_detector_node() {
+        deinitialized();
+    }
+    
+    void vp_ppocr_text_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+    
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_ppocr_text_detector_node::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        // call paddle ocr
+        auto ocr_results = ocr->ocr(mats_to_infer);
+        assert(ocr_results.size() == 1);
+
+        auto& ocr_result = ocr_results[0];
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        // scan text detected in frame
+        for (int i = 0; i < ocr_result.size(); i++) {
+            /* code */
+            auto& text = ocr_result[i];
+            std::vector<std::pair<int, int>> region_vertexes = {{text.box[0][0], text.box[0][1]}, 
+                                                                {text.box[1][0], text.box[1][1]}, 
+                                                                {text.box[2][0], text.box[2][1]}, 
+                                                                {text.box[3][0], text.box[3][1]}};
+            // create text target and update back into frame meta
+            auto text_target = std::make_shared<vp_objects::vp_frame_text_target>(region_vertexes, text.text, text.score);
+            frame_meta->text_targets.push_back(text_target);
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+}
+#endif
--- a/nodes/infers/vp_ppocr_text_detector_node.h
+++ b/nodes/infers/vp_ppocr_text_detector_node.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#ifdef VP_WITH_PADDLE
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/paddle_ocr/include/paddleocr.h"
+
+namespace vp_nodes {
+    // ocr based on paddle ocr
+    // paddle ocr project(official): https://github.com/PaddlePaddle/PaddleOCR
+    // source code(modified based on official): ../../third_party/paddle_ocr
+    // note:
+    // this class is not based on opencv::dnn module but paddle, a few data members declared in base class are not usable any more(just ignore), such as vp_infer_node::net.
+    class vp_ppocr_text_detector_node: public vp_primary_infer_node
+    {
+    private:
+        // paddle ocr instance
+        std::shared_ptr<PaddleOCR::PPOCR> ocr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_ppocr_text_detector_node(std::string node_name, 
+                                    std::string det_model_dir = "", 
+                                    std::string cls_model_dir = "", 
+                                    std::string rec_model_dir = "", 
+                                    std::string rec_char_dict_path = "");
+        ~vp_ppocr_text_detector_node();
+    };
+}
+#endif
--- a/nodes/infers/vp_restoration_node.cpp
+++ b/nodes/infers/vp_restoration_node.cpp
@@ -0,0 +1,52 @@
+
+#include "vp_restoration_node.h"
+
+namespace vp_nodes {
+    
+    vp_restoration_node::vp_restoration_node(std::string node_name, 
+                            std::string realesrgan_bg_restoration_model,
+                            std::string face_restoration_model,
+                            bool restoration_to_osd):
+                            vp_primary_infer_node(node_name, ""),
+                            restoration_to_osd(restoration_to_osd) {        
+        /* init net*/
+        restoration_net = cv::dnn::readNetFromONNX(realesrgan_bg_restoration_model);
+        /* to-do, load face restoration model*/
+        #ifdef VP_WITH_CUDA
+        //restoration_net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+        //restoration_net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+        #endif
+        this->initialized();
+    }
+    
+    vp_restoration_node::~vp_restoration_node() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_restoration_node::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        // 4x larger for RealESRGAN_x4plus model
+        cv::Mat target_blob = cv::dnn::blobFromImage(frame_meta->frame, 1 / 255.0, cv::Size(frame_meta->frame.cols, frame_meta->frame.rows), (0, 0, 0), true);
+        std::vector<cv::Mat> target_outputs;
+        restoration_net.setInput(target_blob);
+        restoration_net.forward(target_outputs, restoration_net.getUnconnectedOutLayersNames());
+
+        // parse to image
+        auto& output = target_outputs[0];
+        cv::Mat output_channel_last;
+        cv::transposeND(output, {0,2,3,1}, output_channel_last);
+        cv::Mat img_result(output_channel_last.size[1], output_channel_last.size[2], CV_32FC3, output_channel_last.data);
+        img_result.convertTo(img_result, CV_8U, 255);
+        
+        // update back to frame meta
+        auto& bg = restoration_to_osd ? frame_meta->osd_frame : frame_meta->frame;
+        cv::cvtColor(img_result, bg, cv::COLOR_RGB2BGR);
+    }
+
+    void vp_restoration_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
--- a/nodes/infers/vp_restoration_node.h
+++ b/nodes/infers/vp_restoration_node.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+
+namespace vp_nodes {
+    // general image restoration node using Real-ESRGAN
+    // used to enhance quality of frames in video， see more: https://github.com/xinntao/Real-ESRGAN
+    class vp_restoration_node: public vp_primary_infer_node
+    {   
+    private:
+        /* onnx network using opencv::dnn as backend */
+        cv::dnn::Net restoration_net;
+        bool restoration_to_osd = true;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_restoration_node(std::string node_name, 
+                            std::string realesrgan_bg_restoration_model,
+                            std::string face_restoration_model = "",
+                            bool restoration_to_osd = true);
+        ~vp_restoration_node();
+    };
+}
--- a/nodes/infers/vp_sface_feature_encoder_node.cpp
+++ b/nodes/infers/vp_sface_feature_encoder_node.cpp
@@ -0,0 +1,183 @@
+
+#include <algorithm>
+#include <opencv2/imgproc.hpp>
+#include "vp_sface_feature_encoder_node.h"
+
+namespace vp_nodes {
+        
+    vp_sface_feature_encoder_node::vp_sface_feature_encoder_node(std::string node_name, std::string model_path):
+                                                                vp_secondary_infer_node(node_name, model_path, 
+                                                                "", "", 
+                                                                112, 112, 
+                                                                1, std::vector<int>(), 0, 0, 
+                                                                0, 1, cv::Scalar()) {
+        this->initialized();
+    }
+    
+    vp_sface_feature_encoder_node::~vp_sface_feature_encoder_node() {
+        deinitialized();
+    }
+
+    void vp_sface_feature_encoder_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        // make sure heads of output are not zero
+        assert(raw_outputs.size() > 0);
+        assert(frame_meta_with_batch.size() == 1);
+
+        // just one head of output
+        auto& output = raw_outputs[0];
+        assert(output.dims == 2);
+
+        auto count = output.rows;
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        // update feature data back into frame meta
+        for (int i = 0; i < count; i++) {
+            cv::Mat feature = output.row(i);
+
+            for (int j = 0; j < feature.cols; j++) {
+                frame_meta->face_targets[i]->embeddings.push_back(feature.at<float>(0, j));
+            }
+        }  
+    }
+
+    // refer to vp_secondary_infer_node::prepare
+    void vp_sface_feature_encoder_node::prepare(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch, std::vector<cv::Mat>& mats_to_infer) {
+        // only one by one for secondary infer node
+        assert(frame_meta_with_batch.size() == 1);
+
+        // align face and crop 
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        // batch by batch inside single frame
+        for (auto& i : frame_meta->face_targets) {
+            // align and crop
+            float face_keypoints[5][2] = 
+                {{i->key_points[0].first, i->key_points[0].second}, 
+                 {i->key_points[1].first, i->key_points[1].second}, 
+                 {i->key_points[2].first, i->key_points[2].second}, 
+                 {i->key_points[3].first, i->key_points[3].second}, 
+                 {i->key_points[4].first, i->key_points[4].second}};
+            cv::Mat aligned_face;
+            alignCrop(frame_meta->frame, face_keypoints, aligned_face);
+            mats_to_infer.push_back(aligned_face); 
+        }
+    }
+
+    cv::Mat vp_sface_feature_encoder_node::getSimilarityTransformMatrix(float src[5][2]) {
+        using namespace cv;
+        float dst[5][2] = { {38.2946f, 51.6963f}, {73.5318f, 51.5014f}, {56.0252f, 71.7366f}, {41.5493f, 92.3655f}, {70.7299f, 92.2041f} };
+        float avg0 = (src[0][0] + src[1][0] + src[2][0] + src[3][0] + src[4][0]) / 5;
+        float avg1 = (src[0][1] + src[1][1] + src[2][1] + src[3][1] + src[4][1]) / 5;
+        //Compute mean of src and dst.
+        float src_mean[2] = { avg0, avg1 };
+        float dst_mean[2] = { 56.0262f, 71.9008f };
+        //Subtract mean from src and dst.
+        float src_demean[5][2];
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 5; j++)
+            {
+                src_demean[j][i] = src[j][i] - src_mean[i];
+            }
+        }
+        float dst_demean[5][2];
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 5; j++)
+            {
+                dst_demean[j][i] = dst[j][i] - dst_mean[i];
+            }
+        }
+        double A00 = 0.0, A01 = 0.0, A10 = 0.0, A11 = 0.0;
+        for (int i = 0; i < 5; i++)
+            A00 += dst_demean[i][0] * src_demean[i][0];
+        A00 = A00 / 5;
+        for (int i = 0; i < 5; i++)
+            A01 += dst_demean[i][0] * src_demean[i][1];
+        A01 = A01 / 5;
+        for (int i = 0; i < 5; i++)
+            A10 += dst_demean[i][1] * src_demean[i][0];
+        A10 = A10 / 5;
+        for (int i = 0; i < 5; i++)
+            A11 += dst_demean[i][1] * src_demean[i][1];
+        A11 = A11 / 5;
+        Mat A = (Mat_<double>(2, 2) << A00, A01, A10, A11);
+        double d[2] = { 1.0, 1.0 };
+        double detA = A00 * A11 - A01 * A10;
+        if (detA < 0)
+            d[1] = -1;
+        double T[3][3] = { {1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0} };
+        Mat s, u, vt, v;
+        SVD::compute(A, s, u, vt);
+        double smax = s.ptr<double>(0)[0]>s.ptr<double>(1)[0] ? s.ptr<double>(0)[0] : s.ptr<double>(1)[0];
+        double tol = smax * 2 * FLT_MIN;
+        int rank = 0;
+        if (s.ptr<double>(0)[0]>tol)
+            rank += 1;
+        if (s.ptr<double>(1)[0]>tol)
+            rank += 1;
+        double arr_u[2][2] = { {u.ptr<double>(0)[0], u.ptr<double>(0)[1]}, {u.ptr<double>(1)[0], u.ptr<double>(1)[1]} };
+        double arr_vt[2][2] = { {vt.ptr<double>(0)[0], vt.ptr<double>(0)[1]}, {vt.ptr<double>(1)[0], vt.ptr<double>(1)[1]} };
+        double det_u = arr_u[0][0] * arr_u[1][1] - arr_u[0][1] * arr_u[1][0];
+        double det_vt = arr_vt[0][0] * arr_vt[1][1] - arr_vt[0][1] * arr_vt[1][0];
+        if (rank == 1)
+        {
+            if ((det_u*det_vt) > 0)
+            {
+                Mat uvt = u*vt;
+                T[0][0] = uvt.ptr<double>(0)[0];
+                T[0][1] = uvt.ptr<double>(0)[1];
+                T[1][0] = uvt.ptr<double>(1)[0];
+                T[1][1] = uvt.ptr<double>(1)[1];
+            }
+            else
+            {
+                double temp = d[1];
+                d[1] = -1;
+                Mat D = (Mat_<double>(2, 2) << d[0], 0.0, 0.0, d[1]);
+                Mat Dvt = D*vt;
+                Mat uDvt = u*Dvt;
+                T[0][0] = uDvt.ptr<double>(0)[0];
+                T[0][1] = uDvt.ptr<double>(0)[1];
+                T[1][0] = uDvt.ptr<double>(1)[0];
+                T[1][1] = uDvt.ptr<double>(1)[1];
+                d[1] = temp;
+            }
+        }
+        else
+        {
+            Mat D = (Mat_<double>(2, 2) << d[0], 0.0, 0.0, d[1]);
+            Mat Dvt = D*vt;
+            Mat uDvt = u*Dvt;
+            T[0][0] = uDvt.ptr<double>(0)[0];
+            T[0][1] = uDvt.ptr<double>(0)[1];
+            T[1][0] = uDvt.ptr<double>(1)[0];
+            T[1][1] = uDvt.ptr<double>(1)[1];
+        }
+        double var1 = 0.0;
+        for (int i = 0; i < 5; i++)
+            var1 += src_demean[i][0] * src_demean[i][0];
+        var1 = var1 / 5;
+        double var2 = 0.0;
+        for (int i = 0; i < 5; i++)
+            var2 += src_demean[i][1] * src_demean[i][1];
+        var2 = var2 / 5;
+        double scale = 1.0 / (var1 + var2)* (s.ptr<double>(0)[0] * d[0] + s.ptr<double>(1)[0] * d[1]);
+        double TS[2];
+        TS[0] = T[0][0] * src_mean[0] + T[0][1] * src_mean[1];
+        TS[1] = T[1][0] * src_mean[0] + T[1][1] * src_mean[1];
+        T[0][2] = dst_mean[0] - scale*TS[0];
+        T[1][2] = dst_mean[1] - scale*TS[1];
+        T[0][0] *= scale;
+        T[0][1] *= scale;
+        T[1][0] *= scale;
+        T[1][1] *= scale;
+        Mat transform_mat = (Mat_<double>(2, 3) << T[0][0], T[0][1], T[0][2], T[1][0], T[1][1], T[1][2]);
+        return transform_mat;
+    }
+
+    void vp_sface_feature_encoder_node::alignCrop(cv::Mat& _src_img, float _src_point[5][2], cv::Mat& _aligned_img) {
+        cv::Mat warp_mat = getSimilarityTransformMatrix(_src_point);
+        cv::warpAffine(_src_img, _aligned_img, warp_mat, cv::Size(input_width, input_height), cv::INTER_LINEAR);
+    }
+}
--- a/nodes/infers/vp_sface_feature_encoder_node.h
+++ b/nodes/infers/vp_sface_feature_encoder_node.h
@@ -0,0 +1,27 @@
+
+#pragma once
+
+#include "../vp_secondary_infer_node.h"
+
+namespace vp_nodes {
+    // face feature encoder based on SFace, update embeddings of vp_frame_face_target
+    // https://github.com/opencv/opencv/blob/4.x/modules/objdetect/src/face_recognize.cpp
+    // https://github.com/zhongyy/SFace
+    class vp_sface_feature_encoder_node: public vp_secondary_infer_node
+    {
+    private:
+        // get transform matrix for aglin face
+        cv::Mat getSimilarityTransformMatrix(float src[5][2]);
+        // align and crop
+        void alignCrop(cv::Mat& _src_img, float _src_point[5][2], cv::Mat& _aligned_img);
+    protected:
+        // override prepare as sface has an additional logic for face align before preprocess
+        virtual void prepare(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch, std::vector<cv::Mat>& mats_to_infer) override;
+
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_sface_feature_encoder_node(std::string node_name, std::string model_path);
+        ~vp_sface_feature_encoder_node();
+    };
+
+ }
--- a/nodes/infers/vp_trt_vehicle_color_classifier.cpp
+++ b/nodes/infers/vp_trt_vehicle_color_classifier.cpp
@@ -0,0 +1,64 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_color_classifier.h"
+
+namespace vp_nodes {
+    vp_trt_vehicle_color_classifier::vp_trt_vehicle_color_classifier(std::string node_name, 
+                                                                    std::string vehicle_color_cls_model_path, 
+                                                                    std::vector<int> p_class_ids_applied_to,
+                                                                    int min_width_applied_to, int min_height_applied_to):
+                                                                    vp_secondary_infer_node(node_name, "", "", "", 1, 1, 1, p_class_ids_applied_to, min_width_applied_to, min_height_applied_to) {
+        vehicle_color_classifier = std::make_shared<trt_vehicle::VehicleColorClassifier>(vehicle_color_cls_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_color_classifier::~vp_trt_vehicle_color_classifier() {
+        deinitialized();
+    }
+
+    void vp_trt_vehicle_color_classifier::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_secondary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // infer using trt_vehicle library
+        start_time = std::chrono::system_clock::now();
+        std::vector<trt_vehicle::ObjCls> vehicles_color;
+        vehicle_color_classifier->classify(mats_to_infer, vehicles_color);
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto index = 0;
+        for (int i = 0; i < vehicles_color.size(); i++) {
+            for (int j = index; j < frame_meta->targets.size(); j++) {
+                // need apply or not?
+                if (!need_apply(frame_meta->targets[j]->primary_class_id, frame_meta->targets[j]->width, frame_meta->targets[j]->height)) {
+                    // continue as its primary_class_id is not in p_class_ids_applied_to
+                    continue;
+                }
+
+                // update back to frame meta
+                frame_meta->targets[j]->secondary_class_ids.push_back(vehicles_color[i].class_);
+                frame_meta->targets[j]->secondary_scores.push_back(vehicles_color[i].score);
+                frame_meta->targets[j]->secondary_labels.push_back(vehicles_color[i].label);
+
+                // break as we found the right target!
+                index = j + 1;
+                break;
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_vehicle_color_classifier::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_color_classifier.h
+++ b/nodes/infers/vp_trt_vehicle_color_classifier.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_secondary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_color_classifier.h"
+
+namespace vp_nodes {
+    // vehicle color classifier based on tensorrt using trt_vehicle library
+    // update secondary_class_ids/secondary_labels/secondary_scores of vp_frame_target.
+    class vp_trt_vehicle_color_classifier: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_vehicle::VehicleColorClassifier> vehicle_color_classifier = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_color_classifier(std::string node_name, std::string vehicle_color_cls_model_path = "", std::vector<int> p_class_ids_applied_to = std::vector<int>(), int min_width_applied_to = 0, int min_height_applied_to = 0);
+        ~vp_trt_vehicle_color_classifier();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_detector.cpp
+++ b/nodes/infers/vp_trt_vehicle_detector.cpp
@@ -0,0 +1,64 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_detector.h"
+
+namespace vp_nodes {
+    
+    vp_trt_vehicle_detector::vp_trt_vehicle_detector(std::string node_name, 
+                                                    std::string vehicle_det_model_path):
+                                                    vp_primary_infer_node(node_name, "") {
+        vehicle_detector = std::make_shared<trt_vehicle::VehicleDetector>(vehicle_det_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_detector::~vp_trt_vehicle_detector() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_vehicle_detector::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<trt_vehicle::ObjBox>> vehicles;
+        vehicle_detector->detect(mats_to_infer, vehicles);
+
+        assert(vehicles.size() == 1);
+        auto& vehicle_list = vehicles[0];
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        for (int i = 0; i < vehicle_list.size(); i++) {
+            auto& objbox = vehicle_list[i];
+
+            // check value range
+            objbox.x = std::max(objbox.x, 0);
+            objbox.y = std::max(objbox.y, 0);
+            objbox.width = std::min(objbox.width, frame_meta->frame.cols - objbox.x);
+            objbox.height = std::min(objbox.height, frame_meta->frame.rows - objbox.y);
+            if (objbox.width <= 0 || objbox.height <= 0) {
+                continue;
+            }
+            
+            auto target = std::make_shared<vp_objects::vp_frame_target>(objbox.x, objbox.y, objbox.width, objbox.height, 
+                                                                        objbox.class_, objbox.score, frame_meta->frame_index, frame_meta->channel_index, objbox.label);
+            // create target and update back into frame meta
+            frame_meta->targets.push_back(target);
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_vehicle_detector::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_detector.h
+++ b/nodes/infers/vp_trt_vehicle_detector.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_detector.h"
+
+namespace vp_nodes {
+    // vehicle detector based on tensorrt using trt_vehicle library
+    class vp_trt_vehicle_detector: public vp_primary_infer_node
+    {
+    private:
+        std::shared_ptr<trt_vehicle::VehicleDetector> vehicle_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_detector(std::string node_name, std::string vehicle_det_model_path = "");
+        ~vp_trt_vehicle_detector();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_feature_encoder.cpp
+++ b/nodes/infers/vp_trt_vehicle_feature_encoder.cpp
@@ -0,0 +1,65 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_feature_encoder.h"
+
+namespace vp_nodes {
+    vp_trt_vehicle_feature_encoder::vp_trt_vehicle_feature_encoder(std::string node_name, 
+                                                                    std::string vehicle_feature_model_path, 
+                                                                    std::vector<int> p_class_ids_applied_to,
+                                                                    int min_width_applied_to, int min_height_applied_to):
+                                                                    vp_secondary_infer_node(node_name, "", "", "", 1, 1, 1, p_class_ids_applied_to, min_width_applied_to, min_height_applied_to) {
+        vehicle_feature_encoder = std::make_shared<trt_vehicle::VehicleFeatureEncoder>(vehicle_feature_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_feature_encoder::~vp_trt_vehicle_feature_encoder() {
+        deinitialized();
+    }
+
+    void vp_trt_vehicle_feature_encoder::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_secondary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // infer using trt_vehicle library
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<float>> vehicles_feature;
+        vehicle_feature_encoder->encode(mats_to_infer, vehicles_feature);
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto index = 0;
+        for (int i = 0; i < vehicles_feature.size(); i++) {
+            for (int j = index; j < frame_meta->targets.size(); j++) {
+                // need apply or not?
+                if (!need_apply(frame_meta->targets[j]->primary_class_id, frame_meta->targets[j]->width, frame_meta->targets[j]->height)) {
+                    // continue as its primary_class_id is not in p_class_ids_applied_to
+                    continue;
+                }
+
+                // update back to frame meta
+                auto& features = vehicles_feature[i];
+                for(int k = 0; k < features.size(); k++) {
+                    frame_meta->targets[j]->embeddings.push_back(features[k]);
+                }
+
+                // break as we found the right target!
+                index = j + 1;
+                break;
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_vehicle_feature_encoder::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_feature_encoder.h
+++ b/nodes/infers/vp_trt_vehicle_feature_encoder.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_secondary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_feature_encoder.h"
+
+namespace vp_nodes {
+    // vehicle feature encoder based on tensorrt using trt_vehicle library
+    // update embeddings of vp_frame_target
+    class vp_trt_vehicle_feature_encoder: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_vehicle::VehicleFeatureEncoder> vehicle_feature_encoder = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_feature_encoder(std::string node_name, std::string vehicle_feature_model_path = "", std::vector<int> p_class_ids_applied_to = std::vector<int>(), int min_width_applied_to = 0, int min_height_applied_to = 0);
+        ~vp_trt_vehicle_feature_encoder();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_plate_detector.cpp
+++ b/nodes/infers/vp_trt_vehicle_plate_detector.cpp
@@ -0,0 +1,71 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_plate_detector.h"
+
+namespace vp_nodes {
+    
+    vp_trt_vehicle_plate_detector::vp_trt_vehicle_plate_detector(std::string node_name, 
+                                                                std::string plate_det_model_path, 
+                                                                std::string char_rec_model_path):
+                                                                vp_secondary_infer_node(node_name, "") {
+        plate_detector = std::make_shared<trt_vehicle::VehiclePlateDetector>(plate_det_model_path, char_rec_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_plate_detector::~vp_trt_vehicle_plate_detector() {
+        deinitialized();
+    }
+
+    void vp_trt_vehicle_plate_detector::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_vehicle_plate_detector::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_secondary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        // here we detect vehicle plate for all target(like car/bus) in frame
+        std::vector<std::vector<trt_vehicle::Plate>> plates;
+        plate_detector->detect(mats_to_infer, plates, true);  // detect at most 1 plate for each image
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        for (int i = 0; i < plates.size(); i++) {
+            for (int j = 0; j < plates[i].size(); j++) {
+                auto& plate = plates[i][j];
+
+                // only plate detection but no recognition result
+                if (plate.text.empty()) {
+                    continue;
+                }
+                
+                // check value range
+                auto x = std::max(0, plate.x + frame_meta->targets[i]->x - 10);  // offset
+                auto y = std::max(0, plate.y + frame_meta->targets[i]->y - 8);   // offset
+                auto w = std::min(plate.width, frame_meta->frame.cols - x);
+                auto h = std::min(plate.height, frame_meta->frame.rows - y);
+                if (w <= 0 || h <=0) {
+                    continue;
+                }
+                
+                // create sub target and update back into frame meta
+                // we treat vehicle plate as sub target of those in vp_frame_meta.targets
+                auto sub_target = std::make_shared<vp_objects::vp_sub_target>(x, y, w, h, 
+                                                    -1, 0, plate.color + "_" + plate.text, frame_meta->frame_index, frame_meta->channel_index);
+                frame_meta->targets[i]->sub_targets.push_back(sub_target);
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_plate_detector.h
+++ b/nodes/infers/vp_trt_vehicle_plate_detector.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_secondary_infer_node.h"
+#include "../../objects/vp_sub_target.h"
+#include "../../third_party/trt_vehicle/models/vehicle_plate_detector.h"
+
+namespace vp_nodes {
+    // vehicle plate detector based on tensorrt using trt_vehicle library
+    // source code: ../../third_party/trt_vehicle
+    // note: derived from vp_secondary_infer_node since it detects plates on small cropped images, which is different from vp_trt_vehicle_plate_detector_v2 class
+    // this class is not based on opencv::dnn module but tensorrt, a few data members declared in base class are not usable any more(just ignore), such as vp_infer_node::net.
+    class vp_trt_vehicle_plate_detector: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_vehicle::VehiclePlateDetector> plate_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_plate_detector(std::string node_name, std::string plate_det_model_path = "", std::string char_rec_model_path = "");
+        ~vp_trt_vehicle_plate_detector();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_plate_detector_v2.cpp
+++ b/nodes/infers/vp_trt_vehicle_plate_detector_v2.cpp
@@ -0,0 +1,75 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_plate_detector_v2.h"
+
+namespace vp_nodes {
+    
+    vp_trt_vehicle_plate_detector_v2::vp_trt_vehicle_plate_detector_v2(std::string node_name, 
+                                                                std::string plate_det_model_path, 
+                                                                std::string char_rec_model_path):
+                                                                vp_primary_infer_node(node_name, "") {
+        plate_detector = std::make_shared<trt_vehicle::VehiclePlateDetector>(plate_det_model_path, char_rec_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_plate_detector_v2::~vp_trt_vehicle_plate_detector_v2() {
+        deinitialized();
+    }
+
+    void vp_trt_vehicle_plate_detector_v2::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_vehicle_plate_detector_v2::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        // here we detect vehicle plate on the whole big frame
+        std::vector<std::vector<trt_vehicle::Plate>> plates;
+        plate_detector->detect(mats_to_infer, plates);
+
+        for (int i = 0; i < plates.size(); i++) {
+            auto& frame_meta = frame_meta_with_batch[i];
+            for (int j = 0; j < plates[i].size(); j++) {
+                auto& plate = plates[i][j];
+
+                // only plate detection but no recognition result
+                if (plate.text.empty()) {
+                    continue;
+                }
+                
+                // check value range
+                auto x = std::max(0, plate.x);
+                auto y = std::max(0, plate.y);
+                auto w = std::min(plate.width, frame_meta->frame.cols - x);
+                auto h = std::min(plate.height, frame_meta->frame.rows - y);
+                if (w <= 0 || h <=0) {
+                    continue;
+                }
+                
+                // create target and update back into frame meta
+                /* 
+                  we treat vehicle plate as target in vp_frame_meta.targets,
+                  1. x,y,width,height in vp_frame_target stand for location of plate in frame
+                  2. primary_label in vp_frame_target stands for plate color and plate text, which combined with '_'
+                */ 
+                auto target = std::make_shared<vp_objects::vp_frame_target>(x, y, w, h, 
+                                                    -1, 0, frame_meta->frame_index, frame_meta->channel_index, plate.color + "_" + plate.text);
+                frame_meta->targets.push_back(target);
+                VP_INFO("plate-color:" + plate.color + "    plate-text:" + plate.text);
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_plate_detector_v2.h
+++ b/nodes/infers/vp_trt_vehicle_plate_detector_v2.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_plate_detector.h"
+
+namespace vp_nodes {
+    // vehicle plate detector based on tensorrt using trt_vehicle library
+    // source code: ../../third_party/trt_vehicle
+    // note: derived from vp_primary_infer_node since it detects plates on the whole big frame, which is different from vp_trt_vehicle_plate_detector class
+    // this class is not based on opencv::dnn module but tensorrt, a few data members declared in base class are not usable any more(just ignore), such as vp_infer_node::net.
+    class vp_trt_vehicle_plate_detector_v2: public vp_primary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_vehicle::VehiclePlateDetector> plate_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_plate_detector_v2(std::string node_name, std::string plate_det_model_path = "", std::string char_rec_model_path = "");
+        ~vp_trt_vehicle_plate_detector_v2();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_scanner.cpp
+++ b/nodes/infers/vp_trt_vehicle_scanner.cpp
@@ -0,0 +1,64 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_scanner.h"
+
+namespace vp_nodes {
+    
+    vp_trt_vehicle_scanner::vp_trt_vehicle_scanner(std::string node_name, 
+                                                    std::string vehicle_scan_model_path):
+                                                    vp_primary_infer_node(node_name, "") {
+        vehicle_scanner = std::make_shared<trt_vehicle::VehicleScanner>(vehicle_scan_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_scanner::~vp_trt_vehicle_scanner() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_vehicle_scanner::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<trt_vehicle::ObjBox>> vehicles;
+        vehicle_scanner->detect(mats_to_infer, vehicles);
+
+        assert(vehicles.size() == 1);
+        auto& vehicle_list = vehicles[0];
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        for (int i = 0; i < vehicle_list.size(); i++) {
+            auto& objbox = vehicle_list[i];
+
+            // check value range
+            objbox.x = std::max(objbox.x, 0);
+            objbox.y = std::max(objbox.y, 0);
+            objbox.width = std::min(objbox.width, frame_meta->frame.cols - objbox.x);
+            objbox.height = std::min(objbox.height, frame_meta->frame.rows - objbox.y);
+            if (objbox.width <= 0 || objbox.height <= 0) {
+                continue;
+            }
+            
+            auto target = std::make_shared<vp_objects::vp_frame_target>(objbox.x, objbox.y, objbox.width, objbox.height, 
+                                                                        objbox.class_, objbox.score, frame_meta->frame_index, frame_meta->channel_index, objbox.label);
+            // create target and update back into frame meta
+            frame_meta->targets.push_back(target);
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_vehicle_scanner::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_scanner.h
+++ b/nodes/infers/vp_trt_vehicle_scanner.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_scanner.h"
+
+namespace vp_nodes {
+    // vehicle scanner based on tensorrt using trt_vehicle library
+    class vp_trt_vehicle_scanner: public vp_primary_infer_node
+    {
+    private:
+        std::shared_ptr<trt_vehicle::VehicleScanner> vehicle_scanner = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_scanner(std::string node_name, std::string vehicle_scan_model_path = "");
+        ~vp_trt_vehicle_scanner();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_type_classifier.cpp
+++ b/nodes/infers/vp_trt_vehicle_type_classifier.cpp
@@ -0,0 +1,64 @@
+#ifdef VP_WITH_TRT
+#include "vp_trt_vehicle_type_classifier.h"
+
+namespace vp_nodes {
+    vp_trt_vehicle_type_classifier::vp_trt_vehicle_type_classifier(std::string node_name, 
+                                                                    std::string vehicle_type_cls_model_path, 
+                                                                    std::vector<int> p_class_ids_applied_to,
+                                                                    int min_width_applied_to, int min_height_applied_to):
+                                                                    vp_secondary_infer_node(node_name, "", "", "", 1, 1, 1, p_class_ids_applied_to, min_width_applied_to, min_height_applied_to) {
+        vehicle_type_classifier = std::make_shared<trt_vehicle::VehicleTypeClassifier>(vehicle_type_cls_model_path);
+        this->initialized();
+    }
+    
+    vp_trt_vehicle_type_classifier::~vp_trt_vehicle_type_classifier() {
+        deinitialized();
+    }
+
+    void vp_trt_vehicle_type_classifier::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_secondary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // infer using trt_vehicle library
+        start_time = std::chrono::system_clock::now();
+        std::vector<trt_vehicle::ObjCls> vehicles_type;
+        vehicle_type_classifier->classify(mats_to_infer, vehicles_type);
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto index = 0;
+        for (int i = 0; i < vehicles_type.size(); i++) {
+            for (int j = index; j < frame_meta->targets.size(); j++) {
+                // need apply or not?
+                if (!need_apply(frame_meta->targets[j]->primary_class_id, frame_meta->targets[j]->width, frame_meta->targets[j]->height)) {
+                    // continue as its primary_class_id is not in p_class_ids_applied_to
+                    continue;
+                }
+
+                // update back to frame meta
+                frame_meta->targets[j]->secondary_class_ids.push_back(vehicles_type[i].class_);
+                frame_meta->targets[j]->secondary_scores.push_back(vehicles_type[i].score);
+                frame_meta->targets[j]->secondary_labels.push_back(vehicles_type[i].label);
+
+                // break as we found the right target!
+                index = j + 1;
+                break;
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_vehicle_type_classifier::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_vehicle_type_classifier.h
+++ b/nodes/infers/vp_trt_vehicle_type_classifier.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_secondary_infer_node.h"
+#include "../../third_party/trt_vehicle/models/vehicle_type_classifier.h"
+
+namespace vp_nodes {
+    // vehicle type classifier based on tensorrt using trt_vehicle library
+    // update secondary_class_ids/secondary_labels/secondary_scores of vp_frame_target.
+    class vp_trt_vehicle_type_classifier: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_vehicle::VehicleTypeClassifier> vehicle_type_classifier = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_vehicle_type_classifier(std::string node_name, std::string vehicle_type_cls_model_path = "", std::vector<int> p_class_ids_applied_to = std::vector<int>(), int min_width_applied_to = 0, int min_height_applied_to = 0);
+        ~vp_trt_vehicle_type_classifier();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_classifier.cpp
+++ b/nodes/infers/vp_trt_yolov8_classifier.cpp
@@ -0,0 +1,67 @@
+
+#ifdef VP_WITH_TRT
+#include "vp_trt_yolov8_classifier.h"
+
+namespace vp_nodes {
+    vp_trt_yolov8_classifier::vp_trt_yolov8_classifier(std::string node_name, 
+                                                        std::string model_path, 
+                                                        std::string labels_path,  
+                                                        std::vector<int> p_class_ids_applied_to,
+                                                        int min_width_applied_to, int min_height_applied_to):
+                                                        vp_secondary_infer_node(node_name, "", "", labels_path, 1, 1, 1, p_class_ids_applied_to, min_width_applied_to, min_height_applied_to) {
+        yolov8_classifier = std::make_shared<trt_yolov8::trt_yolov8_classifier>(model_path);
+        this->initialized();
+    }
+    
+    vp_trt_yolov8_classifier::~vp_trt_yolov8_classifier() {
+        deinitialized();
+    }
+
+    void vp_trt_yolov8_classifier::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_secondary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // infer using trt_vehicle library
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<Classification>> classidications;
+        yolov8_classifier->classify(mats_to_infer, classidications);
+
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto index = 0;
+        for (int i = 0; i < classidications.size(); i++) {
+            for (int j = index; j < frame_meta->targets.size(); j++) {
+                // need apply or not?
+                if (!need_apply(frame_meta->targets[j]->primary_class_id, frame_meta->targets[j]->width, frame_meta->targets[j]->height)) {
+                    // continue as its primary_class_id is not in p_class_ids_applied_to
+                    continue;
+                }
+
+                // update back to frame meta
+                frame_meta->targets[j]->secondary_class_ids.push_back(classidications[i][0].class_id);
+                frame_meta->targets[j]->secondary_scores.push_back(classidications[i][0].conf);
+                auto label = labels.size() == 0 ? "" : labels[classidications[i][0].class_id];
+                frame_meta->targets[j]->secondary_labels.push_back(label);
+
+                // break as we found the right target!
+                index = j + 1;
+                break;
+            }
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_yolov8_classifier::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_classifier.h
+++ b/nodes/infers/vp_trt_yolov8_classifier.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_secondary_infer_node.h"
+#include "../../third_party/trt_yolov8/trt_yolov8_classifier.h"
+
+namespace vp_nodes {
+    // universal yolov8 classifier based on tensorrt using third_party/trt_yolov8 library
+    class vp_trt_yolov8_classifier: public vp_secondary_infer_node
+    {
+    private:
+        /* data */
+        std::shared_ptr<trt_yolov8::trt_yolov8_classifier> yolov8_classifier = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_yolov8_classifier(std::string node_name, 
+                                std::string model_path,
+                                std::string labels_path = "",  
+                                std::vector<int> p_class_ids_applied_to = std::vector<int>(), 
+                                int min_width_applied_to = 0, 
+                                int min_height_applied_to = 0);
+        ~vp_trt_yolov8_classifier();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_detector.cpp
+++ b/nodes/infers/vp_trt_yolov8_detector.cpp
@@ -0,0 +1,69 @@
+
+#ifdef VP_WITH_TRT
+#include "vp_trt_yolov8_detector.h"
+
+namespace vp_nodes {
+    
+    vp_trt_yolov8_detector::vp_trt_yolov8_detector(std::string node_name, 
+                                                    std::string model_path,
+                                                    std::string labels_path):
+                                                    vp_primary_infer_node(node_name, "", "", labels_path) {
+        yolov8_detector = std::make_shared<trt_yolov8::trt_yolov8_detector>(model_path);
+        this->initialized();
+    }
+    
+    vp_trt_yolov8_detector::~vp_trt_yolov8_detector() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_yolov8_detector::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<Detection>> detections;
+        yolov8_detector->detect(mats_to_infer, detections);
+
+        assert(detections.size() == 1);
+        auto& detection_list = detections[0];
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        for (int i = 0; i < detection_list.size(); i++) {
+            auto& objbox = detection_list[i];
+
+            // objbox.bbox: center_x center_y width height
+            auto rect = get_rect(frame_meta->frame, objbox.bbox);  // convert to: x, y, width,height
+            
+            // check value range
+            int x = std::max(rect.x, 0);
+            int y = std::max(rect.y, 0);
+            int width = std::min(rect.width, frame_meta->frame.cols - x);
+            int height = std::min(rect.height, frame_meta->frame.rows - y);
+            if (width <= 0 || height <= 0) {
+                continue;
+            }
+            auto label = labels.size() == 0 ? "" : labels[objbox.class_id];
+            auto target = std::make_shared<vp_objects::vp_frame_target>(x, y, width, height, 
+                                                                        objbox.class_id, objbox.conf, frame_meta->frame_index, frame_meta->channel_index, label);
+            // create target and update back into frame meta
+            frame_meta->targets.push_back(target);
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_yolov8_detector::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_detector.h
+++ b/nodes/infers/vp_trt_yolov8_detector.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_yolov8/trt_yolov8_detector.h"
+
+namespace vp_nodes {
+    // universal yolov8 detector based on tensorrt using third_party/trt_yolov8 library
+    class vp_trt_yolov8_detector: public vp_primary_infer_node
+    {
+    private:
+        std::shared_ptr<trt_yolov8::trt_yolov8_detector> yolov8_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_yolov8_detector(std::string node_name, std::string model_path, std::string labels_path = "");
+        ~vp_trt_yolov8_detector();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_pose_detector.cpp
+++ b/nodes/infers/vp_trt_yolov8_pose_detector.cpp
@@ -0,0 +1,60 @@
+
+#ifdef VP_WITH_TRT
+#include "vp_trt_yolov8_pose_detector.h"
+
+namespace vp_nodes {
+    
+    vp_trt_yolov8_pose_detector::vp_trt_yolov8_pose_detector(std::string node_name, 
+                                                    std::string model_path):
+                                                    vp_primary_infer_node(node_name, "") {
+        yolov8_pose_detector = std::make_shared<trt_yolov8::trt_yolov8_pose_detector>(model_path);
+        this->initialized();
+    }
+    
+    vp_trt_yolov8_pose_detector::~vp_trt_yolov8_pose_detector() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_yolov8_pose_detector::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<Detection>> detections;
+        yolov8_pose_detector->detect(mats_to_infer, detections);
+
+        assert(detections.size() == 1);
+        auto& detection_list = detections[0];
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        for (int i = 0; i < detection_list.size(); i++) {
+            auto& objbox = detection_list[i];
+            auto rect = get_rect_adapt_landmark(frame_meta->frame, objbox.bbox, objbox.keypoints);
+            
+            std::vector<vp_objects::vp_pose_keypoint> kps;    
+            for (int j = 0; j < 51; j += 3) {
+                kps.push_back(vp_objects::vp_pose_keypoint {j, int(objbox.keypoints[j]), int(objbox.keypoints[j + 1]), objbox.keypoints[j + 2]});
+            }
+                        
+            auto pose_target = std::make_shared<vp_objects::vp_frame_pose_target>(vp_objects::vp_pose_type::yolov8_pose_17, kps);
+            frame_meta->pose_targets.push_back(pose_target);            
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_yolov8_pose_detector::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_pose_detector.h
+++ b/nodes/infers/vp_trt_yolov8_pose_detector.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_yolov8/trt_yolov8_pose_detector.h"
+
+namespace vp_nodes {
+    // universal yolov8 pose detector based on tensorrt using third_party/trt_yolov8 library
+    class vp_trt_yolov8_pose_detector: public vp_primary_infer_node
+    {
+    private:
+        std::shared_ptr<trt_yolov8::trt_yolov8_pose_detector> yolov8_pose_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_yolov8_pose_detector(std::string node_name, std::string model_path);
+        ~vp_trt_yolov8_pose_detector();
+    };
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_seg_detector.cpp
+++ b/nodes/infers/vp_trt_yolov8_seg_detector.cpp
@@ -0,0 +1,78 @@
+
+#ifdef VP_WITH_TRT
+#include "vp_trt_yolov8_seg_detector.h"
+
+namespace vp_nodes {
+    
+    vp_trt_yolov8_seg_detector::vp_trt_yolov8_seg_detector(std::string node_name, 
+                                                    std::string model_path,
+                                                    std::string labels_path):
+                                                    vp_primary_infer_node(node_name, "", "", labels_path) {
+        yolov8_seg_detector = std::make_shared<trt_yolov8::trt_yolov8_seg_detector>(model_path);
+        this->initialized();
+    }
+    
+    vp_trt_yolov8_seg_detector::~vp_trt_yolov8_seg_detector() {
+        deinitialized();
+    }
+
+    // please refer to vp_infer_node::run_infer_combinations
+    void vp_trt_yolov8_seg_detector::run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        assert(frame_meta_with_batch.size() == 1);
+        std::vector<cv::Mat> mats_to_infer;
+
+        // start
+        auto start_time = std::chrono::system_clock::now();
+
+        // prepare data, as same as base class
+        vp_primary_infer_node::prepare(frame_meta_with_batch, mats_to_infer);
+        auto prepare_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        start_time = std::chrono::system_clock::now();
+        std::vector<std::vector<Detection>> detections;
+        std::vector<std::vector<cv::Mat>> masks;
+        yolov8_seg_detector->detect(mats_to_infer, detections, masks);
+
+        assert(detections.size() == 1);
+        assert(masks.size() == 1);
+        auto& detection_list = detections[0];
+        auto& mask_list = masks[0];
+        assert(detection_list.size() == mask_list.size());
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        for (int i = 0; i < detection_list.size(); i++) {
+            auto& objbox = detection_list[i];
+            auto& mask = mask_list[i];
+            auto scaled_mask = scale_mask(mask, frame_meta->frame);
+
+            // objbox.bbox: center_x center_y width height
+            auto rect = get_rect(frame_meta->frame, objbox.bbox);  // convert to: x, y, width,height
+            // check value range
+            rect.x = std::max(rect.x, 0);
+            rect.y = std::max(rect.y, 0);
+            rect.width = std::min(rect.width, frame_meta->frame.cols - rect.x);
+            rect.height = std::min(rect.height, frame_meta->frame.rows - rect.y);
+            if (rect.width <= 0 || rect.height <= 0) {
+                continue;
+            }
+
+            auto label = labels.size() == 0 ? "" : labels[objbox.class_id];
+            auto target = std::make_shared<vp_objects::vp_frame_target>(rect.x, rect.y, rect.width, rect.height, 
+                                                                        objbox.class_id, objbox.conf, frame_meta->frame_index, frame_meta->channel_index, label);
+            auto rect_mask = scaled_mask(rect);
+            target->mask = rect_mask;
+
+            // create target and update back into frame meta
+            frame_meta->targets.push_back(target);
+        }
+        auto infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start_time);
+
+        // can not calculate preprocess time and postprocess time, set 0 by default.
+        vp_infer_node::infer_combinations_time_cost(mats_to_infer.size(), prepare_time.count(), 0, infer_time.count(), 0);
+    }
+
+    void vp_trt_yolov8_seg_detector::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+
+    }
+}
+#endif
--- a/nodes/infers/vp_trt_yolov8_seg_detector.h
+++ b/nodes/infers/vp_trt_yolov8_seg_detector.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef VP_WITH_TRT
+#include "../vp_primary_infer_node.h"
+#include "../../third_party/trt_yolov8/trt_yolov8_seg_detector.h"
+
+namespace vp_nodes {
+    // universal yolov8 segmentation detector based on tensorrt using third_party/trt_yolov8 library
+    class vp_trt_yolov8_seg_detector: public vp_primary_infer_node
+    {
+    private:
+        std::shared_ptr<trt_yolov8::trt_yolov8_seg_detector> yolov8_seg_detector = nullptr;
+    protected:
+        // we need a totally new logic for the whole infer combinations
+        // no separate step pre-defined needed in base class
+        virtual void run_infer_combinations(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        // override pure virtual method, for compile pass
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_trt_yolov8_seg_detector(std::string node_name, std::string model_path, std::string labels_path = "");
+        ~vp_trt_yolov8_seg_detector();
+    };
+}
+#endif
--- a/nodes/infers/vp_yolo5_seg_node.cpp
+++ b/nodes/infers/vp_yolo5_seg_node.cpp
@@ -0,0 +1,307 @@
+
+#include <cmath>
+#include <algorithm>
+#include <numeric> 
+#include "vp_yolo5_seg_node.h"
+
+
+namespace vp_nodes {
+    vp_yolo5_seg_node::vp_yolo5_seg_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string labels_path, 
+                            int input_width, 
+                            int input_height, 
+                            int batch_size,
+                            int class_id_offset,
+                            float conf_threshold,
+                            float iou_threshold):
+                            vp_primary_infer_node(node_name, model_path, "", labels_path, input_width, input_height, batch_size, class_id_offset),
+                            conf_threshold(conf_threshold),
+                            iou_threshold(iou_threshold) {
+        this->initialized();
+    }
+    
+    vp_yolo5_seg_node::~vp_yolo5_seg_node() {
+        deinitialized();
+    }
+
+    void vp_yolo5_seg_node::prepare(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch, std::vector<cv::Mat>& mats_to_infer) {
+        for (auto& i: frame_meta_with_batch) {
+            auto letterboxed = letterbox(i->frame, cv::Size(input_width, input_height), scale_ratio, padding);
+            mats_to_infer.push_back(letterboxed);
+        }
+    }
+
+    void vp_yolo5_seg_node::infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) {
+        // blob_to_infer is a 4D matrix
+        // the first dim is number of batch, MUST be 1
+        assert(blob_to_infer.dims == 4);
+        assert(blob_to_infer.size[0] == 1);
+        assert(!net.empty());
+
+        net.setInput(blob_to_infer);
+        net.forward(raw_outputs, net.getUnconnectedOutLayersNames());
+    }
+
+    void vp_yolo5_seg_node::preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) {
+        // ignore preprocess logic in base class
+        cv::dnn::blobFromImages(mats_to_infer, blob_to_infer, 1.0 / 255.0, cv::Size(), cv::Scalar(), true, false);
+    }
+
+    void vp_yolo5_seg_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        auto& frame_meta = frame_meta_with_batch[0];
+        auto& frame = frame_meta->frame;
+        auto pred = raw_outputs[0]; // [1, N, 4+1+NUM_CLASSES+32]
+        auto proto = raw_outputs[1]; // [1, 32, 96, 160]
+
+        std::vector<cv::Rect> boxes;
+        std::vector<float> confidences;
+        std::vector<int> class_ids;
+        std::vector<cv::Mat> mask_coeffs;
+
+        int num_boxes = pred.size[1];
+        float* data = (float*)pred.data;
+
+        // decode
+        for (int i = 0; i < num_boxes; ++i) {
+            float obj_conf = data[4];
+            if (obj_conf < conf_threshold) {
+                data += (5 + labels.size() + MASK_CHANNELS);
+                continue;
+            }
+
+            int class_id = -1;
+            float max_cls_conf = 0;
+            for (int c = 0; c < labels.size(); ++c) {
+                float cls_conf = data[5 + c];
+                if (cls_conf > max_cls_conf) {
+                    max_cls_conf = cls_conf;
+                    class_id = c;
+                }
+            }
+
+            float total_conf = obj_conf * max_cls_conf;
+            if (total_conf < conf_threshold || class_id == -1) {
+                data += (5 + labels.size() + MASK_CHANNELS);
+                continue;
+            }
+
+            float cx = data[0];
+            float cy = data[1];
+            float w  = data[2];
+            float h  = data[3];
+
+            float x1 = cx - w * 0.5f;
+            float y1 = cy - h * 0.5f;
+            float x2 = cx + w * 0.5f;
+            float y2 = cy + h * 0.5f;
+
+            if (x2 <= x1 || y2 <= y1 || w <= 0 || h <= 0) {
+                data += (5 + labels.size() + MASK_CHANNELS);
+                continue;
+            }
+
+            float x_scale = scale_ratio.at<float>(0);
+            float y_scale = scale_ratio.at<float>(1);
+            int orig_x1 = cvRound((x1 - padding.x) * x_scale);
+            int orig_y1 = cvRound((y1 - padding.y) * y_scale);
+            int orig_x2 = cvRound((x2 - padding.x) * x_scale);
+            int orig_y2 = cvRound((y2 - padding.y) * y_scale);
+
+            orig_x1 = max(0, orig_x1);
+            orig_y1 = max(0, orig_y1);
+            orig_x2 = min(frame.cols, orig_x2);
+            orig_y2 = min(frame.rows, orig_y2);
+
+            if (orig_x2 <= orig_x1 || orig_y2 <= orig_y1) {
+                data += (5 + labels.size() + MASK_CHANNELS);
+                continue;
+            }
+
+            cv::Rect box(orig_x1, orig_y1, orig_x2 - orig_x1, orig_y2 - orig_y1);
+            boxes.push_back(box);
+            confidences.push_back(total_conf);
+            class_ids.push_back(class_id);
+
+            cv::Mat coeff(1, MASK_CHANNELS, CV_32F);
+            for (int m = 0; m < MASK_CHANNELS; ++m) {
+                coeff.at<float>(m) = data[5 + labels.size() + m];
+            }
+            mask_coeffs.push_back(coeff);
+
+            data += (5 + labels.size() + MASK_CHANNELS);
+        }
+
+        std::vector<int> final_indices;
+        std::unordered_map<int, std::vector<int>> class_boxes;
+        for (int i = 0; i < boxes.size(); ++i) {
+            class_boxes[class_ids[i]].push_back(i);
+        }
+
+        // NMS
+        for (auto& kv : class_boxes) {
+            const std::vector<int>& indices_in_class = kv.second;
+            std::vector<cv::Rect> boxes_cls;
+            std::vector<float> scores_cls;
+            for (int idx : indices_in_class) {
+                boxes_cls.push_back(boxes[idx]);
+                scores_cls.push_back(confidences[idx]);
+            }
+
+            std::vector<int> nms_indices = nms(boxes_cls, scores_cls, iou_threshold);
+            for (int local_idx : nms_indices) {
+                final_indices.push_back(indices_in_class[local_idx]);
+            }
+        }
+
+        // create target push back to frame meta
+        for (int idx : final_indices) {
+            cv::Rect box = boxes[idx];
+            float conf = confidences[idx];
+            int cls = class_ids[idx];
+
+            auto label = (labels.size() < cls + 1) ? "" : labels[cls];            
+            cls += class_id_offset;
+            auto target = std::make_shared<vp_objects::vp_frame_target>(box.x, box.y, box.width, box.height, cls, conf, frame_meta->frame_index, frame_meta->channel_index, label);
+
+            // try to extract mask
+            auto local_mask = process_mask(proto, mask_coeffs[idx], box,
+                                          cv::Size(input_width, input_height),
+                                          padding, scale_ratio.at<float>(0), scale_ratio.at<float>(1));
+            target->mask = local_mask;
+            
+            frame_meta->targets.push_back(target);
+        }
+
+        std::cout << "Detected: " << final_indices.size() << " instances" << std::endl;
+    }
+
+    cv::Mat vp_yolo5_seg_node::letterbox(const cv::Mat& src, cv::Size target_size, cv::Mat& scale_ratio, cv::Point& padding) {
+        float scale = min(static_cast<float>(target_size.height) / src.rows,
+                        static_cast<float>(target_size.width) / src.cols);
+        int new_width = cvRound(src.cols * scale);
+        int new_height = cvRound(src.rows * scale);
+
+        cv::Mat resized;
+        resize(src, resized, cv::Size(new_width, new_height), 0, 0, cv::INTER_LINEAR);
+
+        cv::Mat letterboxed(target_size, CV_8UC3, cv::Scalar(114, 114, 114));
+        int top = (target_size.height - new_height) / 2;
+        int left = (target_size.width - new_width) / 2;
+        cv::Rect roi(left, top, new_width, new_height);
+        resized.copyTo(letterboxed(roi));
+
+        scale_ratio = cv::Mat(1, 2, CV_32F);
+        scale_ratio.at<float>(0) = 1.0f / scale;  // x_scale
+        scale_ratio.at<float>(1) = 1.0f / scale;  // y_scale
+
+        padding = cv::Point(left, top);
+        return letterboxed;
+    }
+
+    // Sigmoid
+    cv::Mat vp_yolo5_seg_node::sigmoid(const cv::Mat& x) {
+        cv::Mat result;
+        cv::exp(-x, result);
+        result = 1.0f / (1.0f + result);
+        return result;
+    }
+
+    cv::Mat vp_yolo5_seg_node::process_mask(const cv::Mat& proto,           // [1, 32, H/4, W/4]
+                    const cv::Mat& coeffs,          // [1, 32]
+                    const cv::Rect& bbox,           
+                    const cv::Size& input_size,     // e.g., 640x384
+                    const cv::Point& padding,
+                    const float x_scale,
+                    const float y_scale) {
+
+        // 1. 获取 proto 尺寸 [1,32,96,160] → h=96, w=160
+        int h_proto = proto.size[2];
+        int w_proto = proto.size[3];
+
+        // 2. 重塑为 [32, h, w]
+        cv::Mat proto_3d = proto.reshape(1, {MASK_CHANNELS, h_proto, w_proto});
+
+        // 3. 展平为 [32, h*w]
+        cv::Mat proto_flat = proto_3d.reshape(1, MASK_CHANNELS);
+
+        // 4. 矩阵乘: [1,32] × [32, h*w] → [1, h*w]
+        cv::Mat mask_raw = coeffs * proto_flat;
+
+        // 5. 重塑为 [h, w]
+        cv::Mat mask_2d = mask_raw.reshape(0, h_proto);
+
+        // 6. Sigmoid
+        cv::Mat mask_sigmoid = sigmoid(mask_2d);
+
+        // 7. 原图 bbox → 输入图坐标（含 padding）
+        float x1_in = bbox.x / x_scale + padding.x;
+        float y1_in = bbox.y / y_scale + padding.y;
+        float x2_in = (bbox.x + bbox.width) / x_scale + padding.x;
+        float y2_in = (bbox.y + bbox.height) / y_scale + padding.y;
+
+        // 8. 输入图 → proto 坐标（/4）
+        float x1_p = x1_in / 4.0f;
+        float y1_p = y1_in / 4.0f;
+        float x2_p = x2_in / 4.0f;
+        float y2_p = y2_in / 4.0f;
+
+        int px1 = max(0, (int)floor(x1_p));
+        int py1 = max(0, (int)floor(y1_p));
+        int px2 = min(w_proto, (int)ceil(x2_p));
+        int py2 = min(h_proto, (int)ceil(y2_p));
+
+        if (px2 <= px1 || py2 <= py1) {
+            return cv::Mat::zeros(bbox.size(), CV_8UC1);
+        }
+
+        // 9. 裁剪
+        cv::Mat cropped = mask_sigmoid(cv::Rect(px1, py1, px2 - px1, py2 - py1)).clone();
+
+        // 10. 上采样到 bbox 尺寸
+        cv::Mat resized;
+        resize(cropped, resized, bbox.size(), 0, 0, cv::INTER_LINEAR);
+
+        // 11. 二值化
+        cv::Mat mask_bin;
+        threshold(resized, mask_bin, 0.5, 255, cv::THRESH_BINARY);
+        mask_bin.convertTo(mask_bin, CV_8UC1);
+
+        return mask_bin;
+    }
+
+    // NMS
+    std::vector<int> vp_yolo5_seg_node::nms(const std::vector<cv::Rect>& boxes, const std::vector<float>& scores, float iou_threshold) {
+        std::vector<int> indices;
+        std::vector<float> areas(boxes.size());
+        for (size_t i = 0; i < boxes.size(); ++i) {
+            areas[i] = boxes[i].area();
+        }
+
+        std::vector<int> order(scores.size());
+        iota(order.begin(), order.end(), 0);
+        sort(order.begin(), order.end(), [&](int a, int b) {
+            return scores[a] > scores[b];
+        });
+
+        std::vector<bool> keep(scores.size(), true);
+        for (int i = 0; i < order.size(); ++i) {
+            int idx = order[i];
+            if (!keep[idx]) continue;
+            for (int j = i + 1; j < order.size(); ++j) {
+                int idx2 = order[j];
+                if (!keep[idx2]) continue;
+                cv::Rect intersect = boxes[idx] & boxes[idx2];
+                float iou = intersect.area() / (areas[idx] + areas[idx2] - intersect.area() + 1e-6f);
+                if (iou > iou_threshold) {
+                    keep[idx2] = false;
+                }
+            }
+        }
+
+        for (int i = 0; i < keep.size(); ++i) {
+            if (keep[i]) indices.push_back(i);
+        }
+        return indices;
+    }
+}
--- a/nodes/infers/vp_yolo5_seg_node.h
+++ b/nodes/infers/vp_yolo5_seg_node.h
@@ -0,0 +1,48 @@
+
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+#include "../../objects/vp_frame_target.h"
+
+
+
+namespace vp_nodes {
+    // driving area segmentation based on yolov5s-seg-v7.0
+    // https://github.com/ultralytics/yolov5/releases/tag/v7.0
+    class vp_yolo5_seg_node: public vp_primary_infer_node
+    {
+    private:
+        /* data */
+        float conf_threshold = 0.5;
+        float iou_threshold = 0.7;
+        int MASK_CHANNELS = 32;
+        cv::Mat scale_ratio;
+        cv::Point padding;
+        cv::Mat letterbox(const cv::Mat& src, cv::Size target_size, cv::Mat& scale_ratio, cv::Point& padding);
+        cv::Mat sigmoid(const cv::Mat& x);
+        cv::Mat process_mask(const cv::Mat& proto,  // [1, 32, H/4, W/4]
+                    const cv::Mat& coeffs,          // [1, 32]
+                    const cv::Rect& bbox,           
+                    const cv::Size& input_size,     // e.g., 640x384
+                    const cv::Point& padding,
+                    const float x_scale,
+                    const float y_scale);
+        std::vector<int> nms(const std::vector<cv::Rect>& boxes, const std::vector<float>& scores, float iou_threshold);
+    protected:
+        virtual void infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) override;
+        virtual void preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) override;
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+        virtual void prepare(const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch, std::vector<cv::Mat>& mats_to_infer) override;
+    public:
+        vp_yolo5_seg_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string labels_path = "", 
+                            int input_width = 640, 
+                            int input_height = 384, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float conf_threshold = 0.5,
+                            float iou_threshold = 0.7);
+        ~vp_yolo5_seg_node();
+    };    
+}
--- a/nodes/infers/vp_yolo_detector_node.cpp
+++ b/nodes/infers/vp_yolo_detector_node.cpp
@@ -0,0 +1,118 @@
+
+#include "vp_yolo_detector_node.h"
+
+namespace vp_nodes {
+    
+    vp_yolo_detector_node::vp_yolo_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path, 
+                            std::string labels_path, 
+                            int input_width, 
+                            int input_height, 
+                            int batch_size,
+                            int class_id_offset,
+                            float score_threshold,
+                            float confidence_threshold,
+                            float nms_threshold,
+                            float scale,
+                            cv::Scalar mean,
+                            cv::Scalar std,
+                            bool swap_rb):
+                            vp_primary_infer_node(node_name, model_path, model_config_path,labels_path,input_width, input_height, batch_size, class_id_offset, scale, mean, std, swap_rb),
+                            score_threshold(score_threshold),
+                            confidence_threshold(confidence_threshold),
+                            nms_threshold(nms_threshold) {
+        this->initialized();
+    }
+    
+    vp_yolo_detector_node::~vp_yolo_detector_node() {
+        deinitialized();
+    }
+    
+    void vp_yolo_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        // make sure heads of output are not zero
+        assert(raw_outputs.size() > 0);
+
+        // check dims of each output
+        auto dim_offset = raw_outputs[0].dims == 2 ? 0 : 1;
+        auto batch = dim_offset == 0 ? 1 : raw_outputs[0].size[0];
+
+        std::vector<int> class_ids;
+        std::vector<float> confidences;
+        std::vector<cv::Rect> boxes;
+
+        for (int k = 0; k < batch; k++) {  // scan batch
+            auto& frame_meta = frame_meta_with_batch[k];
+            for (int i = 0; i < raw_outputs.size(); ++i) {  // scan heads of output
+                cv::Mat output = cv::Mat(raw_outputs[i].size[0 + dim_offset], raw_outputs[i].size[1 + dim_offset], CV_32F, const_cast<uchar*>(raw_outputs[i].ptr(k)));
+
+                auto data = (float*)output.data;
+                for (int j = 0; j < output.rows; ++j, data += output.cols) {
+                    
+                    float confidence = data[4];
+                    // check confidence threshold
+                    if (confidence < confidence_threshold) {
+                        continue;
+                    }
+
+                    cv::Mat scores = output.row(j).colRange(5, output.cols);
+                    cv::Point class_id;
+                    double max_score;
+                    // Get the value and location of the maximum score
+                    cv::minMaxLoc(scores, 0, &max_score, 0, &class_id);
+
+                    // check score threshold
+                    if (max_score >= score_threshold) {
+                        // smaller than 1.0 means relative value 
+                        // greater than 1.0 means absolute value which need to be converted to relative value
+                        int centerX = (int)((data[0] <= 1 ? data[0] : data[0] / input_width) * frame_meta->frame.cols);
+                        int centerY = (int)((data[1] <= 1 ? data[1] : data[1] / input_height) * frame_meta->frame.rows);
+                        int width = (int)((data[2] <= 1 ? data[2] : data[2] / input_width) * frame_meta->frame.cols);
+                        int height = (int)((data[3] <= 1 ? data[3] : data[3] / input_height) * frame_meta->frame.rows);
+                        int left = centerX - width / 2;
+                        int top = centerY - height / 2;
+
+                        class_ids.push_back(class_id.x);
+                        confidences.push_back((float)confidence);
+                        boxes.push_back(cv::Rect(left, top, width, height));
+                    }
+                }
+            }
+
+            // nms
+            std::vector<int> indices;
+            cv::dnn::NMSBoxes(boxes, confidences, confidence_threshold, nms_threshold, indices);
+
+            // create target
+            for (int i = 0; i < indices.size(); ++i) {
+                int idx = indices[i];
+                auto box = boxes[idx];
+                auto confidence = confidences[idx];
+                auto class_id = class_ids[idx];
+                auto label = (labels.size() < class_id + 1) ? "" : labels[class_id];
+
+                // check value range
+                box.x = std::max(box.x, 0);
+                box.y = std::max(box.y, 0);
+                box.width = std::min(box.width, frame_meta->frame.cols - box.x);
+                box.height = std::min(box.height, frame_meta->frame.rows - box.y);
+                if (box.width <= 0 || box.height <= 0) {
+                    continue;
+                }
+                
+                // apply offset to class id since multi detectors can exist at front of this one.
+                // later we MUST use the new class id (applied offset) instead of orignal one.
+                class_id += class_id_offset;
+
+                auto target = std::make_shared<vp_objects::vp_frame_target>(box.x, box.y, box.width, box.height, class_id, confidence, frame_meta->frame_index, frame_meta->channel_index, label);
+
+                // insert target back to frame meta
+                frame_meta->targets.push_back(target);
+            }
+
+            class_ids.clear();
+            confidences.clear();
+            boxes.clear();
+        }
+    }
+}
--- a/nodes/infers/vp_yolo_detector_node.h
+++ b/nodes/infers/vp_yolo_detector_node.h
@@ -0,0 +1,35 @@
+
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+
+namespace vp_nodes {
+    // yolo detector, support yolov3/4/5
+    // https://github.com/pjreddie/darknet
+    class vp_yolo_detector_node: public vp_primary_infer_node
+    {
+    private:
+        float score_threshold;
+        float confidence_threshold;
+        float nms_threshold;
+    protected:
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_yolo_detector_node(std::string node_name, 
+                            std::string model_path, 
+                            std::string model_config_path = "", 
+                            std::string labels_path = "", 
+                            int input_width = 416, 
+                            int input_height = 416, 
+                            int batch_size = 1,
+                            int class_id_offset = 0,
+                            float score_threshold = 0.5,
+                            float confidence_threshold = 0.5,
+                            float nms_threshold = 0.5,
+                            float scale = 1 / 255.0,
+                            cv::Scalar mean = cv::Scalar(0),
+                            cv::Scalar std = cv::Scalar(1),
+                            bool swap_rb = true);
+        ~vp_yolo_detector_node();
+    };
+}
--- a/nodes/infers/vp_yunet_face_detector_node.cpp
+++ b/nodes/infers/vp_yunet_face_detector_node.cpp
@@ -0,0 +1,229 @@
+
+#include "vp_yunet_face_detector_node.h"
+
+
+namespace vp_nodes {
+        
+    vp_yunet_face_detector_node::vp_yunet_face_detector_node(std::string node_name, 
+                                                            std::string model_path, 
+                                                            float score_threshold, 
+                                                            float nms_threshold, 
+                                                            int top_k):
+                                                            vp_primary_infer_node(node_name, model_path),
+                                                            scoreThreshold(score_threshold),
+                                                            nmsThreshold(nms_threshold),
+                                                            topK(top_k) {
+        this->initialized();
+    }
+    
+    vp_yunet_face_detector_node::~vp_yunet_face_detector_node() {
+        deinitialized();
+    }
+    
+    void vp_yunet_face_detector_node::postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) {
+        using namespace cv;
+        // 3 heads of output
+        assert(raw_outputs.size() == 3);
+        assert(frame_meta_with_batch.size() == 1);
+        auto& frame_meta = frame_meta_with_batch[0];
+
+        // Extract from output_blobs
+        Mat loc = raw_outputs[0];
+        Mat conf = raw_outputs[1];
+        Mat iou = raw_outputs[2];
+
+        // we need generate priors if input size changed or priors is not initialized
+        if (loc.rows != priors.size()) {
+            inputW = frame_meta->frame.cols;
+            inputH = frame_meta->frame.rows;
+            generatePriors();
+        }
+        
+        assert(loc.rows == priors.size());
+        assert(loc.rows == conf.rows);
+        assert(loc.rows == iou.rows);
+
+        // Decode from deltas and priors
+        const std::vector<float> variance = {0.1f, 0.2f};
+        float* loc_v = (float*)(loc.data);
+        float* conf_v = (float*)(conf.data);
+        float* iou_v = (float*)(iou.data);
+        Mat faces;
+        // (tl_x, tl_y, w, h, re_x, re_y, le_x, le_y, nt_x, nt_y, rcm_x, rcm_y, lcm_x, lcm_y, score)
+        // 'tl': top left point of the bounding box
+        // 're': right eye, 'le': left eye
+        // 'nt':  nose tip
+        // 'rcm': right corner of mouth, 'lcm': left corner of mouth
+        Mat face(1, 15, CV_32FC1);
+        for (size_t i = 0; i < priors.size(); ++i) {
+            // Get score
+            float clsScore = conf_v[i*2+1];
+            float iouScore = iou_v[i];
+            // Clamp
+            if (iouScore < 0.f) {
+                iouScore = 0.f;
+            }
+            else if (iouScore > 1.f) {
+                iouScore = 1.f;
+            }
+            float score = std::sqrt(clsScore * iouScore);
+            face.at<float>(0, 14) = score;
+
+            // Get bounding box
+            float cx = (priors[i].x + loc_v[i*14+0] * variance[0] * priors[i].width)  * inputW;
+            float cy = (priors[i].y + loc_v[i*14+1] * variance[0] * priors[i].height) * inputH;
+            float w  = priors[i].width  * exp(loc_v[i*14+2] * variance[0]) * inputW;
+            float h  = priors[i].height * exp(loc_v[i*14+3] * variance[1]) * inputH;
+            float x1 = cx - w / 2;
+            float y1 = cy - h / 2;
+            face.at<float>(0, 0) = x1;
+            face.at<float>(0, 1) = y1;
+            face.at<float>(0, 2) = w;
+            face.at<float>(0, 3) = h;
+
+            // Get landmarks
+            face.at<float>(0, 4) = (priors[i].x + loc_v[i*14+ 4] * variance[0] * priors[i].width)  * inputW;  // right eye, x
+            face.at<float>(0, 5) = (priors[i].y + loc_v[i*14+ 5] * variance[0] * priors[i].height) * inputH;  // right eye, y
+            face.at<float>(0, 6) = (priors[i].x + loc_v[i*14+ 6] * variance[0] * priors[i].width)  * inputW;  // left eye, x
+            face.at<float>(0, 7) = (priors[i].y + loc_v[i*14+ 7] * variance[0] * priors[i].height) * inputH;  // left eye, y
+            face.at<float>(0, 8) = (priors[i].x + loc_v[i*14+ 8] * variance[0] * priors[i].width)  * inputW;  // nose tip, x
+            face.at<float>(0, 9) = (priors[i].y + loc_v[i*14+ 9] * variance[0] * priors[i].height) * inputH;  // nose tip, y
+            face.at<float>(0, 10) = (priors[i].x + loc_v[i*14+10] * variance[0] * priors[i].width)  * inputW; // right corner of mouth, x
+            face.at<float>(0, 11) = (priors[i].y + loc_v[i*14+11] * variance[0] * priors[i].height) * inputH; // right corner of mouth, y
+            face.at<float>(0, 12) = (priors[i].x + loc_v[i*14+12] * variance[0] * priors[i].width)  * inputW; // left corner of mouth, x
+            face.at<float>(0, 13) = (priors[i].y + loc_v[i*14+13] * variance[0] * priors[i].height) * inputH; // left corner of mouth, y
+
+            faces.push_back(face);
+        }
+
+        if (faces.rows > 1)
+        {
+            // Retrieve boxes and scores
+            std::vector<Rect2i> faceBoxes;
+            std::vector<float> faceScores;
+            for (int rIdx = 0; rIdx < faces.rows; rIdx++)
+            {
+                faceBoxes.push_back(Rect2i(int(faces.at<float>(rIdx, 0)),
+                                           int(faces.at<float>(rIdx, 1)),
+                                           int(faces.at<float>(rIdx, 2)),
+                                           int(faces.at<float>(rIdx, 3))));
+                faceScores.push_back(faces.at<float>(rIdx, 14));
+            }
+
+            std::vector<int> keepIdx;
+            dnn::NMSBoxes(faceBoxes, faceScores, scoreThreshold, nmsThreshold, keepIdx, 1.f, topK);
+
+            // Get NMS results
+            Mat nms_faces;
+            for (int idx: keepIdx)
+            {
+                nms_faces.push_back(faces.row(idx));
+            }
+
+            // insert face target back to frame meta
+            for (int i = 0; i < nms_faces.rows; i++) {
+                auto x = int(nms_faces.at<float>(i, 0));
+                auto y = int(nms_faces.at<float>(i, 1));
+                auto w = int(nms_faces.at<float>(i, 2));
+                auto h = int(nms_faces.at<float>(i, 3));
+                
+                // check value range
+                x = std::max(x, 0);
+                y = std::max(y, 0);
+                w = std::min(w, frame_meta->frame.cols - x);
+                h = std::min(h, frame_meta->frame.rows - y);
+
+                auto kp1 = std::pair<int, int>(int(nms_faces.at<float>(i, 4)), int(nms_faces.at<float>(i, 5)));
+                auto kp2 = std::pair<int, int>(int(nms_faces.at<float>(i, 6)), int(nms_faces.at<float>(i, 7)));
+                auto kp3 = std::pair<int, int>(int(nms_faces.at<float>(i, 8)), int(nms_faces.at<float>(i, 9)));
+                auto kp4 = std::pair<int, int>(int(nms_faces.at<float>(i, 10)), int(nms_faces.at<float>(i, 11)));
+                auto kp5 = std::pair<int, int>(int(nms_faces.at<float>(i, 12)), int(nms_faces.at<float>(i, 13)));
+                auto score = nms_faces.at<float>(i, 14);
+
+                auto face_target = std::make_shared<vp_objects::vp_frame_face_target>(x, y, w, h, score, std::vector<std::pair<int, int>>{kp1, kp2, kp3, kp4, kp5});
+
+                frame_meta->face_targets.push_back(face_target);
+            }
+            
+        }
+    }
+
+    // refer to vp_infer_node::preprocess
+    void vp_yunet_face_detector_node::preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) {
+        cv::dnn::blobFromImages(mats_to_infer, blob_to_infer);
+    }
+    
+    // refer to vp_infer_node::infer
+    void vp_yunet_face_detector_node::infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) {
+        // blob_to_infer is a 4D matrix
+        // the first dim is number of batch, MUST be 1
+        assert(blob_to_infer.dims == 4);
+        assert(blob_to_infer.size[0] == 1);
+        assert(!net.empty());
+
+        net.setInput(blob_to_infer);
+        net.forward(raw_outputs, out_names);
+    }
+
+    void vp_yunet_face_detector_node::generatePriors() {
+        using namespace cv;
+        // Calculate shapes of different scales according to the shape of input image
+        Size feature_map_2nd = {
+            int(int((inputW+1)/2)/2), int(int((inputH+1)/2)/2)
+        };
+        Size feature_map_3rd = {
+            int(feature_map_2nd.width/2), int(feature_map_2nd.height/2)
+        };
+        Size feature_map_4th = {
+            int(feature_map_3rd.width/2), int(feature_map_3rd.height/2)
+        };
+        Size feature_map_5th = {
+            int(feature_map_4th.width/2), int(feature_map_4th.height/2)
+        };
+        Size feature_map_6th = {
+            int(feature_map_5th.width/2), int(feature_map_5th.height/2)
+        };
+
+        std::vector<Size> feature_map_sizes;
+        feature_map_sizes.push_back(feature_map_3rd);
+        feature_map_sizes.push_back(feature_map_4th);
+        feature_map_sizes.push_back(feature_map_5th);
+        feature_map_sizes.push_back(feature_map_6th);
+
+        // Fixed params for generating priors
+        const std::vector<std::vector<float>> min_sizes = {
+            {10.0f,  16.0f,  24.0f},
+            {32.0f,  48.0f},
+            {64.0f,  96.0f},
+            {128.0f, 192.0f, 256.0f}
+        };
+        CV_Assert(min_sizes.size() == feature_map_sizes.size()); // just to keep vectors in sync
+        const std::vector<int> steps = { 8, 16, 32, 64 };
+
+        // Generate priors
+        priors.clear();
+        for (size_t i = 0; i < feature_map_sizes.size(); ++i)
+        {
+            Size feature_map_size = feature_map_sizes[i];
+            std::vector<float> min_size = min_sizes[i];
+
+            for (int _h = 0; _h < feature_map_size.height; ++_h)
+            {
+                for (int _w = 0; _w < feature_map_size.width; ++_w)
+                {
+                    for (size_t j = 0; j < min_size.size(); ++j)
+                    {
+                        float s_kx = min_size[j] / inputW;
+                        float s_ky = min_size[j] / inputH;
+
+                        float cx = (_w + 0.5f) * steps[i] / inputW;
+                        float cy = (_h + 0.5f) * steps[i] / inputH;
+
+                        Rect2f prior = { cx, cy, s_kx, s_ky };
+                        priors.push_back(prior);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/nodes/infers/vp_yunet_face_detector_node.h
+++ b/nodes/infers/vp_yunet_face_detector_node.h
@@ -0,0 +1,34 @@
+
+#pragma once
+
+#include "../vp_primary_infer_node.h"
+#include "../../objects/vp_frame_face_target.h"
+
+namespace vp_nodes {
+    // face detector based on YunNet
+    // https://github.com/opencv/opencv/blob/4.x/modules/objdetect/src/face_detect.cpp
+    // https://github.com/ShiqiYu/libfacedetection
+    class vp_yunet_face_detector_node: public vp_primary_infer_node
+    {
+    private:
+        // names of output layers in yunet
+        const std::vector<std::string> out_names = {"loc", "conf", "iou"};
+        float scoreThreshold = 0.7;
+        float nmsThreshold = 0.5;
+        int topK = 50;
+        int inputW;
+        int inputH;
+        std::vector<cv::Rect2f> priors;
+        void generatePriors();
+    protected:
+        // override infer and preprocess as yunet has a different logic
+        virtual void infer(const cv::Mat& blob_to_infer, std::vector<cv::Mat>& raw_outputs) override;
+        virtual void preprocess(const std::vector<cv::Mat>& mats_to_infer, cv::Mat& blob_to_infer) override;
+
+        virtual void postprocess(const std::vector<cv::Mat>& raw_outputs, const std::vector<std::shared_ptr<vp_objects::vp_frame_meta>>& frame_meta_with_batch) override;
+    public:
+        vp_yunet_face_detector_node(std::string node_name, std::string model_path, float score_threshold = 0.7, float nms_threshold = 0.5, int top_k = 50);
+        ~vp_yunet_face_detector_node();
+    };
+
+}