更换文档检测模型

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions
--- a/paddle_detection/deploy/cpp/src/jde_detector.cc
+++ b/paddle_detection/deploy/cpp/src/jde_detector.cc
@@ -0,0 +1,368 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+#include "include/jde_detector.h"
+
+using namespace paddle_infer;
+
+namespace PaddleDetection {
+
+// Load the model and create the model predictor.
+//
+// Builds the Paddle Inference configuration for the exported model stored in
+// `model_dir` and creates `predictor_` from it.
+//
+// model_dir:  directory containing "model.pdmodel" and "model.pdiparams".
+// batch_size: forwarded to EnableTensorRtEngine; only used when the device is
+//             "GPU" and `run_mode` is not "paddle".
+// run_mode:   "paddle", "trt_fp32", "trt_fp16" or "trt_int8". Any other value
+//             is reported on stdout and TensorRT is still enabled with the
+//             default fp32 precision.
+void JDEDetector::LoadModel(const std::string& model_dir,
+                            const int batch_size,
+                            const std::string& run_mode) {
+  paddle_infer::Config config;
+  const std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel";
+  const std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams";
+  config.SetModel(prog_file, params_file);
+  if (this->device_ == "GPU") {
+    config.EnableUseGpu(200, this->gpu_id_);
+    config.SwitchIrOptim(true);
+    // use tensorrt
+    if (run_mode != "paddle") {
+      auto precision = paddle_infer::Config::Precision::kFloat32;
+      if (run_mode == "trt_fp32") {
+        precision = paddle_infer::Config::Precision::kFloat32;
+      } else if (run_mode == "trt_fp16") {
+        precision = paddle_infer::Config::Precision::kHalf;
+      } else if (run_mode == "trt_int8") {
+        precision = paddle_infer::Config::Precision::kInt8;
+      } else {
+        // Terminate the line so the diagnostic does not run into whatever is
+        // printed next. Loading continues with the fp32 default.
+        printf(
+            "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or "
+            "'trt_int8'\n");
+      }
+      // set tensorrt
+      config.EnableTensorRtEngine(1 << 30,
+                                  batch_size,
+                                  this->min_subgraph_size_,
+                                  precision,
+                                  false,
+                                  this->trt_calib_mode_);
+
+      // set use dynamic shape
+      if (this->use_dynamic_shape_) {
+        // Set the dynamic shape range for the "image" tensor: batch 1,
+        // 3 channels, square spatial size taken from the trt_*_shape_ members.
+        const std::vector<int> min_input_shape = {
+            1, 3, this->trt_min_shape_, this->trt_min_shape_};
+        const std::vector<int> max_input_shape = {
+            1, 3, this->trt_max_shape_, this->trt_max_shape_};
+        const std::vector<int> opt_input_shape = {
+            1, 3, this->trt_opt_shape_, this->trt_opt_shape_};
+        const std::map<std::string, std::vector<int>> map_min_input_shape = {
+            {"image", min_input_shape}};
+        const std::map<std::string, std::vector<int>> map_max_input_shape = {
+            {"image", max_input_shape}};
+        const std::map<std::string, std::vector<int>> map_opt_input_shape = {
+            {"image", opt_input_shape}};
+
+        config.SetTRTDynamicShapeInfo(
+            map_min_input_shape, map_max_input_shape, map_opt_input_shape);
+        std::cout << "TensorRT dynamic shape enabled" << std::endl;
+      }
+    }
+
+  } else if (this->device_ == "XPU") {
+    config.EnableXpu(10 * 1024 * 1024);
+  } else {
+    config.DisableGpu();
+    if (this->use_mkldnn_) {
+      config.EnableMKLDNN();
+      // cache 10 different shapes for mkldnn to avoid memory leak
+      config.SetMkldnnCacheCapacity(10);
+    }
+    config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
+  }
+  config.SwitchUseFeedFetchOps(false);
+  config.SwitchIrOptim(true);
+  config.DisableGlogInfo();
+  // Memory optimization
+  config.EnableMemoryOptim();
+  // CreatePredictor already yields a temporary; no std::move is needed.
+  predictor_ = CreatePredictor(config);
+}
+
+// Visualization of tracking results.
+//
+// Returns a copy of `img` with a header line ("frame", "fps", "num") and, for
+// every entry of `results`, its bounding box, track id and score drawn on it.
+// `img` itself is not modified.
+cv::Mat VisualizeTrackResult(const cv::Mat& img,
+                             const MOT_Result& results,
+                             const float fps,
+                             const int frame_id) {
+  cv::Mat vis_img = img.clone();
+  const int im_w = img.cols;
+  // Text size and box outline grow with the image width but never drop
+  // below 1.
+  const float text_scale = std::max(1, static_cast<int>(im_w / 1600.));
+  const int text_thickness = 2;
+  const int line_thickness = std::max(1, static_cast<int>(im_w / 500.));
+
+  std::ostringstream oss;
+  oss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
+  oss << "frame: " << frame_id << " ";
+  oss << "fps: " << fps << " ";
+  oss << "num: " << results.size();
+  const std::string text = oss.str();
+
+  const cv::Point origin(0, static_cast<int>(15 * text_scale));
+  // The colour must be an explicit cv::Scalar: a bare `(0, 0, 255)` is a
+  // comma expression that evaluates to 255 and would be converted to
+  // cv::Scalar(255) instead of the intended (0, 0, 255).
+  cv::putText(vis_img,
+              text,
+              origin,
+              cv::FONT_HERSHEY_PLAIN,
+              text_scale,
+              cv::Scalar(0, 0, 255),
+              text_thickness);
+
+  for (size_t i = 0; i < results.size(); ++i) {
+    const auto& track = results[i];
+    const int obj_id = track.ids;
+    const float score = track.score;
+
+    const cv::Scalar color = GetColor(obj_id);
+
+    const cv::Point pt1 = cv::Point(track.rects.left, track.rects.top);
+    const cv::Point pt2 = cv::Point(track.rects.right, track.rects.bottom);
+    // The id is written just below the top-left corner, the score just
+    // above it.
+    const cv::Point id_pt = cv::Point(track.rects.left, track.rects.top + 10);
+    const cv::Point score_pt =
+        cv::Point(track.rects.left, track.rects.top - 10);
+    cv::rectangle(vis_img, pt1, pt2, color, line_thickness);
+
+    std::ostringstream idoss;
+    idoss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
+    idoss << obj_id;
+    const std::string id_text = idoss.str();
+
+    cv::putText(vis_img,
+                id_text,
+                id_pt,
+                cv::FONT_HERSHEY_PLAIN,
+                text_scale,
+                cv::Scalar(0, 255, 255),
+                text_thickness);
+
+    std::ostringstream soss;
+    soss << std::setiosflags(std::ios::fixed) << std::setprecision(2);
+    soss << score;
+    const std::string score_text = soss.str();
+
+    cv::putText(vis_img,
+                score_text,
+                score_pt,
+                cv::FONT_HERSHEY_PLAIN,
+                text_scale,
+                cv::Scalar(0, 255, 255),
+                text_thickness);
+  }
+  return vis_img;
+}
+
+// Collect the indices of the rows of `dets` whose float value in column 4
+// (read here as the detection score) is strictly greater than `conf_thresh`.
+// Indices are appended to `index` in ascending row order; `index` is not
+// cleared first.
+void FilterDets(const float conf_thresh,
+                const cv::Mat dets,
+                std::vector<int>* index) {
+  int row = 0;
+  while (row < dets.rows) {
+    const float* score_ptr = dets.ptr<float>(row, 4);
+    if (*score_ptr > conf_thresh) {
+      index->push_back(row);
+    }
+    ++row;
+  }
+}
+
+// Run the configured preprocessing on one image and store the outcome in
+// `inputs_`.
+void JDEDetector::Preprocess(const cv::Mat& ori_im) {
+  // The preprocessor receives a deep copy, so the caller's image is kept
+  // intact for postprocessing.
+  cv::Mat working_im;
+  ori_im.copyTo(working_im);
+  preprocessor_.Run(&working_im, &inputs_);
+}
+
+// Turn raw detections and embeddings into tracking results.
+//
+// dets:   one detection per row; columns 0-3 are used as the box and column 4
+//         as the score.
+// emb:    one embedding per row, row-aligned with `dets`.
+// result: cleared, then filled with the tracks that pass the filters below.
+//
+// Rows scoring above `conf_thresh_` are handed to the JDETracker singleton.
+// When the tracker returns no track, the first raw detection (if any) is
+// reported with id 1.
+void JDEDetector::Postprocess(const cv::Mat dets,
+                              const cv::Mat emb,
+                              MOT_Result* result) {
+  result->clear();
+  std::vector<Track> tracks;
+  std::vector<int> valid;
+  FilterDets(conf_thresh_, dets, &valid);
+  cv::Mat new_dets, new_emb;
+  for (const int row : valid) {
+    new_dets.push_back(dets.row(row));
+    new_emb.push_back(emb.row(row));
+  }
+  JDETracker::instance()->update(new_dets, new_emb, tracks);
+  if (tracks.empty()) {
+    // Nothing to fall back to: reading row 0 of an empty matrix would be an
+    // out-of-bounds access.
+    if (dets.rows <= 0) {
+      return;
+    }
+    MOT_Track mot_track;
+    MOT_Rect ret = {*dets.ptr<float>(0, 0),
+                    *dets.ptr<float>(0, 1),
+                    *dets.ptr<float>(0, 2),
+                    *dets.ptr<float>(0, 3)};
+    mot_track.ids = 1;
+    mot_track.score = *dets.ptr<float>(0, 4);
+    mot_track.rects = ret;
+    result->push_back(mot_track);
+    return;
+  }
+
+  for (const Track& track : tracks) {
+    if (track.score < threshold_) {
+      continue;
+    }
+    const float w = track.ltrb[2] - track.ltrb[0];
+    const float h = track.ltrb[3] - track.ltrb[1];
+    // Boxes whose width/height ratio exceeds 1.6 are dropped, as are boxes
+    // whose area does not exceed min_box_area_.
+    const bool vertical = w / h > 1.6;
+    const float area = w * h;
+    if (area > min_box_area_ && !vertical) {
+      MOT_Track mot_track;
+      MOT_Rect ret = {
+          track.ltrb[0], track.ltrb[1], track.ltrb[2], track.ltrb[3]};
+      mot_track.rects = ret;
+      mot_track.score = track.score;
+      mot_track.ids = track.id;
+      result->push_back(mot_track);
+    }
+  }
+}
+
+// Run the whole pipeline (preprocess, inference, postprocess) on a batch.
+//
+// imgs:      input images; every image is preprocessed and appended to one
+//            contiguous input buffer.
+// threshold: not used by this function.
+// warmup:    number of untimed predictor runs executed first.
+// repeats:   number of timed predictor runs.
+// result:    cleared, then filled by Postprocess from the last run's outputs.
+//            Left empty when no run was executed (warmup == 0 and
+//            repeats == 0).
+// times:     must already hold at least three elements; the preprocess,
+//            inference and postprocess durations in milliseconds are ADDED to
+//            elements 0, 1 and 2.
+void JDEDetector::Predict(const std::vector<cv::Mat> imgs,
+                          const double threshold,
+                          const int warmup,
+                          const int repeats,
+                          MOT_Result* result,
+                          std::vector<double>* times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  const int batch_size = imgs.size();
+
+  // in_data_batch
+  std::vector<float> in_data_all;
+  std::vector<float> im_shape_all(batch_size * 2);
+  std::vector<float> scale_factor_all(batch_size * 2);
+
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    const cv::Mat& im = imgs.at(bs_idx);
+    Preprocess(im);
+    im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
+    im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
+
+    scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
+    scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
+
+    // TODO: reduce cost time
+    in_data_all.insert(
+        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
+  }
+
+  // Prepare input tensor
+  auto input_names = predictor_->GetInputNames();
+  for (const auto& tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputHandle(tensor_name);
+    if (tensor_name == "image") {
+      // The network input size is taken from the last preprocessed image.
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Reshape({batch_size, 3, rh, rw});
+      in_tensor->CopyFromCpu(in_data_all.data());
+    } else if (tensor_name == "im_shape") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(im_shape_all.data());
+    } else if (tensor_name == "scale_factor") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(scale_factor_all.data());
+    }
+  }
+
+  auto preprocess_end = std::chrono::steady_clock::now();
+  std::vector<int> bbox_shape;
+  std::vector<int> emb_shape;
+
+  // One predictor run followed by copying both outputs (boxes first,
+  // embeddings second) into bbox_data_ / emb_data_. Shared by the warmup and
+  // the timed loop.
+  auto run_and_fetch = [&]() {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto bbox_tensor = predictor_->GetOutputHandle(output_names[0]);
+    bbox_shape = bbox_tensor->shape();
+    auto emb_tensor = predictor_->GetOutputHandle(output_names[1]);
+    emb_shape = emb_tensor->shape();
+    // Calculate bbox length
+    int bbox_size = 1;
+    for (const int dim : bbox_shape) {
+      bbox_size *= dim;
+    }
+    // Calculate emb length
+    int emb_size = 1;
+    for (const int dim : emb_shape) {
+      emb_size *= dim;
+    }
+
+    bbox_data_.resize(bbox_size);
+    bbox_tensor->CopyToCpu(bbox_data_.data());
+
+    emb_data_.resize(emb_size);
+    emb_tensor->CopyToCpu(emb_data_.data());
+  };
+
+  // Run predictor
+  // warmup
+  for (int i = 0; i < warmup; i++) {
+    run_and_fetch();
+  }
+
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    run_and_fetch();
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Postprocessing result
+  result->clear();
+
+  // The shapes stay empty when no run was executed; indexing them would be
+  // undefined behaviour, so postprocessing is skipped in that case.
+  if (!bbox_shape.empty() && emb_shape.size() >= 2) {
+    // The matrices wrap the member buffers without copying. The box output
+    // is interpreted with 6 values per row.
+    cv::Mat dets(bbox_shape[0], 6, CV_32FC1, bbox_data_.data());
+    cv::Mat emb(bbox_shape[0], emb_shape[1], CV_32FC1, emb_data_.data());
+
+    Postprocess(dets, emb, result);
+  }
+
+  auto postprocess_end = std::chrono::steady_clock::now();
+
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  (*times)[0] += static_cast<double>(preprocess_diff.count() * 1000);
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  (*times)[1] += static_cast<double>(inference_diff.count() * 1000);
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  (*times)[2] += static_cast<double>(postprocess_diff.count() * 1000);
+}
+
+// Derive a deterministic colour from an integer id: the id is tripled and
+// each of the three channels is that value times a different multiplier
+// (37, 17, 29) taken modulo 255.
+cv::Scalar GetColor(int idx) {
+  const int seed = idx * 3;
+  const int channel0 = (37 * seed) % 255;
+  const int channel1 = (17 * seed) % 255;
+  const int channel2 = (29 * seed) % 255;
+  return cv::Scalar(channel0, channel1, channel2);
+}
+
+}  // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/keypoint_detector.cc
+++ b/paddle_detection/deploy/cpp/src/keypoint_detector.cc
@@ -0,0 +1,314 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+#include "include/keypoint_detector.h"
+
+using namespace paddle_infer;
+
+namespace PaddleDetection {
+
+// Load the model and create the model predictor.
+//
+// Builds the Paddle Inference configuration for the exported model stored in
+// `model_dir` and creates `predictor_` from it.
+//
+// model_dir:  directory containing "model.pdmodel" and "model.pdiparams".
+// batch_size: forwarded to EnableTensorRtEngine; only used when the device is
+//             "GPU" and `run_mode` is not "paddle".
+// run_mode:   "paddle", "trt_fp32", "trt_fp16" or "trt_int8". Any other value
+//             is reported on stdout and TensorRT is still enabled with the
+//             default fp32 precision.
+void KeyPointDetector::LoadModel(const std::string& model_dir,
+                                 const int batch_size,
+                                 const std::string& run_mode) {
+  paddle_infer::Config config;
+  const std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel";
+  const std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams";
+  config.SetModel(prog_file, params_file);
+  if (this->device_ == "GPU") {
+    config.EnableUseGpu(200, this->gpu_id_);
+    config.SwitchIrOptim(true);
+    // use tensorrt
+    if (run_mode != "paddle") {
+      auto precision = paddle_infer::Config::Precision::kFloat32;
+      if (run_mode == "trt_fp32") {
+        precision = paddle_infer::Config::Precision::kFloat32;
+      } else if (run_mode == "trt_fp16") {
+        precision = paddle_infer::Config::Precision::kHalf;
+      } else if (run_mode == "trt_int8") {
+        precision = paddle_infer::Config::Precision::kInt8;
+      } else {
+        // Terminate the line so the diagnostic does not run into whatever is
+        // printed next. Loading continues with the fp32 default.
+        printf(
+            "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or "
+            "'trt_int8'\n");
+      }
+      // set tensorrt
+      config.EnableTensorRtEngine(1 << 30,
+                                  batch_size,
+                                  this->min_subgraph_size_,
+                                  precision,
+                                  false,
+                                  this->trt_calib_mode_);
+
+      // set use dynamic shape
+      if (this->use_dynamic_shape_) {
+        // Set the dynamic shape range for the "image" tensor: batch 1,
+        // 3 channels, square spatial size taken from the trt_*_shape_ members.
+        const std::vector<int> min_input_shape = {
+            1, 3, this->trt_min_shape_, this->trt_min_shape_};
+        const std::vector<int> max_input_shape = {
+            1, 3, this->trt_max_shape_, this->trt_max_shape_};
+        const std::vector<int> opt_input_shape = {
+            1, 3, this->trt_opt_shape_, this->trt_opt_shape_};
+        const std::map<std::string, std::vector<int>> map_min_input_shape = {
+            {"image", min_input_shape}};
+        const std::map<std::string, std::vector<int>> map_max_input_shape = {
+            {"image", max_input_shape}};
+        const std::map<std::string, std::vector<int>> map_opt_input_shape = {
+            {"image", opt_input_shape}};
+
+        config.SetTRTDynamicShapeInfo(
+            map_min_input_shape, map_max_input_shape, map_opt_input_shape);
+        std::cout << "TensorRT dynamic shape enabled" << std::endl;
+      }
+    }
+
+  } else if (this->device_ == "XPU") {
+    config.EnableXpu(10 * 1024 * 1024);
+  } else {
+    config.DisableGpu();
+    if (this->use_mkldnn_) {
+      config.EnableMKLDNN();
+      // cache 10 different shapes for mkldnn to avoid memory leak
+      config.SetMkldnnCacheCapacity(10);
+    }
+    config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
+  }
+  config.SwitchUseFeedFetchOps(false);
+  config.SwitchIrOptim(true);
+  config.DisableGlogInfo();
+  // Memory optimization
+  config.EnableMemoryOptim();
+  // CreatePredictor already yields a temporary; no std::move is needed.
+  predictor_ = CreatePredictor(config);
+}
+
+// Visualization of keypoint detection results.
+//
+// Returns a copy of `img` on which, for every entry of `results`, each joint
+// whose first stored value (used as its confidence) exceeds 0.5 is drawn as a
+// dot, and the skeleton edges listed in `edge` are drawn as lines.
+//
+// results:  per-instance keypoints stored as triples; elements 1 and 2 of a
+//           triple are used as the x and y pixel coordinates.
+// colormap: colour of edge i is colormap[i]. Edges without a colormap entry
+//           are not drawn.
+//           NOTE(review): an int element converts to a cv::Scalar with only
+//           its first channel set - confirm this is the intended colour.
+cv::Mat VisualizeKptsResult(const cv::Mat& img,
+                            const std::vector<KeyPointResult>& results,
+                            const std::vector<int>& colormap) {
+  // Pairs of joint indices that are connected by a line.
+  const int edge[][2] = {{0, 1},
+                         {0, 2},
+                         {1, 3},
+                         {2, 4},
+                         {3, 5},
+                         {4, 6},
+                         {5, 7},
+                         {6, 8},
+                         {7, 9},
+                         {8, 10},
+                         {5, 11},
+                         {6, 12},
+                         {11, 13},
+                         {12, 14},
+                         {13, 15},
+                         {14, 16},
+                         {11, 12}};
+  const int num_edges = static_cast<int>(sizeof(edge) / sizeof(edge[0]));
+  cv::Mat vis_img = img.clone();
+  for (size_t batchid = 0; batchid < results.size(); batchid++) {
+    const KeyPointResult& item = results[batchid];
+    const int num_joints = item.num_joints;
+    for (int i = 0; i < num_joints; i++) {
+      if (item.keypoints[i * 3] > 0.5) {
+        int x_coord = int(item.keypoints[i * 3 + 1]);
+        int y_coord = int(item.keypoints[i * 3 + 2]);
+        cv::circle(vis_img,
+                   cv::Point2d(x_coord, y_coord),
+                   1,
+                   cv::Scalar(0, 0, 255),
+                   2);
+      }
+    }
+    // Iterate over the edge table itself rather than over the joint count,
+    // so neither `edge`, `keypoints` nor `colormap` is read out of range
+    // when the model does not produce exactly 17 joints.
+    for (int i = 0; i < num_edges; i++) {
+      if (static_cast<size_t>(i) >= colormap.size()) {
+        break;
+      }
+      const int from = edge[i][0];
+      const int to = edge[i][1];
+      if (from >= num_joints || to >= num_joints) {
+        continue;
+      }
+      int x_start = int(item.keypoints[from * 3 + 1]);
+      int y_start = int(item.keypoints[from * 3 + 2]);
+      int x_end = int(item.keypoints[to * 3 + 1]);
+      int y_end = int(item.keypoints[to * 3 + 2]);
+      cv::line(vis_img,
+               cv::Point2d(x_start, y_start),
+               cv::Point2d(x_end, y_end),
+               colormap[i],
+               1);
+    }
+  }
+  return vis_img;
+}
+
+// Convert one BGR image to RGB and run the configured preprocessing on it,
+// storing the outcome in `inputs_`.
+void KeyPointDetector::Preprocess(const cv::Mat& ori_im) {
+  // The conversion writes into a separate matrix, so the caller's image is
+  // kept intact for postprocessing.
+  cv::Mat rgb_im;
+  cv::cvtColor(ori_im, rgb_im, cv::COLOR_BGR2RGB);
+  preprocessor_.Run(&rgb_im, &inputs_);
+}
+
+// Decode the network outputs into one KeyPointResult per batch entry.
+//
+// output / output_shape: raw output buffer and its shape; output_shape[0] is
+//                        used as the batch count and output_shape[1] as the
+//                        joint count.
+// idxout / idx_shape:    second network output and its shape, forwarded
+//                        unchanged to get_final_preds.
+// result:                decoded entries are APPENDED; existing content is
+//                        kept.
+// center_bs / scale_bs:  per-batch-entry centre and scale, indexed by batch
+//                        id and forwarded to get_final_preds.
+void KeyPointDetector::Postprocess(std::vector<float>& output,
+                                   std::vector<int> output_shape,
+                                   std::vector<int64_t>& idxout,
+                                   std::vector<int> idx_shape,
+                                   std::vector<KeyPointResult>* result,
+                                   std::vector<std::vector<float>>& center_bs,
+                                   std::vector<std::vector<float>>& scale_bs) {
+  // Scratch buffer with three values per joint, reused for every batch entry.
+  std::vector<float> preds(output_shape[1] * 3, 0);
+
+  int batchid = 0;
+  while (batchid < output_shape[0]) {
+    get_final_preds(output,
+                    output_shape,
+                    idxout,
+                    idx_shape,
+                    center_bs[batchid],
+                    scale_bs[batchid],
+                    preds,
+                    batchid,
+                    this->use_dark);
+
+    KeyPointResult decoded;
+    decoded.num_joints = output_shape[1];
+    decoded.keypoints.clear();
+    const int value_count = output_shape[1] * 3;
+    for (int k = 0; k < value_count; ++k) {
+      decoded.keypoints.emplace_back(preds[k]);
+    }
+    result->push_back(decoded);
+    ++batchid;
+  }
+}
+
+// Run the whole pipeline (preprocess, inference, postprocess) on a batch.
+//
+// imgs:      input images; every image is preprocessed and appended to one
+//            contiguous input buffer.
+// center_bs: per-image centre, forwarded to Postprocess.
+// scale_bs:  per-image scale, forwarded to Postprocess.
+// threshold: not used by this function.
+// warmup:    number of untimed predictor runs executed first.
+// repeats:   number of timed predictor runs.
+// result:    decoded keypoints of the last run are appended by Postprocess.
+//            Nothing is appended when no run was executed (warmup == 0 and
+//            repeats == 0).
+// times:     three values in milliseconds are appended: preprocess time,
+//            inference time averaged over `repeats`, postprocess time.
+void KeyPointDetector::Predict(const std::vector<cv::Mat> imgs,
+                               std::vector<std::vector<float>>& center_bs,
+                               std::vector<std::vector<float>>& scale_bs,
+                               const double threshold,
+                               const int warmup,
+                               const int repeats,
+                               std::vector<KeyPointResult>* result,
+                               std::vector<double>* times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  const int batch_size = imgs.size();
+
+  // in_data_batch
+  std::vector<float> in_data_all;
+  std::vector<float> im_shape_all(batch_size * 2);
+  std::vector<float> scale_factor_all(batch_size * 2);
+
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    const cv::Mat& im = imgs.at(bs_idx);
+    Preprocess(im);
+    im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
+    im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
+
+    scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
+    scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
+
+    // TODO: reduce cost time
+    in_data_all.insert(
+        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
+  }
+
+  // Prepare input tensor
+
+  auto input_names = predictor_->GetInputNames();
+  for (const auto& tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputHandle(tensor_name);
+    if (tensor_name == "image") {
+      // The network input size is taken from the last preprocessed image.
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Reshape({batch_size, 3, rh, rw});
+      in_tensor->CopyFromCpu(in_data_all.data());
+    } else if (tensor_name == "im_shape") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(im_shape_all.data());
+    } else if (tensor_name == "scale_factor") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(scale_factor_all.data());
+    }
+  }
+
+  auto preprocess_end = std::chrono::steady_clock::now();
+  std::vector<int> output_shape, idx_shape;
+
+  // One predictor run followed by copying both outputs into output_data_ /
+  // idx_data_. Shared by the warmup and the timed loop; only the timed loop
+  // asks for the small-output warning.
+  auto run_and_fetch = [&](const bool warn_if_small) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto out_tensor = predictor_->GetOutputHandle(output_names[0]);
+    output_shape = out_tensor->shape();
+    // Calculate output length
+    int output_size = 1;
+    for (const int dim : output_shape) {
+      output_size *= dim;
+    }
+    if (warn_if_small && output_size < 6) {
+      std::cerr << "[WARNING] No object detected." << std::endl;
+    }
+    output_data_.resize(output_size);
+    out_tensor->CopyToCpu(output_data_.data());
+
+    auto idx_tensor = predictor_->GetOutputHandle(output_names[1]);
+    idx_shape = idx_tensor->shape();
+    // Calculate index output length
+    int idx_size = 1;
+    for (const int dim : idx_shape) {
+      idx_size *= dim;
+    }
+    idx_data_.resize(idx_size);
+    idx_tensor->CopyToCpu(idx_data_.data());
+  };
+
+  // Run predictor
+  // warmup
+  for (int i = 0; i < warmup; i++) {
+    run_and_fetch(false);
+  }
+
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    run_and_fetch(true);
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Postprocessing result
+  // Postprocess indexes output_shape[0] and output_shape[1]; the shape stays
+  // empty when no run was executed, so skip decoding in that case.
+  if (output_shape.size() >= 2) {
+    Postprocess(output_data_,
+                output_shape,
+                idx_data_,
+                idx_shape,
+                result,
+                center_bs,
+                scale_bs);
+  }
+  auto postprocess_end = std::chrono::steady_clock::now();
+
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  times->push_back(static_cast<double>(preprocess_diff.count() * 1000));
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  // Average over the timed runs; with no timed run, report the raw elapsed
+  // time instead of dividing by zero.
+  const float inference_seconds = repeats > 0
+                                      ? inference_diff.count() / repeats
+                                      : inference_diff.count();
+  times->push_back(static_cast<double>(inference_seconds * 1000));
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  times->push_back(static_cast<double>(postprocess_diff.count() * 1000));
+}
+
+}  // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/keypoint_postprocess.cc
+++ b/paddle_detection/deploy/cpp/src/keypoint_postprocess.cc
@@ -0,0 +1,316 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "include/keypoint_postprocess.h"
+#include <math.h>
+#define PI 3.1415926535
+#define HALF_CIRCLE_DEGREE 180
+
+namespace PaddleDetection {
+
+// Returns the third point used to pin down an affine transform: `a` displaced
+// by the vector (a - b) turned through 90 degrees, i.e.
+// (a.x - (a.y - b.y), a.y + (a.x - b.x)).
+cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
+  const float delta_x = a.x - b.x;
+  const float delta_y = a.y - b.y;
+  return cv::Point2f(a.x - delta_y, a.y + delta_x);
+}
+
+// Rotates the 2-D point (src_point_x, src_point_y) by rot_rad radians about
+// the origin and returns the rotated point as {x, y}.
+std::vector<float> get_dir(float src_point_x,
+                           float src_point_y,
+                           float rot_rad) {
+  const float sn = sin(rot_rad);
+  const float cs = cos(rot_rad);
+  const float rotated_x = src_point_x * cs - src_point_y * sn;
+  const float rotated_y = src_point_x * sn + src_point_y * cs;
+  return std::vector<float>{rotated_x, rotated_y};
+}
+
+// Maps the point (pt_x, pt_y) through the affine matrix `trans` and stores the
+// result in preds[p * 3 + 1] (x) and preds[p * 3 + 2] (y); preds[p * 3] is not
+// written. The product is read back as double, so `trans` is expected to hold
+// CV_64F data.
+void affine_tranform(
+    float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p) {
+  // Homogeneous column vector [x, y, 1]^T sharing the element type of `trans`.
+  double homogeneous[3] = {pt_x, pt_y, 1.0};
+  const cv::Mat point(3, 1, trans.type(), homogeneous);
+  const cv::Mat mapped = trans * point;
+
+  const int base = p * 3;
+  preds[base + 1] = static_cast<float>(mapped.at<double>(0, 0));
+  preds[base + 2] = static_cast<float>(mapped.at<double>(1, 0));
+}
+
+// Builds the affine matrix relating a box in the source image (centred on
+// `center`, scale[0] wide, rotated by `rot` degrees) to a target of
+// output_size[0] x output_size[1]. Three point pairs are formed -- the centre,
+// a point half a width away from it, and a perpendicular third point -- and
+// handed to cv::getAffineTransform. inv == 0 produces source -> target; any
+// other value produces target -> source. Only scale[0] is read.
+void get_affine_transform(std::vector<float>& center,
+                          std::vector<float>& scale,
+                          float rot,
+                          std::vector<int>& output_size,
+                          cv::Mat& trans,
+                          int inv) {
+  const float src_w = scale[0];
+  const float dst_w = static_cast<float>(output_size[0]);
+  const float dst_h = static_cast<float>(output_size[1]);
+  const float rot_rad = rot * PI / HALF_CIRCLE_DEGREE;
+
+  // Direction vectors from the centre to the second reference point.
+  std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
+  std::vector<float> dst_dir{-0.5f * dst_w, 0.0};
+
+  cv::Point2f src_pts[3];
+  src_pts[0] = cv::Point2f(center[0], center[1]);
+  src_pts[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
+  src_pts[2] = get_3rd_point(src_pts[0], src_pts[1]);
+
+  cv::Point2f dst_pts[3];
+  dst_pts[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
+  dst_pts[1] = cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
+  dst_pts[2] = get_3rd_point(dst_pts[0], dst_pts[1]);
+
+  trans = (inv == 0) ? cv::getAffineTransform(src_pts, dst_pts)
+                     : cv::getAffineTransform(dst_pts, src_pts);
+}
+
+// Converts heatmap-space keypoints into image-space keypoints.
+//
+// coords holds (x, y) pairs for dim[1] joints; the mapped positions are
+// written to target_coords[j * 3 + 1] and target_coords[j * 3 + 2], while
+// target_coords[j * 3] is left as it is. With affine == true the inverse
+// affine transform from get_affine_transform is applied; otherwise a plain
+// per-axis scale and offset derived from center/scale/output_size is used.
+void transform_preds(std::vector<float>& coords,
+                     std::vector<float>& center,
+                     std::vector<float>& scale,
+                     std::vector<int>& output_size,
+                     std::vector<int>& dim,
+                     std::vector<float>& target_coords,
+                     bool affine) {
+  if (!affine) {
+    const float heat_w = static_cast<float>(output_size[0]);
+    const float heat_h = static_cast<float>(output_size[1]);
+    const float x_scale = scale[0] / heat_w;
+    const float y_scale = scale[1] / heat_h;
+    // Top-left corner of the box described by center/scale.
+    const float offset_x = center[0] - scale[0] / 2.;
+    const float offset_y = center[1] - scale[1] / 2.;
+    for (int joint = 0; joint < dim[1]; ++joint) {
+      target_coords[joint * 3 + 1] = x_scale * coords[joint * 2] + offset_x;
+      target_coords[joint * 3 + 2] = y_scale * coords[joint * 2 + 1] + offset_y;
+    }
+    return;
+  }
+
+  cv::Mat trans(2, 3, CV_64FC1);
+  get_affine_transform(center, scale, 0, output_size, trans, 1);
+  for (int joint = 0; joint < dim[1]; ++joint) {
+    affine_tranform(
+        coords[joint * 2], coords[joint * 2 + 1], trans, target_coords, joint);
+  }
+}
+
+// Finds the peak of every joint heatmap of one batch item.
+//
+// heatmap   flattened heatmaps; a joint plane holds dim[2] * dim[3] values and
+//           dim[3] is used as the row width.
+// dim       dim[1] = number of joints, dim[2] * dim[3] = plane size.
+// preds     output (x, y) pairs: preds[j * 2] = peak column, preds[j * 2 + 1]
+//           = peak row. An entry is only written when the peak value is > 0;
+//           otherwise it keeps whatever the caller put there.
+// maxvals   output peak value per joint, indexed by joint only -- hence the
+//           "only for batchsize == 1" restriction.
+// batchid   batch item whose planes are scanned.
+// joint_idx not used by this function.
+// only for batchsize == 1
+void get_max_preds(float* heatmap,
+                   std::vector<int>& dim,
+                   std::vector<float>& preds,
+                   float* maxvals,
+                   int batchid,
+                   int joint_idx) {
+  int num_joints = dim[1];
+  int width = dim[3];
+  // The original allocated an unused scratch vector here on every call; it has
+  // been dropped.
+
+  for (int j = 0; j < dim[1]; j++) {
+    float* index = &(
+        heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
+    float* end = index + dim[2] * dim[3];
+    float* max_dis = std::max_element(index, end);
+    auto max_id = std::distance(index, max_dis);
+    maxvals[j] = *max_dis;
+    if (*max_dis > 0) {
+      // Unflatten the peak offset into (column, row).
+      preds[j * 2] = static_cast<float>(max_id % width);
+      preds[j * 2 + 1] = static_cast<float>(max_id / width);
+    }
+  }
+}
+
+void dark_parse(std::vector<float>& heatmap,  // flattened heatmaps of all joints
+                std::vector<int>& dim,  // dim[2] = rows, dim[3] = columns of one joint plane
+                std::vector<float>& coords,  // in/out (x, y) pairs; only joint `ch` is adjusted
+                int px,  // integer column of the peak
+                int py,  // integer row of the peak
+                int index,  // offset of this joint's plane inside `heatmap`
+                int ch) {  // joint whose coords entry receives the offset
+  /*DARK postprocessing, Zhang et al. Distribution-Aware Coordinate
+  Representation for Human Pose Estimation (CVPR 2020).
+  Refines coords[ch * 2] and coords[ch * 2 + 1] by a sub-pixel offset derived
+  from the log of the Gaussian-blurred heatmap around (px, py):
+  1) offset = - hessian.inv() * derivative
+  2) dx = (heatmap[x+1] - heatmap[x-1])/2.
+  3) dxx = (dx[x+1] - dx[x-1])/2.
+  4) derivative = Mat([dx, dy])
+  5) hessian = Mat([[dxx, dxy], [dxy, dyy]])
+  Samples up to 2 pixels away from (px, py) are read without bounds checks, so
+  the caller must keep the peak at least 2 pixels inside the plane border.
+  */
+  std::vector<float>::const_iterator first1 = heatmap.begin() + index;
+  std::vector<float>::const_iterator last1 =
+      heatmap.begin() + index + dim[2] * dim[3];
+  std::vector<float> heatmap_ch(first1, last1);  // private copy of this joint's plane
+  cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]);  // view the copy as dim[2] rows
+  heatmap_mat.convertTo(heatmap_mat, CV_32FC1);
+  cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0);  // 3x3 kernel, both sigma arguments 0
+  heatmap_mat = heatmap_mat.reshape(1, 1);  // flatten back to a single row
+  heatmap_ch = std::vector<float>(heatmap_mat.reshape(1, 1));  // NOTE(review): repeats the reshape from the line above
+
+  float epsilon = 1e-10;  // floor applied before log() so non-positive values do not reach it
+  // sample heatmap to get values in around target location
+  float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon));  // centre
+  float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon));  // column + 1
+  float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon));  // column - 1
+
+  float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon));  // column + 2
+  float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon));  // column - 2
+  float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon));  // row + 1
+  float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon));  // row - 1
+  float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon));  // row + 2
+  float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon));  // row - 2
+  float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon));  // diagonal neighbours
+  float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon));
+  float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon));
+  float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon));
+
+  // compute dx/dy and dxx/dyy with sampled values
+  float dx = 0.5 * (xr - xl);
+  float dy = 0.5 * (yu - yd);
+  float dxx = 0.25 * (xr2 - 2 * xy + xl2);  // second difference over the +/-2 samples
+  float dxy = 0.25 * (xryu - xryd - xlyu + xlyd);
+  float dyy = 0.25 * (yu2 - 2 * xy + yd2);
+
+  // finally get offset by derivative and hessian, which combined by dx/dy and
+  // dxx/dyy
+  if (dxx * dyy - dxy * dxy != 0) {  // skip the update when the determinant is exactly zero
+    float M[2][2] = {dxx, dxy, dxy, dyy};
+    float D[2] = {dx, dy};
+    cv::Mat hassian(2, 2, CV_32F, M);  // wraps M without copying
+    cv::Mat derivative(2, 1, CV_32F, D);  // wraps D without copying
+    cv::Mat offset = -hassian.inv() * derivative;
+    coords[ch * 2] += offset.at<float>(0, 0);
+    coords[ch * 2 + 1] += offset.at<float>(1, 0);
+  }
+}
+
+// Decodes the keypoints of one batch item from its heatmaps.
+//
+// heatmap  flattened heatmaps; joint j of item `batchid` starts at
+//          (batchid * dim[1] + j) * dim[2] * dim[3].
+// dim      dim[1] = joints, dim[2] = heatmap height, dim[3] = heatmap width.
+// idxout   flat peak position inside each joint plane, indexed by
+//          batchid * dim[1] + j.
+// idxdim   not used by this function.
+// center, scale  box description forwarded to transform_preds.
+// preds    output triples per joint: preds[j * 3] = peak score,
+//          preds[j * 3 + 1] / preds[j * 3 + 2] = image-space x / y.
+// DARK     when true and the peak lies at least 2 pixels inside the border the
+//          DARK refinement (dark_parse) is used; otherwise the peak is moved a
+//          quarter pixel towards the larger neighbour.
+void get_final_preds(std::vector<float>& heatmap,
+                     std::vector<int>& dim,
+                     std::vector<int64_t>& idxout,
+                     std::vector<int>& idxdim,
+                     std::vector<float>& center,
+                     std::vector<float> scale,
+                     std::vector<float>& preds,
+                     int batchid,
+                     bool DARK) {
+  std::vector<float> coords;
+  coords.resize(dim[1] * 2);
+  int heatmap_height = dim[2];
+  int heatmap_width = dim[3];
+
+  // sign(diff) * 0.25. The previous expression `diff > 0 ? 1 : -1 * 0.25`
+  // parsed as `diff > 0 ? 1 : (-1 * 0.25)` and therefore shifted by a whole
+  // pixel whenever diff was positive.
+  auto quarter_shift = [](float diff) -> float {
+    if (diff > 0) return 0.25f;
+    if (diff < 0) return -0.25f;
+    return 0.0f;
+  };
+
+  for (int j = 0; j < dim[1]; ++j) {
+    int index = (batchid * dim[1] + j) * dim[2] * dim[3];
+
+    int idx = idxout[batchid * dim[1] + j];
+    preds[j * 3] = heatmap[index + idx];
+    coords[j * 2] = idx % heatmap_width;
+    coords[j * 2 + 1] = idx / heatmap_width;
+
+    int px = int(coords[j * 2] + 0.5);
+    int py = int(coords[j * 2 + 1] + 0.5);
+
+    if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 &&
+        py < heatmap_height - 2) {
+      dark_parse(heatmap, dim, coords, px, py, index, j);
+    } else {
+      // Each axis is only adjusted when both neighbours exist.
+      if (px > 0 && px < heatmap_width - 1) {
+        float diff_x = heatmap[index + py * dim[3] + px + 1] -
+                       heatmap[index + py * dim[3] + px - 1];
+        coords[j * 2] += quarter_shift(diff_x);
+      }
+      if (py > 0 && py < heatmap_height - 1) {
+        float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
+                       heatmap[index + (py - 1) * dim[3] + px];
+        coords[j * 2 + 1] += quarter_shift(diff_y);
+      }
+    }
+  }
+
+  std::vector<int> img_size{heatmap_width, heatmap_height};
+  transform_preds(coords, center, scale, img_size, dim, preds);
+}
+
+// Smooths one frame of keypoints and returns the smoothed copy.
+//
+// On the first call (x_prev_hat.num_joints == -1) the filter state is seeded:
+// x_prev_hat takes the incoming result, dx_prev_hat takes the same shape with
+// every keypoint value zeroed, and the input is returned unchanged. On later
+// calls PointSmooth is applied to each of result->num_joints joints.
+KeyPointResult PoseSmooth::smooth_process(KeyPointResult* result) {
+  KeyPointResult keypoint_smoothed = *result;
+
+  const bool first_frame = (this->x_prev_hat.num_joints == -1);
+  if (first_frame) {
+    this->x_prev_hat = *result;
+    this->dx_prev_hat = *result;
+    std::fill(dx_prev_hat.keypoints.begin(), dx_prev_hat.keypoints.end(), 0.);
+    return keypoint_smoothed;
+  }
+
+  for (int joint = 0; joint < result->num_joints; ++joint) {
+    this->PointSmooth(result, &keypoint_smoothed, this->thresholds, joint);
+  }
+  return keypoint_smoothed;
+}
+
+// Smooths joint `index` of `result` into `keypoint_smoothed`.
+//
+// The displacement from the previous estimate (x_prev_hat) is normalised by
+// width/height. If it is below thresholds[index] * thres_mult the previous
+// position is reused; otherwise the x and y values are passed through
+// OneEuroFilter when filter_type is "OneEuro", or ExpSmoothing for any other
+// filter_type. Only keypoints[index * 3 + 1] and keypoints[index * 3 + 2] of
+// the output are written.
+void PoseSmooth::PointSmooth(KeyPointResult* result,
+                             KeyPointResult* keypoint_smoothed,
+                             std::vector<float> thresholds,
+                             int index) {
+  const int x_loc = index * 3 + 1;
+  const int y_loc = index * 3 + 2;
+
+  const auto norm_dx =
+      (result->keypoints[x_loc] - this->x_prev_hat.keypoints[x_loc]) /
+      this->width;
+  const auto norm_dy =
+      (result->keypoints[y_loc] - this->x_prev_hat.keypoints[y_loc]) /
+      this->height;
+  float distance = sqrt(pow(norm_dx, 2) + pow(norm_dy, 2));
+
+  // Small motion: hold the previous estimate.
+  if (distance < thresholds[index] * this->thres_mult) {
+    keypoint_smoothed->keypoints[x_loc] = this->x_prev_hat.keypoints[x_loc];
+    keypoint_smoothed->keypoints[y_loc] = this->x_prev_hat.keypoints[y_loc];
+    return;
+  }
+
+  if (this->filter_type == "OneEuro") {
+    keypoint_smoothed->keypoints[x_loc] = this->OneEuroFilter(
+        result->keypoints[x_loc], this->x_prev_hat.keypoints[x_loc], x_loc);
+    keypoint_smoothed->keypoints[y_loc] = this->OneEuroFilter(
+        result->keypoints[y_loc], this->x_prev_hat.keypoints[y_loc], y_loc);
+    return;
+  }
+
+  keypoint_smoothed->keypoints[x_loc] = this->ExpSmoothing(
+      result->keypoints[x_loc], this->x_prev_hat.keypoints[x_loc], x_loc);
+  keypoint_smoothed->keypoints[y_loc] = this->ExpSmoothing(
+      result->keypoints[y_loc], this->x_prev_hat.keypoints[y_loc], y_loc);
+}
+
+// One-Euro filter step for a single keypoint value.
+//
+// x_cur  current measurement.
+// x_pre  previous filtered value.
+// loc    position of this value inside the keypoints vectors of the stored
+//        state (x_prev_hat / dx_prev_hat), both of which are updated here.
+// Returns the filtered value. this->alpha is overwritten twice: first with
+// the factor for the derivative cutoff fc_d, then with the factor for the
+// speed-dependent cutoff fc_min + beta * |filtered derivative|.
+float PoseSmooth::OneEuroFilter(float x_cur, float x_pre, int loc) {
+  float te = 1.0;  // sampling period fixed at one frame
+  this->alpha = this->smoothing_factor(te, this->fc_d);
+  float dx_cur = (x_cur - x_pre) / te;
+  float dx_cur_hat =
+      this->ExpSmoothing(dx_cur, this->dx_prev_hat.keypoints[loc]);
+
+  // fabsf instead of unqualified abs: with only <math.h> included, abs may
+  // bind to int abs(int) on some toolchains and truncate the derivative.
+  float fc = this->fc_min + this->beta * fabsf(dx_cur_hat);
+  this->alpha = this->smoothing_factor(te, fc);
+  float x_cur_hat = this->ExpSmoothing(x_cur, x_pre);
+  this->x_prev_hat.keypoints[loc] = x_cur_hat;
+  this->dx_prev_hat.keypoints[loc] = dx_cur_hat;
+  return x_cur_hat;
+}
+
+// Returns r / (r + 1) with r = 2 * PI * fc * te, where te is the sampling
+// period and fc the cutoff frequency.
+float PoseSmooth::smoothing_factor(float te, float fc) {
+  const float r = 2 * PI * fc * te;
+  const float denom = r + 1;
+  return r / denom;
+}
+
+// Blends the current and previous value with the weight stored in
+// this->alpha: alpha * x_cur + (1 - alpha) * x_pre. `loc` is not used and no
+// stored state is modified.
+float PoseSmooth::ExpSmoothing(float x_cur, float x_pre, int loc) {
+  const auto weight_cur = this->alpha;
+  const auto weight_pre = 1 - this->alpha;
+  return weight_cur * x_cur + weight_pre * x_pre;
+}
+}  // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/lapjv.cpp
+++ b/paddle_detection/deploy/cpp/src/lapjv.cpp
@@ -0,0 +1,405 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The code is based on:
+// https://github.com/gatagat/lap/blob/master/lap/lapjv.cpp
+// The copyright of gatagat/lap is as follows:
+// MIT License
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "include/lapjv.h"
+
+namespace PaddleDetection {
+
+/** Column-reduction and reduction transfer for a dense cost matrix.
+ *
+ * Initialisation step on the n x n matrix `cost` (cost[i][j]: row i, column j):
+ *  1. Column reduction: v[j] becomes the minimum of column j and y[j] the
+ *     first row attaining it.
+ *  2. Columns are visited from n-1 down to 0. A column is assigned to its
+ *     minimum row when that row is still unassigned (x[i] = j); otherwise the
+ *     column is left unassigned (y[j] = -1) and the row is flagged non-unique.
+ *  3. Reduction transfer: for each assigned row that is still flagged unique,
+ *     v of its column is lowered by the smallest cost[i][j2] - v[j2] over all
+ *     other columns j2.
+ *
+ * \param free_rows  Output: indices of the rows left unassigned.
+ * \param x          Output: x[i] = column assigned to row i, or -1.
+ * \param y          Output: y[j] = row assigned to column j, or -1.
+ * \param v          Output: per-column values used to reduce the costs.
+ * \return Number of entries written to free_rows. NEW and FREE are macros
+ *         defined outside this file; NEW presumably makes this function
+ *         return -1 when allocation fails -- TODO confirm in include/lapjv.h.
+ */
+int _ccrrt_dense(const int n, float *cost[],
+                     int *free_rows, int *x, int *y, float *v)
+{
+    int n_free_rows;
+    bool *unique;
+
+    for (int i = 0; i < n; i++) {
+        x[i] = -1;
+        v[i] = LARGE;
+        y[i] = 0;
+    }
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            const float c = cost[i][j];
+            if (c < v[j]) {  // strict '<': ties keep the earlier row
+                v[j] = c;
+                y[j] = i;
+            }
+        }
+    }
+    NEW(unique, bool, n);
+    memset(unique, TRUE, n);
+    {
+        int j = n;
+        do {  // NOTE(review): the body runs once even for n == 0, which would read y[-1]
+            j--;
+            const int i = y[j];
+            if (x[i] < 0) {
+                x[i] = j;
+            } else {
+                unique[i] = FALSE;  // row i is the minimum of more than one column
+                y[j] = -1;
+            }
+        } while (j > 0);
+    }
+    n_free_rows = 0;
+    for (int i = 0; i < n; i++) {
+        if (x[i] < 0) {
+            free_rows[n_free_rows++] = i;
+        } else if (unique[i]) {
+            const int j = x[i];
+            float min = LARGE;
+            for (int j2 = 0; j2 < n; j2++) {
+                if (j2 == (int)j) {  // skip the row's own column
+                    continue;
+                }
+                const float c = cost[i][j2] - v[j2];
+                if (c < min) {
+                    min = c;
+                }
+            }
+            v[j] -= min;
+        }
+    }
+    FREE(unique);
+    return n_free_rows;
+}
+
+
+/** Augmenting row reduction for a dense cost matrix.
+ *
+ * Processes the first n_free_rows entries of free_rows. For each free row the
+ * columns with the smallest (j1, value v1) and second smallest (j2, value v2)
+ * reduced cost cost[free_i][j] - v[j] are found, and the row is assigned to
+ * j1 or j2. A row displaced by that assignment is either re-processed at once
+ * or stored at the front of free_rows for the next pass.
+ *
+ * \param n_free_rows  Number of valid entries in free_rows on entry.
+ * \param free_rows    In/out: overwritten in place with the rows that are
+ *                     still free afterwards.
+ * \param x, y         In/out: row-to-column and column-to-row assignments.
+ * \param v            In/out: per-column values; v[j1] may be lowered.
+ * \return Number of rows still free (valid entries now in free_rows).
+ */
+int _carr_dense(
+    const int n, float *cost[],
+    const int n_free_rows,
+    int *free_rows, int *x, int *y, float *v)
+{
+    int current = 0;
+    int new_free_rows = 0;
+    int rr_cnt = 0;  // number of rows processed so far, re-processed rows included
+    while (current < n_free_rows) {
+        int i0;
+        int j1, j2;
+        float v1, v2, v1_new;
+        bool v1_lowers;
+
+        rr_cnt++;
+        const int free_i = free_rows[current++];
+        j1 = 0;
+        v1 = cost[free_i][0] - v[0];
+        j2 = -1;  // stays -1 when n == 1
+        v2 = LARGE;
+        for (int j = 1; j < n; j++) {
+            const float c = cost[free_i][j] - v[j];
+            if (c < v2) {
+                if (c >= v1) {
+                    v2 = c;
+                    j2 = j;
+                } else {  // new minimum: the old minimum becomes the runner-up
+                    v2 = v1;
+                    v1 = c;
+                    j2 = j1;
+                    j1 = j;
+                }
+            }
+        }
+        i0 = y[j1];  // row currently holding column j1, or -1
+        v1_new = v[j1] - (v2 - v1);
+        v1_lowers = v1_new < v[j1];
+        if (rr_cnt < current * n) {  // NOTE(review): appears to bound the re-processing below -- confirm
+            if (v1_lowers) {
+                v[j1] = v1_new;
+            } else if (i0 >= 0 && j2 >= 0) {  // j1 is taken and cannot be lowered: use the runner-up
+                j1 = j2;
+                i0 = y[j2];
+            }
+            if (i0 >= 0) {
+                if (v1_lowers) {
+                    free_rows[--current] = i0;  // displaced row is processed next
+                } else {
+                    free_rows[new_free_rows++] = i0;  // displaced row is left for the next pass
+                }
+            }
+        } else {
+            if (i0 >= 0) {
+                free_rows[new_free_rows++] = i0;
+            }
+        }
+        x[free_i] = j1;
+        y[j1] = free_i;
+    }
+    return new_free_rows;
+}
+
+
+/** Find columns with minimum d[j] and put them on the SCAN list.
+ *
+ * cols[lo..n) is searched; on return the columns whose d value equals the
+ * minimum found occupy cols[lo..hi), where hi is the returned index. The
+ * parameter y is not used.
+ */
+int _find_dense(const int n, int lo, float *d, int *cols, int *y)
+{
+    int scan_end = lo + 1;
+    float smallest = d[cols[lo]];
+    for (int k = lo + 1; k < n; ++k) {
+        const int col = cols[k];
+        if (!(d[col] <= smallest)) {
+            continue;
+        }
+        if (d[col] < smallest) {
+            // Strictly better: restart the list of minimum columns at lo.
+            scan_end = lo;
+            smallest = d[col];
+        }
+        // Swap this column into the block of minimum columns.
+        cols[k] = cols[scan_end];
+        cols[scan_end] = col;
+        ++scan_end;
+    }
+    return scan_end;
+}
+
+
+// Scan all columns in TODO (cols[hi..n)) starting from arbitrary column in SCAN (cols[lo..hi))
+// and try to decrease d of the TODO columns using the SCAN column. Returns an unassigned column reached at the current minimum, or -1 once SCAN is exhausted.
+int _scan_dense(const int n, float *cost[],
+                    int *plo, int*phi,
+                    float *d, int *cols, int *pred,
+                    int *y, float *v)
+{
+    int lo = *plo;
+    int hi = *phi;
+    float h, cred_ij;
+
+    while (lo != hi) {
+        int j = cols[lo++];
+        const int i = y[j];  // row currently assigned to the SCAN column
+        const float mind = d[j];
+        h = cost[i][j] - v[j] - mind;
+        // For all columns in TODO
+        for (int k = hi; k < n; k++) {
+            j = cols[k];
+            cred_ij = cost[i][j] - v[j] - h;
+            if (cred_ij < d[j]) {
+                d[j] = cred_ij;
+                pred[j] = i;  // remember the row through which column j was reached
+                if (cred_ij == mind) {
+                    if (y[j] < 0) {
+                        return j;  // early exit: *plo and *phi are not written back on this path
+                    }
+                    cols[k] = cols[hi];  // move column j from TODO into SCAN
+                    cols[hi++] = j;
+                }
+            }
+        }
+    }
+    *plo = lo;
+    *phi = hi;
+    return -1;
+}
+
+
+/** Single iteration of modified Dijkstra shortest path algorithm as explained in the JV paper.
+ *
+ * This is a dense matrix version.
+ *
+ * Starting from row start_i, d[j] is initialised to cost[start_i][j] - v[j]
+ * and pred[j] to start_i for every column. _find_dense and _scan_dense are
+ * then alternated until an unassigned column (y[j] < 0) is reached. Before
+ * returning, v is updated for the columns cols[0..n_ready).
+ *
+ * \param y     Column-to-row assignment (read only here).
+ * \param v     In/out: per-column values.
+ * \param pred  Output: pred[j] = row through which column j was reached.
+ * \return The closest free column index. NEW is a macro defined outside this
+ *         file; it presumably makes this function return -1 when allocation
+ *         fails -- TODO confirm in include/lapjv.h.
+ */
+int find_path_dense(
+    const int n, float *cost[],
+    const int start_i,
+    int *y, float *v,
+    int *pred)
+{
+    int lo = 0, hi = 0;
+    int final_j = -1;
+    int n_ready = 0;
+    int *cols;
+    float *d;
+
+    NEW(cols, int, n);
+    NEW(d, float, n);  // NOTE(review): if NEW returns early here, cols is not freed -- confirm macro
+
+    for (int i = 0; i < n; i++) {
+        cols[i] = i;
+        pred[i] = start_i;
+        d[i] = cost[start_i][i] - v[i];
+    }
+    while (final_j == -1) {
+        // No columns left on the SCAN list.
+        if (lo == hi) {
+            n_ready = lo;  // cols[0..lo) have already been scanned
+            hi = _find_dense(n, lo, d, cols, y);
+            for (int k = lo; k < hi; k++) {
+                const int j = cols[k];
+                if (y[j] < 0) {
+                    final_j = j;  // no break: the last unassigned column in the range wins
+                }
+            }
+        }
+        if (final_j == -1) {
+            final_j = _scan_dense(
+                    n, cost, &lo, &hi, d, cols, pred, y, v);
+        }
+    }
+
+    {
+        const float mind = d[cols[lo]];
+        for (int k = 0; k < n_ready; k++) {
+            const int j = cols[k];
+            v[j] += d[j] - mind;
+        }
+    }
+
+    FREE(cols);
+    FREE(d);
+
+    return final_j;
+}
+
+
+/** Augment for a dense cost matrix.
+ *
+ * For each of the first n_free_rows entries of free_rows, find_path_dense is
+ * used to reach an unassigned column, and the assignments x/y are then flipped
+ * backwards along the pred chain until the free row itself is assigned.
+ *
+ * \param free_rows  Rows that are still unassigned.
+ * \param x, y       In/out: row-to-column and column-to-row assignments.
+ * \param v          In/out: per-column values, updated by find_path_dense.
+ * \return 0 on success, -1 when find_path_dense reports a failure by
+ *         returning a negative column. NEW is a macro defined outside this
+ *         file; it presumably also returns -1 on allocation failure -- TODO
+ *         confirm in include/lapjv.h.
+ */
+int _ca_dense(
+    const int n, float *cost[],
+    const int n_free_rows,
+    int *free_rows, int *x, int *y, float *v)
+{
+    int *pred;
+
+    NEW(pred, int, n);
+
+    for (int *pfree_i = free_rows; pfree_i < free_rows + n_free_rows; pfree_i++) {
+        int i = -1, j;
+
+        j = find_path_dense(n, cost, *pfree_i, y, v, pred);
+        if (j < 0) {
+            // A valid column is never negative; indexing pred with it would
+            // read out of bounds, so release the buffer and report failure.
+            FREE(pred);
+            return -1;
+        }
+        while (i != *pfree_i) {
+            i = pred[j];
+            y[j] = i;
+            // j takes the column row i held before; row i now holds the old j.
+            SWAP_INDICES(j, x[i]);
+        }
+    }
+    FREE(pred);
+    return 0;
+}
+
+
+/** Solve a dense linear assignment problem.
+ *
+ * The cost matrix is read with cost.at<float>(), so it is expected to be
+ * CV_32F. When extend_cost is true or cost_limit < LARGE the problem is padded
+ * to a square of size n_rows + n_cols: the original block keeps its costs,
+ * the opposite block is 0 and the two remaining blocks hold cost_limit / 2
+ * (or max(cost) + 1 when cost_limit is not below LARGE). Matches that fall
+ * into the padding are reported as -1.
+ *
+ * \param cost         n_rows x n_cols cost matrix.
+ * \param extend_cost  Accept a non-square matrix by padding it.
+ * \param cost_limit   Padding cost control, see above.
+ * \param x            Output, n_rows entries: column matched to row i, or -1.
+ * \param y            Output, n_cols entries: row matched to column j, or -1.
+ * \return 0 on success. NEW is a macro defined outside this file; it
+ *         presumably returns -1 from here when an allocation fails, without
+ *         releasing earlier buffers -- TODO confirm in include/lapjv.h.
+ * \throws std::invalid_argument if cost is not square and extend_cost is false.
+ * \throws const char* if the solver stages report a failure.
+ */
+int lapjv_internal(
+    const cv::Mat &cost, const bool extend_cost, const float cost_limit,
+    int *x, int *y ) {
+    int n_rows = cost.rows;
+    int n_cols = cost.cols;
+    if (n_rows != n_cols && !extend_cost) {
+      throw std::invalid_argument("Square cost array expected. If cost is intentionally non-square, pass extend_cost=True.");
+    }
+
+    // Get extend cost
+    int n = n_rows;
+    if (extend_cost || cost_limit < LARGE) {
+      n = n_rows + n_cols;
+    }
+    if (n == 0) {
+      // Nothing to assign; the solver stages assume at least one row.
+      return 0;
+    }
+
+    cv::Mat cost_expand(n, n, CV_32F);
+    float expand_value;
+    if (cost_limit < LARGE) {
+      expand_value = cost_limit / 2;
+    } else {
+      double max_v;
+      minMaxLoc(cost, nullptr, &max_v);
+      expand_value = (float)max_v + 1;
+    }
+
+    for (int i = 0; i < n; ++i) {
+      for (int j = 0; j < n; ++j) {
+        float value = expand_value;
+        if (i >= n_rows && j >= n_cols) {
+          value = 0;
+        } else if (i < n_rows && j < n_cols) {
+          value = cost.at<float>(i, j);
+        }
+        cost_expand.at<float>(i, j) = value;
+      }
+    }
+
+    // Row-pointer view of cost_expand. The rows stay owned by the Mat, which
+    // outlives every use of cost_ptr, so no per-row copy is needed.
+    float **cost_ptr;
+    NEW(cost_ptr, float *, n);
+    for (int i = 0; i < n; ++i) {
+      cost_ptr[i] = cost_expand.ptr<float>(i);
+    }
+
+    int ret;
+    int *free_rows;
+    float *v;
+    int *x_c;
+    int *y_c;
+
+    NEW(free_rows, int, n);
+    NEW(v, float, n);
+    NEW(x_c, int, n);
+    NEW(y_c, int, n);
+
+    // Initialisation, at most two augmenting row reductions, then full
+    // augmentation for whatever rows are still free.
+    ret = _ccrrt_dense(n, cost_ptr, free_rows, x_c, y_c, v);
+    int i = 0;
+    while (ret > 0 && i < 2) {
+      ret = _carr_dense(n, cost_ptr, ret, free_rows, x_c, y_c, v);
+      i++;
+    }
+    if (ret > 0) {
+      ret = _ca_dense(n, cost_ptr, ret, free_rows, x_c, y_c, v);
+    }
+    FREE(v);
+    FREE(free_rows);
+    FREE(cost_ptr);
+    if (ret != 0) {
+      // Release the result buffers before throwing; they used to leak here.
+      FREE(x_c);
+      FREE(y_c);
+      if (ret == -1){
+        throw "Out of memory.";
+      }
+      throw "Unknown error (lapjv_internal)";
+    }
+    // Get output of x, y, opt
+    for (int i = 0; i < n; ++i) {
+      if (i < n_rows) {
+        x[i] = x_c[i];
+        if (x[i] >= n_cols) {
+          x[i] = -1;  // matched to a padding column
+        }
+      }
+      if (i < n_cols) {
+        y[i] = y_c[i];
+        if (y[i] >= n_rows) {
+          y[i] = -1;  // matched to a padding row
+        }
+      }
+    }
+
+    FREE(x_c);
+    FREE(y_c);
+    return ret;
+}
+
+} // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/main.cc
+++ b/paddle_detection/deploy/cpp/src/main.cc
@@ -0,0 +1,428 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+
+#include <math.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
+#elif LINUX
+#include <stdarg.h>
+#include <sys/stat.h>
+#endif
+
+#include <gflags/gflags.h>
+#include "include/object_detector.h"
+
+DEFINE_string(model_dir, "", "Path of inference model");  // command-line options; each is read in this file as FLAGS_<name>
+DEFINE_string(image_file, "", "Path of input image");
+DEFINE_string(image_dir,
+              "",
+              "Dir of input image, `image_file` has a higher priority.");
+DEFINE_int32(batch_size, 1, "batch_size");
+DEFINE_string(
+    video_file,
+    "",
+    "Path of input video, `video_file` or `camera_id` has a highest priority.");
+DEFINE_int32(camera_id, -1, "Device id of camera to predict");  // -1 selects the video file instead of a camera (see PredictVideo)
+DEFINE_bool(
+    use_gpu,
+    false,
+    "Deprecated, please use `--device` to set the device you want to run.");
+DEFINE_string(device,
+              "CPU",
+              "Choose the device you want to run, it can be: CPU/GPU/XPU, "
+              "default is CPU.");
+DEFINE_double(threshold, 0.5, "Threshold of score.");  // detections scoring below this are dropped before output
+DEFINE_string(output_dir, "output", "Directory of output visualization files.");
+DEFINE_string(run_mode,
+              "paddle",
+              "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)");  // PrintBenchmarkLog reports the text after the first 4 characters as the precision
+DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
+DEFINE_bool(run_benchmark,
+            false,
+            "Whether to predict a image_file repeatedly for benchmark");
+DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU");
+DEFINE_int32(cpu_threads, 1, "Num of threads with CPU");
+DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI");
+DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI");
+DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI");
+DEFINE_bool(trt_calib_mode,
+            false,
+            "If the model is produced by TRT offline quantitative calibration, "
+            "trt_calib_mode need to set True");
+
+// Logs the benchmark report: runtime configuration, data info, model name and
+// timing.
+//
+// det_time  accumulated times in milliseconds; elements 0, 1 and 2 are
+//           reported as preprocess, inference and postprocess time and are
+//           each divided by img_num.
+// img_num   number of predicted images.
+// Side effect: trailing '/' characters are erased from FLAGS_model_dir.
+void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
+  LOG(INFO) << "----------------------- Config info -----------------------";
+  LOG(INFO) << "runtime_device: " << FLAGS_device;
+  LOG(INFO) << "ir_optim: "
+            << "True";
+  LOG(INFO) << "enable_memory_optim: "
+            << "True";
+  const bool has_trt = FLAGS_run_mode.find("trt") != std::string::npos;
+  if (has_trt) {
+    LOG(INFO) << "enable_tensorrt: "
+              << "True";
+    // The precision is whatever follows the first 4 characters ("trt_").
+    // substr throws std::out_of_range when the start position is past the end
+    // (e.g. run_mode == "trt"), so shorter values yield an empty precision.
+    std::string precision;
+    if (FLAGS_run_mode.size() > 4) {
+      precision = FLAGS_run_mode.substr(4, 8);
+    }
+    LOG(INFO) << "precision: " << precision;
+  } else {
+    LOG(INFO) << "enable_tensorrt: "
+              << "False";
+    LOG(INFO) << "precision: "
+              << "fp32";
+  }
+  LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? "True" : "False");
+  LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads;
+  LOG(INFO) << "----------------------- Data info -----------------------";
+  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
+  LOG(INFO) << "input_shape: "
+            << "dynamic shape";
+  LOG(INFO) << "----------------------- Model info -----------------------";
+  // Strip trailing slashes so the last path component is the model name.
+  FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1);
+  LOG(INFO) << "model_name: "
+            << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1);
+  LOG(INFO) << "----------------------- Perf info ------------------------";
+  // The initial value must be a double: with the int literal 0 the running
+  // sum is truncated to int after every addition.
+  LOG(INFO) << "Total number of predicted data: " << img_num
+            << " and total time spent(ms): "
+            << std::accumulate(det_time.begin(), det_time.end(), 0.0);
+  LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num
+            << ", inference_time(ms): " << det_time[1] / img_num
+            << ", postprocess_time(ms): " << det_time[2] / img_num;
+}
+
+// Returns everything before the last OS_PATH_SEP in filepath, or an empty
+// string when filepath contains no separator.
+static std::string DirName(const std::string& filepath) {
+  const auto sep_pos = filepath.rfind(OS_PATH_SEP);
+  const bool has_separator = (sep_pos != std::string::npos);
+  return has_separator ? filepath.substr(0, sep_pos) : std::string("");
+}
+
+// Returns true when stat() (or _stat() on Windows) succeeds for path.
+static bool PathExists(const std::string& path) {
+#ifdef _WIN32
+  struct _stat info;
+  const int rc = _stat(path.c_str(), &info);
+#else
+  struct stat info;
+  const int rc = stat(path.c_str(), &info);
+#endif  // !_WIN32
+  return rc == 0;
+}
+
+// Creates the single directory `path` (mode 0755 outside Windows) unless it
+// already exists. Throws std::runtime_error("<path> mkdir failed!") when the
+// directory cannot be created.
+static void MkDir(const std::string& path) {
+  if (PathExists(path)) {
+    return;
+  }
+#ifdef _WIN32
+  const int ret = _mkdir(path.c_str());
+#else
+  const int ret = mkdir(path.c_str(), 0755);
+#endif  // !_WIN32
+  if (ret == 0) {
+    return;
+  }
+  throw std::runtime_error(path + " mkdir failed!");
+}
+
+// Creates `path` together with any missing parent directories. Recurses on
+// DirName(path) first so parents exist before MkDir(path) runs; an empty or
+// already existing path ends the recursion.
+static void MkDirs(const std::string& path) {
+  const bool nothing_to_do = path.empty() || PathExists(path);
+  if (nothing_to_do) {
+    return;
+  }
+  const std::string parent = DirName(path);
+  MkDirs(parent);
+  MkDir(path);
+}
+
+void PredictVideo(const std::string& video_path,  // input video file; not opened when FLAGS_camera_id != -1
+                  PaddleDetection::ObjectDetector* det,  // detector run on every frame
+                  const std::string& output_dir = "output") {  // directory that receives the annotated video
+  // Open video: the camera is used when FLAGS_camera_id != -1, otherwise the file at video_path
+  cv::VideoCapture capture;
+  std::string video_out_name = "output.mp4";  // output name used for camera input
+  if (FLAGS_camera_id != -1) {
+    capture.open(FLAGS_camera_id);
+  } else {
+    capture.open(video_path.c_str());
+    video_out_name =
+        video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1);  // reuse the input file name for the output
+  }
+  if (!capture.isOpened()) {
+    printf("can not open video : %s\n", video_path.c_str());
+    return;
+  }
+
+  // Get Video info : resolution, fps, frame count
+  int video_width = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_WIDTH));
+  int video_height = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_HEIGHT));
+  int video_fps = static_cast<int>(capture.get(CV_CAP_PROP_FPS));  // fractional frame rates are truncated
+  int video_frame_count =
+      static_cast<int>(capture.get(CV_CAP_PROP_FRAME_COUNT));
+  printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count);
+
+  // Create VideoWriter for output
+  cv::VideoWriter video_out;
+  std::string video_out_path(output_dir);
+  if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {  // append a separator unless output_dir already ends with one
+    video_out_path += OS_PATH_SEP;
+  }
+  video_out_path += video_out_name;
+  video_out.open(video_out_path.c_str(),
+                 0x00000021,  // codec code passed as a raw constant -- NOTE(review): confirm which codec this selects
+                 video_fps,
+                 cv::Size(video_width, video_height),
+                 true);
+  if (!video_out.isOpened()) {
+    printf("create video writer failed!\n");
+    return;
+  }
+
+  std::vector<PaddleDetection::ObjectResult> result;
+  std::vector<int> bbox_num;
+  std::vector<double> det_times;
+  auto labels = det->GetLabelList();
+  auto colormap = PaddleDetection::GenerateColorMap(labels.size());
+  // Capture all frames and do inference
+  cv::Mat frame;
+  int frame_id = 1;
+  bool is_rbox = false;  // set once a result has more than 6 rect values and never reset afterwards
+  while (capture.read(frame)) {
+    if (frame.empty()) {
+      break;
+    }
+    std::vector<cv::Mat> imgs;
+    imgs.push_back(frame);
+    printf("detect frame: %d\n", frame_id);
+    det->Predict(imgs, FLAGS_threshold, 0, 1, &result, &bbox_num, &det_times);
+    std::vector<PaddleDetection::ObjectResult> out_result;
+    for (const auto& item : result) {
+      if (item.confidence < FLAGS_threshold || item.class_id == -1) {  // drop low-score results and those with class_id -1
+        continue;
+      }
+      out_result.push_back(item);
+      if (item.rect.size() > 6) {  // more than 6 values: printed as 8 coordinates
+        is_rbox = true;
+        printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
+               item.class_id,
+               item.confidence,
+               item.rect[0],
+               item.rect[1],
+               item.rect[2],
+               item.rect[3],
+               item.rect[4],
+               item.rect[5],
+               item.rect[6],
+               item.rect[7]);
+      } else {
+        printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
+               item.class_id,
+               item.confidence,
+               item.rect[0],
+               item.rect[1],
+               item.rect[2],
+               item.rect[3]);
+      }
+    }
+
+    cv::Mat out_im = PaddleDetection::VisualizeResult(
+        frame, out_result, labels, colormap, is_rbox);
+
+    video_out.write(out_im);
+    frame_id += 1;
+  }
+  capture.release();
+  video_out.release();
+}
+
+// Runs object detection on a list of image files, batch by batch.
+//
+// all_img_paths: paths of the images to read with cv::imread.
+// batch_size:    number of images handed to each det->Predict call.
+// threshold:     detections whose confidence is below this are discarded.
+// run_benchmark: when true, Predict is called with (10, 10) instead of (0, 1)
+//                (presumably warmup / repeat counts -- confirm against
+//                ObjectDetector::Predict) and nothing is visualized.
+// det:           detector used for inference and for the label list.
+// output_dir:    directory receiving one visualized JPEG per input image.
+//
+// The three timing values reported by Predict are summed over all batches and
+// handed to PrintBenchmarkLog at the end.
+void PredictImage(const std::vector<std::string> all_img_paths,
+                  const int batch_size,
+                  const double threshold,
+                  const bool run_benchmark,
+                  PaddleDetection::ObjectDetector* det,
+                  const std::string& output_dir = "output") {
+  std::vector<double> det_t = {0, 0, 0};
+  // Convert once: passing a size_t to a "%d" conversion is undefined behavior.
+  const int total_images = static_cast<int>(all_img_paths.size());
+  int steps = ceil(static_cast<float>(all_img_paths.size()) / batch_size);
+  printf("total images = %d, batch_size = %d, total steps = %d\n",
+         total_images,
+         batch_size,
+         steps);
+  // Characters treated as directory separators when extracting a file name:
+  // '/' (as before) plus the platform separator, so that paths using the
+  // native separator are split correctly as well.
+  std::string path_separators = "/";
+  path_separators += OS_PATH_SEP;
+  for (int idx = 0; idx < steps; idx++) {
+    std::vector<cv::Mat> batch_imgs;
+    // The last batch may hold fewer than batch_size images.
+    int left_image_cnt = total_images - idx * batch_size;
+    if (left_image_cnt > batch_size) {
+      left_image_cnt = batch_size;
+    }
+    for (int bs = 0; bs < left_image_cnt; bs++) {
+      const std::string& image_file_path =
+          all_img_paths.at(idx * batch_size + bs);
+      // NOTE(review): an unreadable file yields an empty cv::Mat, which is
+      // forwarded to Predict unchecked.
+      cv::Mat im = cv::imread(image_file_path, 1);
+      batch_imgs.push_back(im);
+    }
+
+    // Store all detected result
+    std::vector<PaddleDetection::ObjectResult> result;
+    std::vector<int> bbox_num;
+    std::vector<double> det_times;
+    bool is_rbox = false;
+    if (run_benchmark) {
+      det->Predict(
+          batch_imgs, threshold, 10, 10, &result, &bbox_num, &det_times);
+    } else {
+      det->Predict(batch_imgs, threshold, 0, 1, &result, &bbox_num, &det_times);
+      // get labels and colormap
+      auto labels = det->GetLabelList();
+      auto colormap = PaddleDetection::GenerateColorMap(labels.size());
+
+      // `result` holds the boxes of all images back to back; bbox_num[i] is
+      // the number of boxes that belong to image i.
+      int item_start_idx = 0;
+      for (int i = 0; i < left_image_cnt; i++) {
+        cv::Mat im = batch_imgs[i];
+        std::vector<PaddleDetection::ObjectResult> im_result;
+        int detect_num = 0;
+
+        for (int j = 0; j < bbox_num[i]; j++) {
+          const PaddleDetection::ObjectResult& item =
+              result[item_start_idx + j];
+          if (item.confidence < threshold || item.class_id == -1) {
+            continue;
+          }
+          detect_num += 1;
+          im_result.push_back(item);
+          // More than 6 coordinates: printed as 8 values and flagged as a
+          // rotated box for the visualizer.
+          if (item.rect.size() > 6) {
+            is_rbox = true;
+            printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
+                   item.class_id,
+                   item.confidence,
+                   item.rect[0],
+                   item.rect[1],
+                   item.rect[2],
+                   item.rect[3],
+                   item.rect[4],
+                   item.rect[5],
+                   item.rect[6],
+                   item.rect[7]);
+          } else {
+            printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
+                   item.class_id,
+                   item.confidence,
+                   item.rect[0],
+                   item.rect[1],
+                   item.rect[2],
+                   item.rect[3]);
+          }
+        }
+        const std::string& image_file_path =
+            all_img_paths.at(idx * batch_size + i);
+        std::cout << image_file_path
+                  << " The number of detected box: " << detect_num << std::endl;
+        item_start_idx = item_start_idx + bbox_num[i];
+        // Visualization result
+        cv::Mat vis_img = PaddleDetection::VisualizeResult(
+            im, im_result, labels, colormap, is_rbox);
+        std::vector<int> compression_params;
+        compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
+        compression_params.push_back(95);
+        // Output file = output_dir + separator (if missing) + input file name.
+        std::string output_path(output_dir);
+        if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
+          output_path += OS_PATH_SEP;
+        }
+        output_path += image_file_path.substr(
+            image_file_path.find_last_of(path_separators) + 1);
+        cv::imwrite(output_path, vis_img, compression_params);
+        printf("Visualized output saved as %s\n", output_path.c_str());
+      }
+    }
+    // Accumulate only when Predict reported all three timing slots.
+    if (det_times.size() >= 3) {
+      det_t[0] += det_times[0];
+      det_t[1] += det_times[1];
+      det_t[2] += det_times[2];
+    }
+  }
+  PrintBenchmarkLog(det_t, total_images);
+}
+
+// Program entry point: parses the command-line flags, validates them, builds
+// the object detector and runs inference on a video/camera stream or on
+// images. Returns 0 on success and -1 when the flags are invalid.
+int main(int argc, char** argv) {
+  // Parsing command-line
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  // A camera id alone is a valid input source (the video branch below accepts
+  // it), so the call is rejected only when no input of any kind was given.
+  const bool has_input = !FLAGS_image_file.empty() ||
+                         !FLAGS_image_dir.empty() ||
+                         !FLAGS_video_file.empty() || FLAGS_camera_id != -1;
+  if (FLAGS_model_dir.empty() || !has_input) {
+    std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ "
+              << "--image_file=/PATH/TO/INPUT/IMAGE/" << std::endl;
+    return -1;
+  }
+  if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" ||
+        FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) {
+    std::cout
+        << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."
+        << std::endl;
+    return -1;
+  }
+  // Device names are compared in upper case, so normalize the flag first.
+  std::transform(FLAGS_device.begin(),
+                 FLAGS_device.end(),
+                 FLAGS_device.begin(),
+                 ::toupper);
+  if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" ||
+        FLAGS_device == "XPU")) {
+    std::cout << "device should be 'CPU', 'GPU' or 'XPU'." << std::endl;
+    return -1;
+  }
+  if (FLAGS_use_gpu) {
+    std::cout << "Deprecated, please use `--device` to set the device you want "
+                 "to run."
+              << std::endl;
+    return -1;
+  }
+  // Load model and create a object detector
+  PaddleDetection::ObjectDetector det(FLAGS_model_dir,
+                                      FLAGS_device,
+                                      FLAGS_use_mkldnn,
+                                      FLAGS_cpu_threads,
+                                      FLAGS_run_mode,
+                                      FLAGS_batch_size,
+                                      FLAGS_gpu_id,
+                                      FLAGS_trt_min_shape,
+                                      FLAGS_trt_max_shape,
+                                      FLAGS_trt_opt_shape,
+                                      FLAGS_trt_calib_mode);
+  // Do inference on input video or image
+  if (!PathExists(FLAGS_output_dir)) {
+    MkDirs(FLAGS_output_dir);
+  }
+  if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) {
+    PredictVideo(FLAGS_video_file, &det, FLAGS_output_dir);
+  } else if (!FLAGS_image_file.empty() || !FLAGS_image_dir.empty()) {
+    std::vector<std::string> all_img_paths;
+    std::vector<cv::String> cv_all_img_paths;
+    if (!FLAGS_image_file.empty()) {
+      // A single image file takes precedence over an image directory.
+      all_img_paths.push_back(FLAGS_image_file);
+      if (FLAGS_batch_size > 1) {
+        std::cout << "batch_size should be 1, when set `image_file`."
+                  << std::endl;
+        return -1;
+      }
+    } else {
+      cv::glob(FLAGS_image_dir, cv_all_img_paths);
+      for (const auto& img_path : cv_all_img_paths) {
+        all_img_paths.push_back(img_path);
+      }
+    }
+    PredictImage(all_img_paths,
+                 FLAGS_batch_size,
+                 FLAGS_threshold,
+                 FLAGS_run_benchmark,
+                 &det,
+                 FLAGS_output_dir);
+  }
+  return 0;
+}
--- a/paddle_detection/deploy/cpp/src/main_jde.cc
+++ b/paddle_detection/deploy/cpp/src/main_jde.cc
@@ -0,0 +1,269 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+
+#include <math.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
+#elif LINUX
+#include <stdarg.h>
+#include <sys/stat.h>
+#endif
+
+#include <gflags/gflags.h>
+#include <opencv2/opencv.hpp>
+#include "include/jde_detector.h"
+#include "include/object_detector.h"
+
+// Command-line flags of the JDE tracking demo (gflags). Each DEFINE_* below
+// creates a FLAGS_<name> variable that the rest of this file reads; the
+// arguments are (name, default value, help text).
+DEFINE_string(model_dir, "", "Path of inference model");
+DEFINE_int32(batch_size, 1, "batch_size");
+DEFINE_string(
+    video_file,
+    "",
+    "Path of input video, `video_file` or `camera_id` has a highest priority.");
+DEFINE_int32(camera_id, -1, "Device id of camera to predict");
+// Kept only so that main() can reject it with a deprecation message.
+DEFINE_bool(
+    use_gpu,
+    false,
+    "Deprecated, please use `--device` to set the device you want to run.");
+DEFINE_string(device,
+              "CPU",
+              "Choose the device you want to run, it can be: CPU/GPU/XPU, "
+              "default is CPU.");
+DEFINE_double(threshold, 0.5, "Threshold of score.");
+DEFINE_string(output_dir, "output", "Directory of output visualization files.");
+DEFINE_string(run_mode,
+              "paddle",
+              "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)");
+DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
+DEFINE_bool(run_benchmark,
+            false,
+            "Whether to predict a image_file repeatedly for benchmark");
+DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU");
+DEFINE_int32(cpu_threads, 1, "Num of threads with CPU");
+// TensorRT dynamic-shape bounds forwarded to the detector constructor.
+DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI");
+DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI");
+DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI");
+DEFINE_bool(trt_calib_mode,
+            false,
+            "If the model is produced by TRT offline quantitative calibration, "
+            "trt_calib_mode need to set True");
+
+// Logs the run configuration and the timing statistics through glog.
+//
+// det_time: timings in milliseconds; the entries at index 0, 1 and 2 are
+//           reported as preprocess, inference and postprocess time.
+// img_num:  number of processed inputs; used as divisor for the per-item
+//           averages (values below 1 are treated as 1 for the division).
+//
+// Side effect: trailing '/' characters are removed from FLAGS_model_dir.
+void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
+  LOG(INFO) << "----------------------- Config info -----------------------";
+  LOG(INFO) << "runtime_device: " << FLAGS_device;
+  LOG(INFO) << "ir_optim: "
+            << "True";
+  LOG(INFO) << "enable_memory_optim: "
+            << "True";
+  const bool has_trt = FLAGS_run_mode.find("trt") != std::string::npos;
+  if (has_trt) {
+    LOG(INFO) << "enable_tensorrt: "
+              << "True";
+    // run_mode looks like "trt_fp16"; the part after "trt_" is the precision.
+    std::string precision = FLAGS_run_mode.substr(4, 8);
+    LOG(INFO) << "precision: " << precision;
+  } else {
+    LOG(INFO) << "enable_tensorrt: "
+              << "False";
+    LOG(INFO) << "precision: "
+              << "fp32";
+  }
+  LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? "True" : "False");
+  LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads;
+  LOG(INFO) << "----------------------- Data info -----------------------";
+  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
+  LOG(INFO) << "input_shape: "
+            << "dynamic shape";
+  LOG(INFO) << "----------------------- Model info -----------------------";
+  FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1);
+  LOG(INFO) << "model_name: "
+            << FLAGS_model_dir.substr(FLAGS_model_dir.find_last_of('/') + 1);
+  LOG(INFO) << "----------------------- Perf info ------------------------";
+  // The initial value must be a double: with an int the whole sum would be
+  // carried out in int and every fractional millisecond would be dropped.
+  LOG(INFO) << "Total number of predicted data: " << img_num
+            << " and total time spent(ms): "
+            << std::accumulate(det_time.begin(), det_time.end(), 0.);
+  // Avoid dividing by zero when nothing was processed.
+  img_num = std::max(1, img_num);
+  LOG(INFO) << "preproce_time(ms): " << det_time[0] / img_num
+            << ", inference_time(ms): " << det_time[1] / img_num
+            << ", postprocess_time(ms): " << det_time[2] / img_num;
+}
+
+// Returns the part of `filepath` that precedes the last OS_PATH_SEP, or an
+// empty string when `filepath` contains no separator at all.
+static std::string DirName(const std::string& filepath) {
+  const auto sep_index = filepath.rfind(OS_PATH_SEP);
+  return sep_index == std::string::npos ? std::string("")
+                                        : filepath.substr(0, sep_index);
+}
+
+// Returns true when stat() (or _stat() on Windows) succeeds for `path`.
+static bool PathExists(const std::string& path) {
+#ifdef _WIN32
+  struct _stat info;
+  const int rc = _stat(path.c_str(), &info);
+#else
+  struct stat info;
+  const int rc = stat(path.c_str(), &info);
+#endif  // !_WIN32
+  return rc == 0;
+}
+
+// Creates the single directory `path` (mode 0755 outside Windows) unless
+// PathExists already reports it. Throws std::runtime_error with the message
+// "<path> mkdir failed!" when the directory cannot be created.
+static void MkDir(const std::string& path) {
+  if (PathExists(path)) {
+    return;
+  }
+#ifdef _WIN32
+  const int status = _mkdir(path.c_str());
+#else
+  const int status = mkdir(path.c_str(), 0755);
+#endif  // !_WIN32
+  if (status == 0) {
+    return;
+  }
+  throw std::runtime_error(path + " mkdir failed!");
+}
+
+// Creates `path` together with any missing parent directories. Empty and
+// already existing paths are left untouched.
+static void MkDirs(const std::string& path) {
+  const bool nothing_to_do = path.empty() || PathExists(path);
+  if (!nothing_to_do) {
+    // Parents first, then the directory itself.
+    MkDirs(DirName(path));
+    MkDir(path);
+  }
+}
+
+// Runs the JDE tracker on every frame of a video file (or of a camera when
+// FLAGS_camera_id is set) and writes the visualized frames to a video file
+// inside `output_dir`.
+//
+// video_path: input video; also determines the output file name. Ignored as
+//             a source when FLAGS_camera_id != -1 (output is "output.mp4").
+// mot:        tracker used for inference.
+// output_dir: directory that receives the output video.
+void PredictVideo(const std::string& video_path,
+                  PaddleDetection::JDEDetector* mot,
+                  const std::string& output_dir = "output") {
+  // Open video
+  cv::VideoCapture capture;
+  std::string video_out_name = "output.mp4";
+  if (FLAGS_camera_id != -1) {
+    capture.open(FLAGS_camera_id);
+  } else {
+    capture.open(video_path.c_str());
+    video_out_name =
+        video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1);
+  }
+  if (!capture.isOpened()) {
+    printf("can not open video : %s\n", video_path.c_str());
+    return;
+  }
+
+  // Get Video info : resolution, fps, frame count
+  int video_width = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_WIDTH));
+  int video_height = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_HEIGHT));
+  int video_fps = static_cast<int>(capture.get(CV_CAP_PROP_FPS));
+  int video_frame_count =
+      static_cast<int>(capture.get(CV_CAP_PROP_FRAME_COUNT));
+  printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count);
+
+  // Create VideoWriter for output
+  cv::VideoWriter video_out;
+  std::string video_out_path(output_dir);
+  if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
+    video_out_path += OS_PATH_SEP;
+  }
+  video_out_path += video_out_name;
+  // 0x00000021 is the codec id handed to the writer.
+  // NOTE(review): presumably the H.264 tag for mp4 output -- confirm.
+  video_out.open(video_out_path.c_str(),
+                 0x00000021,
+                 video_fps,
+                 cv::Size(video_width, video_height),
+                 true);
+  if (!video_out.isOpened()) {
+    printf("create video writer failed!\n");
+    return;
+  }
+
+  PaddleDetection::MOT_Result result;
+  std::vector<double> det_times(3);
+  double times = 0.;
+  // Capture all frames and do inference
+  cv::Mat frame;
+  int frame_id = 1;
+  while (capture.read(frame)) {
+    if (frame.empty()) {
+      break;
+    }
+    std::vector<cv::Mat> imgs;
+    imgs.push_back(frame);
+    printf("detect frame: %d\n", frame_id);
+    mot->Predict(imgs, FLAGS_threshold, 0, 1, &result, &det_times);
+    frame_id += 1;
+    // Sum in double precision: an int initial value would make both the sum
+    // and the division integer operations and could yield an average of 0.
+    // NOTE(review): frame_id is already one past the number of processed
+    // frames here and in the benchmark call below -- confirm this is intended.
+    times = std::accumulate(det_times.begin(), det_times.end(), 0.) / frame_id;
+    // Guard the frames-per-second value against a zero average.
+    const double fps = times > 0. ? 1000. / times : 0.;
+
+    cv::Mat out_im =
+        PaddleDetection::VisualizeTrackResult(frame, result, fps, frame_id);
+
+    video_out.write(out_im);
+  }
+  capture.release();
+  video_out.release();
+  PrintBenchmarkLog(det_times, frame_id);
+  printf("Visualized output saved as %s\n", video_out_path.c_str());
+}
+
+// Program entry point of the JDE tracking demo: parses and validates the
+// command-line flags, builds the tracker and processes the video or camera
+// stream. Returns 0 on success and -1 when the flags are invalid.
+int main(int argc, char** argv) {
+  // Parsing command-line
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  // PredictVideo reads from the camera when FLAGS_camera_id is set, so a
+  // video file is only mandatory when no camera was selected.
+  const bool has_input = !FLAGS_video_file.empty() || FLAGS_camera_id != -1;
+  if (FLAGS_model_dir.empty() || !has_input) {
+    std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ "
+              << "--video_file=/PATH/TO/INPUT/VIDEO/" << std::endl;
+    return -1;
+  }
+  if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" ||
+        FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) {
+    std::cout
+        << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."
+        << std::endl;
+    return -1;
+  }
+  // Device names are compared in upper case, so normalize the flag first.
+  std::transform(FLAGS_device.begin(),
+                 FLAGS_device.end(),
+                 FLAGS_device.begin(),
+                 ::toupper);
+  if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" ||
+        FLAGS_device == "XPU")) {
+    std::cout << "device should be 'CPU', 'GPU' or 'XPU'." << std::endl;
+    return -1;
+  }
+  if (FLAGS_use_gpu) {
+    std::cout << "Deprecated, please use `--device` to set the device you want "
+                 "to run."
+              << std::endl;
+    return -1;
+  }
+
+  // Do inference on input video or image
+  PaddleDetection::JDEDetector mot(FLAGS_model_dir,
+                                   FLAGS_device,
+                                   FLAGS_use_mkldnn,
+                                   FLAGS_cpu_threads,
+                                   FLAGS_run_mode,
+                                   FLAGS_batch_size,
+                                   FLAGS_gpu_id,
+                                   FLAGS_trt_min_shape,
+                                   FLAGS_trt_max_shape,
+                                   FLAGS_trt_opt_shape,
+                                   FLAGS_trt_calib_mode);
+  if (!PathExists(FLAGS_output_dir)) {
+    MkDirs(FLAGS_output_dir);
+  }
+  PredictVideo(FLAGS_video_file, &mot, FLAGS_output_dir);
+  return 0;
+}
--- a/paddle_detection/deploy/cpp/src/main_keypoint.cc
+++ b/paddle_detection/deploy/cpp/src/main_keypoint.cc
@@ -0,0 +1,598 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+
+#include <math.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
+#elif LINUX
+#include <stdarg.h>
+#endif
+
+#include <gflags/gflags.h>
+#include "include/keypoint_detector.h"
+#include "include/object_detector.h"
+#include "include/preprocess_op.h"
+
+// Command-line flags of the detection + keypoint demo (gflags). Each DEFINE_*
+// below creates a FLAGS_<name> variable that the rest of this file reads; the
+// arguments are (name, default value, help text).
+DEFINE_string(model_dir, "", "Path of object detector inference model");
+DEFINE_string(model_dir_keypoint,
+              "",
+              "Path of keypoint detector inference model");
+DEFINE_string(image_file, "", "Path of input image");
+DEFINE_string(image_dir,
+              "",
+              "Dir of input image, `image_file` has a higher priority.");
+DEFINE_int32(batch_size, 1, "batch_size of object detector");
+DEFINE_int32(batch_size_keypoint, 8, "batch_size of keypoint detector");
+DEFINE_string(
+    video_file,
+    "",
+    "Path of input video, `video_file` or `camera_id` has a highest priority.");
+DEFINE_int32(camera_id, -1, "Device id of camera to predict");
+// Deprecated switch; its help text points users to `--device` instead.
+DEFINE_bool(
+    use_gpu,
+    false,
+    "Deprecated, please use `--device` to set the device you want to run.");
+DEFINE_string(device,
+              "CPU",
+              "Choose the device you want to run, it can be: CPU/GPU/XPU, "
+              "default is CPU.");
+DEFINE_double(threshold, 0.5, "Threshold of score.");
+DEFINE_double(threshold_keypoint, 0.5, "Threshold of score.");
+DEFINE_string(output_dir, "output", "Directory of output visualization files.");
+DEFINE_string(run_mode,
+              "paddle",
+              "Mode of running(paddle/trt_fp32/trt_fp16/trt_int8)");
+DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
+DEFINE_bool(run_benchmark,
+            false,
+            "Whether to predict a image_file repeatedly for benchmark");
+DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU");
+DEFINE_int32(cpu_threads, 1, "Num of threads with CPU");
+// TensorRT dynamic-shape bounds (minimum / maximum / optimal).
+DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI");
+DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI");
+DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI");
+DEFINE_bool(trt_calib_mode,
+            false,
+            "If the model is produced by TRT offline quantitative calibration, "
+            "trt_calib_mode need to set True");
+DEFINE_bool(use_dark, true, "Whether use dark decode in keypoint postprocess");
+
+// Writes the detector's run configuration and timing summary to the glog INFO
+// stream.
+//
+// det_time: timings in milliseconds; indices 0, 1 and 2 are reported as
+//           preprocess, inference and postprocess time.
+// img_num:  number of processed inputs; the per-item averages divide by
+//           max(1, img_num).
+//
+// Side effect: trailing OS_PATH_SEP characters are removed from
+// FLAGS_model_dir.
+void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
+  const bool trt_enabled = FLAGS_run_mode.find("trt") != std::string::npos;
+
+  LOG(INFO) << "----------------------- Config info -----------------------";
+  LOG(INFO) << "runtime_device: " << FLAGS_device;
+  LOG(INFO) << "ir_optim: " << "True";
+  LOG(INFO) << "enable_memory_optim: " << "True";
+  LOG(INFO) << "enable_tensorrt: " << (trt_enabled ? "True" : "False");
+  // For a TensorRT run the precision is whatever follows the "trt_" prefix.
+  LOG(INFO) << "precision: "
+            << (trt_enabled ? FLAGS_run_mode.substr(4, 8)
+                            : std::string("fp32"));
+  LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? "True" : "False");
+  LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads;
+
+  LOG(INFO) << "----------------------- Data info -----------------------";
+  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
+  LOG(INFO) << "input_shape: " << "dynamic shape";
+
+  LOG(INFO) << "----------------------- Model info -----------------------";
+  const auto last_kept = FLAGS_model_dir.find_last_not_of(OS_PATH_SEP);
+  FLAGS_model_dir.erase(last_kept + 1);
+  LOG(INFO) << "model_name: " << FLAGS_model_dir;
+
+  LOG(INFO) << "----------------------- Perf info ------------------------";
+  const double total_ms =
+      std::accumulate(det_time.begin(), det_time.end(), 0.);
+  LOG(INFO) << "Total number of predicted data: " << img_num
+            << " and total time spent(ms): " << total_ms;
+  // Never divide by zero, even when nothing was processed.
+  const int divisor = std::max(1, img_num);
+  LOG(INFO) << "preproce_time(ms): " << det_time[0] / divisor
+            << ", inference_time(ms): " << det_time[1] / divisor
+            << ", postprocess_time(ms): " << det_time[2] / divisor;
+}
+
+// Writes the keypoint model's data, model and timing summary to the glog INFO
+// stream.
+//
+// det_time: timings in milliseconds; indices 0, 1 and 2 are reported as
+//           preprocess, inference and postprocess time.
+// img_num:  number of processed inputs; the averages divide by
+//           max(1, img_num).
+//
+// Side effect: trailing OS_PATH_SEP characters are removed from
+// FLAGS_model_dir_keypoint.
+void PrintKptsBenchmarkLog(std::vector<double> det_time, int img_num) {
+  LOG(INFO) << "----------------------- Data info -----------------------";
+  LOG(INFO) << "batch_size_keypoint: " << FLAGS_batch_size_keypoint;
+
+  LOG(INFO) << "----------------------- Model info -----------------------";
+  const auto last_kept = FLAGS_model_dir_keypoint.find_last_not_of(OS_PATH_SEP);
+  FLAGS_model_dir_keypoint.erase(last_kept + 1);
+  LOG(INFO) << "keypoint_model_name: " << FLAGS_model_dir_keypoint;
+
+  LOG(INFO) << "----------------------- Perf info ------------------------";
+  const double total_ms =
+      std::accumulate(det_time.begin(), det_time.end(), 0.);
+  LOG(INFO) << "Total number of predicted data: " << img_num
+            << " and total time spent(ms): " << total_ms;
+  // Never divide by zero, even when nothing was processed.
+  const int divisor = std::max(1, img_num);
+  LOG(INFO) << "Average time cost per person:";
+  LOG(INFO) << "preproce_time(ms): " << det_time[0] / divisor
+            << ", inference_time(ms): " << det_time[1] / divisor
+            << ", postprocess_time(ms): " << det_time[2] / divisor;
+}
+
+// Returns the part of `filepath` that precedes the last OS_PATH_SEP, or an
+// empty string when `filepath` contains no separator at all.
+static std::string DirName(const std::string& filepath) {
+  const auto sep_index = filepath.rfind(OS_PATH_SEP);
+  return sep_index == std::string::npos ? std::string("")
+                                        : filepath.substr(0, sep_index);
+}
+
+// Returns true when stat() (or _stat() on Windows) succeeds for `path`.
+static bool PathExists(const std::string& path) {
+#ifdef _WIN32
+  struct _stat info;
+  const int rc = _stat(path.c_str(), &info);
+#else
+  struct stat info;
+  const int rc = stat(path.c_str(), &info);
+#endif  // !_WIN32
+  return rc == 0;
+}
+
+// Creates the single directory `path` (mode 0755 outside Windows) unless
+// PathExists already reports it. Throws std::runtime_error with the message
+// "<path> mkdir failed!" when the directory cannot be created.
+static void MkDir(const std::string& path) {
+  if (PathExists(path)) {
+    return;
+  }
+#ifdef _WIN32
+  const int status = _mkdir(path.c_str());
+#else
+  const int status = mkdir(path.c_str(), 0755);
+#endif  // !_WIN32
+  if (status == 0) {
+    return;
+  }
+  throw std::runtime_error(path + " mkdir failed!");
+}
+
+// Creates `path` together with any missing parent directories. Empty and
+// already existing paths are left untouched.
+static void MkDirs(const std::string& path) {
+  const bool nothing_to_do = path.empty() || PathExists(path);
+  if (!nothing_to_do) {
+    // Parents first, then the directory itself.
+    MkDirs(DirName(path));
+    MkDir(path);
+  }
+}
+
+// Runs object detection (and, when `keypoint` is non-null, keypoint
+// estimation on the detected boxes) on every frame of a video file or camera
+// stream and writes the visualized frames to a video file in `output_dir`.
+//
+// video_path: input video; also determines the output file name. Not used as
+//             a source when FLAGS_camera_id != -1 (output is "output.mp4").
+// det:        object detector applied to each frame.
+// keypoint:   optional keypoint detector; when null, bounding boxes are drawn
+//             instead of keypoints.
+// output_dir: directory that receives the output video.
+void PredictVideo(const std::string& video_path,
+                  PaddleDetection::ObjectDetector* det,
+                  PaddleDetection::KeyPointDetector* keypoint,
+                  const std::string& output_dir = "output") {
+  // Open video
+  cv::VideoCapture capture;
+  std::string video_out_name = "output.mp4";
+  if (FLAGS_camera_id != -1) {
+    capture.open(FLAGS_camera_id);
+  } else {
+    capture.open(video_path.c_str());
+    video_out_name =
+        video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1);
+  }
+  if (!capture.isOpened()) {
+    printf("can not open video : %s\n", video_path.c_str());
+    return;
+  }
+
+  // Get Video info : resolution, fps, frame count
+  int video_width = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_WIDTH));
+  int video_height = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_HEIGHT));
+  int video_fps = static_cast<int>(capture.get(CV_CAP_PROP_FPS));
+  int video_frame_count =
+      static_cast<int>(capture.get(CV_CAP_PROP_FRAME_COUNT));
+  printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count);
+
+  // Create VideoWriter for output
+  cv::VideoWriter video_out;
+  std::string video_out_path(output_dir);
+  // Append a separator unless output_dir already ends with one.
+  if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
+    video_out_path += OS_PATH_SEP;
+  }
+  video_out_path += video_out_name;
+  // 0x00000021 is the codec id handed to the writer.
+  // NOTE(review): presumably the H.264 tag for mp4 output -- confirm.
+  video_out.open(video_out_path.c_str(),
+                 0x00000021,
+                 video_fps,
+                 cv::Size(video_width, video_height),
+                 true);
+  if (!video_out.isOpened()) {
+    printf("create video writer failed!\n");
+    return;
+  }
+  // Smoother applied to the keypoint result further below.
+  PaddleDetection::PoseSmooth smoother =
+      PaddleDetection::PoseSmooth(video_width, video_height);
+
+  std::vector<PaddleDetection::ObjectResult> result;
+  std::vector<int> bbox_num;
+  std::vector<double> det_times;
+  auto labels = det->GetLabelList();
+  auto colormap = PaddleDetection::GenerateColorMap(labels.size());
+
+  // Store keypoint results
+  std::vector<PaddleDetection::KeyPointResult> result_kpts;
+  // Pending crops with their center/scale, collected until a keypoint batch
+  // is run.
+  std::vector<cv::Mat> imgs_kpts;
+  std::vector<std::vector<float>> center_bs;
+  std::vector<std::vector<float>> scale_bs;
+  std::vector<int> colormap_kpts = PaddleDetection::GenerateColorMap(20);
+  // Capture all frames and do inference
+  cv::Mat frame;
+  int frame_id = 1;
+  // Set once a box with more than 6 coordinates is seen; never reset, so it
+  // stays true for all later frames.
+  bool is_rbox = false;
+  while (capture.read(frame)) {
+    if (frame.empty()) {
+      break;
+    }
+    std::vector<cv::Mat> imgs;
+    imgs.push_back(frame);
+    printf("detect frame: %d\n", frame_id);
+    det->Predict(imgs, FLAGS_threshold, 0, 1, &result, &bbox_num, &det_times);
+    // Keep only confident detections with a valid class id.
+    std::vector<PaddleDetection::ObjectResult> out_result;
+    for (const auto& item : result) {
+      if (item.confidence < FLAGS_threshold || item.class_id == -1) {
+        continue;
+      }
+      out_result.push_back(item);
+      // More than 6 coordinates: printed as 8 values and flagged via is_rbox.
+      if (item.rect.size() > 6) {
+        is_rbox = true;
+        printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
+               item.class_id,
+               item.confidence,
+               item.rect[0],
+               item.rect[1],
+               item.rect[2],
+               item.rect[3],
+               item.rect[4],
+               item.rect[5],
+               item.rect[6],
+               item.rect[7]);
+      } else {
+        printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
+               item.class_id,
+               item.confidence,
+               item.rect[0],
+               item.rect[1],
+               item.rect[2],
+               item.rect[3]);
+      }
+    }
+
+    if (keypoint) {
+      result_kpts.clear();
+      int imsize = out_result.size();
+      for (int i = 0; i < imsize; i++) {
+        auto item = out_result[i];
+        cv::Mat crop_img;
+        // Receives the timings of the keypoint call; not read afterwards.
+        std::vector<double> keypoint_times;
+        std::vector<int> rect = {
+            item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
+        std::vector<float> center;
+        std::vector<float> scale;
+        // Only class 0 is cropped for keypoint estimation.
+        // NOTE(review): presumably the person class -- confirm against the
+        // detector's label list.
+        if (item.class_id == 0) {
+          PaddleDetection::CropImg(frame, crop_img, rect, center, scale);
+          center_bs.emplace_back(center);
+          scale_bs.emplace_back(scale);
+          imgs_kpts.emplace_back(crop_img);
+        }
+
+        // Run the keypoint model when a full batch has been collected, or on
+        // the last detection if crops are still pending.
+        if (imgs_kpts.size() == FLAGS_batch_size_keypoint ||
+            ((i == imsize - 1) && !imgs_kpts.empty())) {
+          // NOTE(review): FLAGS_threshold is passed here although a separate
+          // FLAGS_threshold_keypoint flag exists -- confirm which is intended.
+          keypoint->Predict(imgs_kpts,
+                            center_bs,
+                            scale_bs,
+                            FLAGS_threshold,
+                            0,
+                            1,
+                            &result_kpts,
+                            &keypoint_times);
+          imgs_kpts.clear();
+          center_bs.clear();
+          scale_bs.clear();
+        }
+      }
+
+      // Smoothing is applied only when exactly one keypoint result exists.
+      if (result_kpts.size() == 1) {
+        for (int i = 0; i < result_kpts.size(); i++) {
+          result_kpts[i] = smoother.smooth_process(&(result_kpts[i]));
+        }
+      }
+
+      cv::Mat out_im = VisualizeKptsResult(frame, result_kpts, colormap_kpts);
+      video_out.write(out_im);
+    } else {
+      // Visualization result
+      cv::Mat out_im = PaddleDetection::VisualizeResult(
+          frame, out_result, labels, colormap, is_rbox);
+      video_out.write(out_im);
+    }
+
+    frame_id += 1;
+  }
+  capture.release();
+  video_out.release();
+}
+
+void PredictImage(const std::vector<std::string> all_img_paths,
+                  const int batch_size,
+                  const double threshold,
+                  const bool run_benchmark,
+                  PaddleDetection::ObjectDetector* det,
+                  PaddleDetection::KeyPointDetector* keypoint,
+                  const std::string& output_dir = "output") {
+  std::vector<double> det_t = {0, 0, 0};
+  int steps = ceil(static_cast<float>(all_img_paths.size()) / batch_size);
+  int kpts_imgs = 0;
+  std::vector<double> keypoint_t = {0, 0, 0};
+  printf("total images = %d, batch_size = %d, total steps = %d\n",
+         all_img_paths.size(),
+         batch_size,
+         steps);
+  for (int idx = 0; idx < steps; idx++) {
+    std::vector<cv::Mat> batch_imgs;
+    int left_image_cnt = all_img_paths.size() - idx * batch_size;
+    if (left_image_cnt > batch_size) {
+      left_image_cnt = batch_size;
+    }
+    for (int bs = 0; bs < left_image_cnt; bs++) {
+      std::string image_file_path = all_img_paths.at(idx * batch_size + bs);
+      cv::Mat im = cv::imread(image_file_path, 1);
+      batch_imgs.insert(batch_imgs.end(), im);
+    }
+
+    // Store all detected result
+    std::vector<PaddleDetection::ObjectResult> result;
+    std::vector<int> bbox_num;
+    std::vector<double> det_times;
+
+    // Store keypoint results
+    std::vector<PaddleDetection::KeyPointResult> result_kpts;
+    std::vector<cv::Mat> imgs_kpts;
+    std::vector<std::vector<float>> center_bs;
+    std::vector<std::vector<float>> scale_bs;
+    std::vector<int> colormap_kpts = PaddleDetection::GenerateColorMap(20);
+
+    bool is_rbox = false;
+    if (run_benchmark) {
+      det->Predict(
+          batch_imgs, threshold, 10, 10, &result, &bbox_num, &det_times);
+    } else {
+      det->Predict(batch_imgs, threshold, 0, 1, &result, &bbox_num, &det_times);
+    }
+    // get labels and colormap
+    auto labels = det->GetLabelList();
+    auto colormap = PaddleDetection::GenerateColorMap(labels.size());
+    int item_start_idx = 0;
+    for (int i = 0; i < left_image_cnt; i++) {
+      cv::Mat im = batch_imgs[i];
+      std::vector<PaddleDetection::ObjectResult> im_result;
+      int detect_num = 0;
+      for (int j = 0; j < bbox_num[i]; j++) {
+        PaddleDetection::ObjectResult item = result[item_start_idx + j];
+        if (item.confidence < threshold || item.class_id == -1) {
+          continue;
+        }
+        detect_num += 1;
+        im_result.push_back(item);
+        if (item.rect.size() > 6) {
+          is_rbox = true;
+          printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
+                 item.class_id,
+                 item.confidence,
+                 item.rect[0],
+                 item.rect[1],
+                 item.rect[2],
+                 item.rect[3],
+                 item.rect[4],
+                 item.rect[5],
+                 item.rect[6],
+                 item.rect[7]);
+        } else {
+          printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
+                 item.class_id,
+                 item.confidence,
+                 item.rect[0],
+                 item.rect[1],
+                 item.rect[2],
+                 item.rect[3]);
+        }
+      }
+      std::cout << all_img_paths.at(idx * batch_size + i)
+                << " The number of detected box: " << detect_num << std::endl;
+      item_start_idx = item_start_idx + bbox_num[i];
+
+      std::vector<int> compression_params;
+      compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
+      compression_params.push_back(95);
+      std::string output_path(output_dir);
+      if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
+        output_path += OS_PATH_SEP;
+      }
+      std::string image_file_path = all_img_paths.at(idx * batch_size + i);
+      if (keypoint) {
+        int imsize = im_result.size();
+        for (int i = 0; i < imsize; i++) {
+          auto item = im_result[i];
+          cv::Mat crop_img;
+          std::vector<double> keypoint_times;
+          std::vector<int> rect = {
+              item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
+          std::vector<float> center;
+          std::vector<float> scale;
+          if (item.class_id == 0) {
+            PaddleDetection::CropImg(im, crop_img, rect, center, scale);
+            center_bs.emplace_back(center);
+            scale_bs.emplace_back(scale);
+            imgs_kpts.emplace_back(crop_img);
+            kpts_imgs += 1;
+          }
+
+          if (imgs_kpts.size() == FLAGS_batch_size_keypoint ||
+              ((i == imsize - 1) && !imgs_kpts.empty())) {
+            if (run_benchmark) {
+              keypoint->Predict(imgs_kpts,
+                                center_bs,
+                                scale_bs,
+                                0.5,
+                                10,
+                                10,
+                                &result_kpts,
+                                &keypoint_times);
+            } else {
+              keypoint->Predict(imgs_kpts,
+                                center_bs,
+                                scale_bs,
+                                0.5,
+                                0,
+                                1,
+                                &result_kpts,
+                                &keypoint_times);
+            }
+            imgs_kpts.clear();
+            center_bs.clear();
+            scale_bs.clear();
+            keypoint_t[0] += keypoint_times[0];
+            keypoint_t[1] += keypoint_times[1];
+            keypoint_t[2] += keypoint_times[2];
+          }
+        }
+        std::string kpts_savepath =
+            output_path + "keypoint_" +
+            image_file_path.substr(image_file_path.find_last_of(OS_PATH_SEP) + 1);
+        cv::Mat kpts_vis_img =
+            VisualizeKptsResult(im, result_kpts, colormap_kpts);
+        cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
+        printf("Visualized output saved as %s\n", kpts_savepath.c_str());
+      } else {
+        // Visualization result
+        cv::Mat vis_img = PaddleDetection::VisualizeResult(
+            im, im_result, labels, colormap, is_rbox);
+        std::string det_savepath =
+            output_path +
+            image_file_path.substr(image_file_path.find_last_of(OS_PATH_SEP) + 1);
+        cv::imwrite(det_savepath, vis_img, compression_params);
+        printf("Visualized output saved as %s\n", det_savepath.c_str());
+      }
+    }
+
+    det_t[0] += det_times[0];
+    det_t[1] += det_times[1];
+    det_t[2] += det_times[2];
+  }
+  PrintBenchmarkLog(det_t, all_img_paths.size());
+  if (keypoint) {
+    PrintKptsBenchmarkLog(keypoint_t, kpts_imgs);
+  }
+}
+
+// Program entry point.
+//
+// Validates the command-line flags, constructs the object detector (plus the
+// keypoint detector when --model_dir_keypoint is given), makes sure the output
+// directory exists, and then runs inference on a video/camera stream or on
+// one or more images.
+//
+// Returns 0 on success and -1 when the flags are invalid.
+int main(int argc, char** argv) {
+  // Parsing command-line
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  // NOTE(review): a camera-only run (--camera_id without any file flag) is
+  // rejected by this check even though a camera branch exists further down;
+  // confirm whether that is intended.
+  if (FLAGS_model_dir.empty() ||
+      (FLAGS_image_file.empty() && FLAGS_image_dir.empty() &&
+       FLAGS_video_file.empty())) {
+    std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ "
+                 "(--model_dir_keypoint=/PATH/TO/INFERENCE_MODEL/) "
+              << "--image_file=/PATH/TO/INPUT/IMAGE/" << std::endl;
+    return -1;
+  }
+  if (!(FLAGS_run_mode == "paddle" || FLAGS_run_mode == "trt_fp32" ||
+        FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) {
+    std::cout
+        << "run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or 'trt_int8'."
+        << std::endl;
+    return -1;
+  }
+  // Device names are compared in upper case, so normalise the flag in place.
+  transform(FLAGS_device.begin(),
+            FLAGS_device.end(),
+            FLAGS_device.begin(),
+            ::toupper);
+  if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" ||
+        FLAGS_device == "XPU")) {
+    std::cout << "device should be 'CPU', 'GPU' or 'XPU'." << std::endl;
+    return -1;
+  }
+  if (FLAGS_use_gpu) {
+    std::cout << "Deprecated, please use `--device` to set the device you want "
+                 "to run."
+              << std::endl;
+    return -1;
+  }
+  // Load model and create a object detector
+  PaddleDetection::ObjectDetector det(FLAGS_model_dir,
+                                      FLAGS_device,
+                                      FLAGS_use_mkldnn,
+                                      FLAGS_cpu_threads,
+                                      FLAGS_run_mode,
+                                      FLAGS_batch_size,
+                                      FLAGS_gpu_id,
+                                      FLAGS_trt_min_shape,
+                                      FLAGS_trt_max_shape,
+                                      FLAGS_trt_opt_shape,
+                                      FLAGS_trt_calib_mode);
+
+  // The keypoint model is optional; `keypoint` stays null when it is not
+  // requested and must be deleted on every exit path below.
+  PaddleDetection::KeyPointDetector* keypoint = nullptr;
+  if (!FLAGS_model_dir_keypoint.empty()) {
+    keypoint = new PaddleDetection::KeyPointDetector(FLAGS_model_dir_keypoint,
+                                                     FLAGS_device,
+                                                     FLAGS_use_mkldnn,
+                                                     FLAGS_cpu_threads,
+                                                     FLAGS_run_mode,
+                                                     FLAGS_batch_size_keypoint,
+                                                     FLAGS_gpu_id,
+                                                     FLAGS_trt_min_shape,
+                                                     FLAGS_trt_max_shape,
+                                                     FLAGS_trt_opt_shape,
+                                                     FLAGS_trt_calib_mode,
+                                                     FLAGS_use_dark);
+  }
+  // Do inference on input video or image
+  if (!PathExists(FLAGS_output_dir)) {
+    MkDirs(FLAGS_output_dir);
+  }
+  if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) {
+    PredictVideo(FLAGS_video_file, &det, keypoint, FLAGS_output_dir);
+  } else if (!FLAGS_image_file.empty() || !FLAGS_image_dir.empty()) {
+    std::vector<std::string> all_img_paths;
+    std::vector<cv::String> cv_all_img_paths;
+    if (!FLAGS_image_file.empty()) {
+      all_img_paths.push_back(FLAGS_image_file);
+      if (FLAGS_batch_size > 1) {
+        std::cout << "batch_size should be 1, when set `image_file`."
+                  << std::endl;
+        // Release the keypoint detector before bailing out.
+        delete keypoint;
+        keypoint = nullptr;
+        return -1;
+      }
+    } else {
+      // Collect every file found under the image directory.
+      cv::glob(FLAGS_image_dir, cv_all_img_paths);
+      for (const auto& img_path : cv_all_img_paths) {
+        all_img_paths.push_back(img_path);
+      }
+    }
+    PredictImage(all_img_paths,
+                 FLAGS_batch_size,
+                 FLAGS_threshold,
+                 FLAGS_run_benchmark,
+                 &det,
+                 keypoint,
+                 FLAGS_output_dir);
+  }
+  delete keypoint;
+  keypoint = nullptr;
+  return 0;
+}
--- a/paddle_detection/deploy/cpp/src/object_detector.cc
+++ b/paddle_detection/deploy/cpp/src/object_detector.cc
@@ -0,0 +1,592 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+
+#include "include/object_detector.h"
+
+namespace PaddleDetection {
+
+// Configure Paddle Inference for the model stored in `model_dir` and create
+// the predictor used by this detector.
+//
+// model_dir   directory holding model.pdmodel and model.pdiparams.
+// batch_size  batch size handed to TensorRT and used for the dynamic shapes.
+// run_mode    "paddle" for plain inference, or "trt_fp32" / "trt_fp16" /
+//             "trt_int8" to enable TensorRT (GPU device only).
+void ObjectDetector::LoadModel(const std::string &model_dir,
+                               const int batch_size,
+                               const std::string &run_mode) {
+  paddle_infer::Config config;
+  const std::string model_path = model_dir + OS_PATH_SEP + "model.pdmodel";
+  const std::string weights_path = model_dir + OS_PATH_SEP + "model.pdiparams";
+  config.SetModel(model_path, weights_path);
+
+  if (this->device_ == "GPU") {
+    config.EnableUseGpu(200, this->gpu_id_);
+    config.SwitchIrOptim(true);
+
+    // Any run mode other than "paddle" goes through TensorRT.
+    const bool use_tensorrt = run_mode != "paddle";
+    if (use_tensorrt) {
+      // FP32 is the fallback, also kept when run_mode is not recognised.
+      auto trt_precision = paddle_infer::Config::Precision::kFloat32;
+      if (run_mode == "trt_fp16") {
+        trt_precision = paddle_infer::Config::Precision::kHalf;
+      } else if (run_mode == "trt_int8") {
+        trt_precision = paddle_infer::Config::Precision::kInt8;
+      } else if (run_mode != "trt_fp32") {
+        printf("run_mode should be 'paddle', 'trt_fp32', 'trt_fp16' or "
+               "'trt_int8'");
+      }
+      config.EnableTensorRtEngine(1 << 30, batch_size, this->min_subgraph_size_,
+                                  trt_precision, false, this->trt_calib_mode_);
+
+      if (this->use_dynamic_shape_) {
+        // Min / max / optimal shapes of the square "image" input tensor.
+        typedef std::map<std::string, std::vector<int>> ShapeMap;
+        ShapeMap min_shapes;
+        ShapeMap max_shapes;
+        ShapeMap opt_shapes;
+        min_shapes["image"] = {batch_size, 3, this->trt_min_shape_,
+                               this->trt_min_shape_};
+        max_shapes["image"] = {batch_size, 3, this->trt_max_shape_,
+                               this->trt_max_shape_};
+        opt_shapes["image"] = {batch_size, 3, this->trt_opt_shape_,
+                               this->trt_opt_shape_};
+
+        config.SetTRTDynamicShapeInfo(min_shapes, max_shapes, opt_shapes);
+        std::cout << "TensorRT dynamic shape enabled" << std::endl;
+      }
+    }
+  } else if (this->device_ == "XPU") {
+    config.EnableXpu(10 * 1024 * 1024);
+  } else {
+    // CPU path
+    config.DisableGpu();
+    if (this->use_mkldnn_) {
+      config.EnableMKLDNN();
+      // cache 10 different shapes for mkldnn to avoid memory leak
+      config.SetMkldnnCacheCapacity(10);
+    }
+    config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
+  }
+
+  // Settings shared by every device.
+  config.SwitchUseFeedFetchOps(false);
+  config.SwitchIrOptim(true);
+  config.DisableGlogInfo();
+  config.EnableMemoryOptim();  // memory optimization
+  predictor_ = CreatePredictor(config);
+}
+
+// Visualization of detection results.
+//
+// Draws every entry of `results` on a copy of `img`: a rectangle (or the four
+// edges of a rotated box when `is_rbox` is set), an optional blended instance
+// mask, and a filled text box holding "<label> <confidence>".
+//
+// img       image the results refer to; it is not modified.
+// results   detections to draw. `rect` must hold at least 4 values (8 for
+//           rotated boxes); entries with a shorter `rect` are skipped.
+// lables    class names indexed by class id; the numeric id is printed when
+//           the id is outside this list.
+// colormap  flat list of colour triplets indexed by 3 * class id; black is
+//           used when the id is outside this list.
+// is_rbox   true when `rect` stores four corner points.
+//
+// Returns the annotated copy of `img`.
+cv::Mat
+VisualizeResult(const cv::Mat &img,
+                const std::vector<PaddleDetection::ObjectResult> &results,
+                const std::vector<std::string> &lables,
+                const std::vector<int> &colormap, const bool is_rbox = false) {
+  cv::Mat vis_img = img.clone();
+  const int img_h = vis_img.rows;
+  const int img_w = vis_img.cols;
+  const size_t needed_coords = is_rbox ? 8 : 4;
+  for (size_t i = 0; i < results.size(); ++i) {
+    const PaddleDetection::ObjectResult &res = results[i];
+    // A malformed box cannot be drawn; indexing it would read out of bounds.
+    if (res.rect.size() < needed_coords) {
+      continue;
+    }
+    const int cls = res.class_id;
+
+    // Configure color and text size
+    std::ostringstream oss;
+    oss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
+    if (cls >= 0 && static_cast<size_t>(cls) < lables.size()) {
+      oss << lables[cls] << " ";
+    } else {
+      oss << cls << " ";
+    }
+    oss << res.confidence;
+    std::string text = oss.str();
+    cv::Scalar roi_color = cv::Scalar(0, 0, 0);
+    if (cls >= 0 && 3 * static_cast<size_t>(cls) + 2 < colormap.size()) {
+      roi_color = cv::Scalar(colormap[3 * cls + 0], colormap[3 * cls + 1],
+                             colormap[3 * cls + 2]);
+    }
+    int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
+    double font_scale = 0.5f;
+    float thickness = 0.5;
+    cv::Size text_size =
+        cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
+    cv::Point origin;
+
+    if (is_rbox) {
+      // Connect the four corner points; the modulo closes the polygon.
+      for (int k = 0; k < 4; k++) {
+        cv::Point pt1 = cv::Point(res.rect[(k * 2) % 8],
+                                  res.rect[(k * 2 + 1) % 8]);
+        cv::Point pt2 = cv::Point(res.rect[(k * 2 + 2) % 8],
+                                  res.rect[(k * 2 + 3) % 8]);
+        cv::line(vis_img, pt1, pt2, roi_color, 2);
+      }
+    } else {
+      int w = res.rect[2] - res.rect[0];
+      int h = res.rect[3] - res.rect[1];
+      cv::Rect roi = cv::Rect(res.rect[0], res.rect[1], w, h);
+      // Draw roi object, text, and background
+      cv::rectangle(vis_img, roi, roi_color, 2);
+
+      // Draw mask
+      const std::vector<int> &mask_v = res.mask;
+      if (mask_v.size() > 0) {
+        // Start from an all-zero mask so pixels not covered by mask_v count
+        // as background, and never copy more values than the image can hold.
+        cv::Mat mask = cv::Mat::zeros(img_h, img_w, CV_32S);
+        const size_t capacity =
+            static_cast<size_t>(img_h) * static_cast<size_t>(img_w);
+        const size_t copy_count =
+            mask_v.size() < capacity ? mask_v.size() : capacity;
+        if (copy_count > 0) {
+          std::memcpy(mask.data, mask_v.data(), copy_count * sizeof(int));
+        }
+
+        cv::Mat colored_img = vis_img.clone();
+
+        std::vector<cv::Mat> contours;
+        cv::Mat hierarchy;
+        mask.convertTo(mask, CV_8U);
+        cv::findContours(mask, contours, hierarchy, cv::RETR_CCOMP,
+                         cv::CHAIN_APPROX_SIMPLE);
+        cv::drawContours(colored_img, contours, -1, roi_color, -1, cv::LINE_8,
+                         hierarchy, 100);
+
+        // Blend the filled contours with the image, inside the mask only.
+        colored_img = 0.4 * colored_img + 0.6 * vis_img;
+        colored_img.copyTo(vis_img, mask);
+      }
+    }
+
+    origin.x = res.rect[0];
+    origin.y = res.rect[1];
+
+    // Configure text background
+    cv::Rect text_back =
+        cv::Rect(res.rect[0], res.rect[1] - text_size.height,
+                 text_size.width, text_size.height);
+    // Draw text, and background
+    cv::rectangle(vis_img, text_back, roi_color, -1);
+    cv::putText(vis_img, text, origin, font_face, font_scale,
+                cv::Scalar(255, 255, 255), thickness);
+  }
+  return vis_img;
+}
+
+// Convert one BGR image to RGB and run the configured preprocessing ops on
+// it; the outcome is stored in `inputs_`.
+void ObjectDetector::Preprocess(const cv::Mat &ori_im) {
+  // Convert into a separate Mat so the caller's image stays untouched for
+  // postprocessing.
+  cv::Mat rgb_im;
+  cv::cvtColor(ori_im, rgb_im, cv::COLOR_BGR2RGB);
+  preprocessor_.Run(&rgb_im, &inputs_);
+}
+
+// Convert the raw detector output of a batch into ObjectResult entries.
+//
+// mats               input images of the batch (only their count is used).
+// result             cleared, then filled with one entry per predicted box.
+// bbox_num           number of boxes predicted for each image.
+// output_data_       flat box tensor: 10 floats per box when is_rbox
+//                    (class, score, four corner points), otherwise 6 floats
+//                    per box (class, score, xmin, ymin, xmax, ymax).
+// output_mask_data_  flat mask tensor, split evenly between all boxes; only
+//                    read when config_.mask_ is set.
+// is_rbox            true when the model predicts rotated boxes.
+void ObjectDetector::Postprocess(
+    const std::vector<cv::Mat> mats,
+    std::vector<PaddleDetection::ObjectResult> *result,
+    std::vector<int> bbox_num, std::vector<float> output_data_,
+    std::vector<int> output_mask_data_, bool is_rbox = false) {
+  result->clear();
+  const int total_num = std::accumulate(bbox_num.begin(), bbox_num.end(), 0);
+  int out_mask_dim = -1;
+  // A batch without any detection has total_num == 0; dividing by it would
+  // be an integer division by zero.
+  if (config_.mask_ && total_num > 0) {
+    out_mask_dim = output_mask_data_.size() / total_num;
+  }
+
+  // Number of floats describing one box in output_data_.
+  const size_t stride = is_rbox ? 10 : 6;
+
+  int start_idx = 0;
+  for (size_t im_id = 0; im_id < mats.size() && im_id < bbox_num.size();
+       im_id++) {
+    for (int j = start_idx; j < start_idx + bbox_num[im_id]; j++) {
+      const size_t base = static_cast<size_t>(j) * stride;
+      // Stop instead of reading past the end of a truncated output tensor.
+      if (base + stride > output_data_.size()) {
+        break;
+      }
+
+      PaddleDetection::ObjectResult result_item;
+      // Class id
+      result_item.class_id = static_cast<int>(round(output_data_[base]));
+      // Confidence score
+      result_item.confidence = output_data_[base + 1];
+      // Coordinates are truncated to int: 8 values for a rotated box
+      // (x1, y1, ..., x4, y4), 4 values otherwise (xmin, ymin, xmax, ymax).
+      std::vector<int> rect;
+      rect.reserve(stride - 2);
+      for (size_t k = 2; k < stride; ++k) {
+        rect.push_back(static_cast<int>(output_data_[base + k]));
+      }
+      result_item.rect = rect;
+
+      if (!is_rbox && config_.mask_) {
+        // Keep only the mask values greater than -1 of this box's slice.
+        std::vector<int> mask;
+        for (int k = 0; k < out_mask_dim; ++k) {
+          if (output_mask_data_[k + j * out_mask_dim] > -1) {
+            mask.push_back(output_mask_data_[k + j * out_mask_dim]);
+          }
+        }
+        result_item.mask = mask;
+      }
+
+      result->push_back(result_item);
+    }
+    start_idx += bbox_num[im_id];
+  }
+}
+
+// This function is to convert output result from SOLOv2 to class ObjectResult
+//
+// mats                   input images; each mask has the size of its image.
+// result                 receives one appended entry per kept instance.
+// bbox_num               receives, per image, the number of kept instances.
+// out_bbox_num_data_     number of predicted instances for each image.
+// out_label_data_        class id of each instance.
+// out_score_data_        score of each instance.
+// out_global_mask_data_  instance masks stored one after another, each of
+//                        rows * cols values.
+// threshold              instances scoring below this value are dropped.
+//
+// NOTE(review): labels, scores and masks are indexed with the per-image
+// instance id, without an offset for previous images - confirm this is only
+// used with a batch size of 1.
+void ObjectDetector::SOLOv2Postprocess(
+    const std::vector<cv::Mat> mats, std::vector<ObjectResult> *result,
+    std::vector<int> *bbox_num, std::vector<int> out_bbox_num_data_,
+    std::vector<int64_t> out_label_data_, std::vector<float> out_score_data_,
+    std::vector<uint8_t> out_global_mask_data_, float threshold) {
+
+  // Sums of a 0/1 mask are whole numbers, so anything above 0.5 is non-empty.
+  const auto is_occupied = [](float sum) { return sum > 0.5f; };
+
+  for (size_t im_id = 0; im_id < mats.size(); im_id++) {
+    const cv::Mat &mat = mats[im_id];
+    // size_t keeps `bbox_id * pixels` from overflowing int on large images.
+    const size_t pixels =
+        static_cast<size_t>(mat.rows) * static_cast<size_t>(mat.cols);
+    const int num_boxes =
+        im_id < out_bbox_num_data_.size() ? out_bbox_num_data_[im_id] : 0;
+
+    int valid_bbox_count = 0;
+    for (int bbox_id = 0; bbox_id < num_boxes; ++bbox_id) {
+      if (!(out_score_data_[bbox_id] >= threshold)) {
+        continue;
+      }
+      // Skip instances whose mask slice is not fully present in the buffer
+      // rather than reading past its end.
+      const size_t offset = static_cast<size_t>(bbox_id) * pixels;
+      if (offset + pixels > out_global_mask_data_.size()) {
+        continue;
+      }
+
+      ObjectResult result_item;
+      result_item.class_id = out_label_data_[bbox_id];
+      result_item.confidence = out_score_data_[bbox_id];
+      // Widen the uint8 mask slice of this instance to int in one go.
+      std::vector<int> global_mask(
+          out_global_mask_data_.begin() + offset,
+          out_global_mask_data_.begin() + offset + pixels);
+
+      // find minimize bounding box from mask
+      // (the Mat only wraps global_mask's storage, nothing is copied)
+      cv::Mat mask(mat.rows, mat.cols, CV_32SC1, global_mask.data());
+
+      cv::Mat mask_fp;
+      cv::Mat rowSum;
+      cv::Mat colSum;
+      std::vector<float> sum_of_row(mat.rows);
+      std::vector<float> sum_of_col(mat.cols);
+
+      mask.convertTo(mask_fp, CV_32FC1);
+      cv::reduce(mask_fp, colSum, 0, CV_REDUCE_SUM, CV_32FC1);
+      cv::reduce(mask_fp, rowSum, 1, CV_REDUCE_SUM, CV_32FC1);
+
+      for (int row_id = 0; row_id < mat.rows; ++row_id) {
+        sum_of_row[row_id] = rowSum.at<float>(row_id, 0);
+      }
+
+      for (int col_id = 0; col_id < mat.cols; ++col_id) {
+        sum_of_col[col_id] = colSum.at<float>(0, col_id);
+      }
+
+      // First / last occupied row and column give the box corners.
+      auto it = std::find_if(sum_of_row.begin(), sum_of_row.end(), is_occupied);
+      int y1 = std::distance(sum_of_row.begin(), it);
+
+      auto it2 =
+          std::find_if(sum_of_col.begin(), sum_of_col.end(), is_occupied);
+      int x1 = std::distance(sum_of_col.begin(), it2);
+
+      auto rit =
+          std::find_if(sum_of_row.rbegin(), sum_of_row.rend(), is_occupied);
+      int y2 = std::distance(rit, sum_of_row.rend());
+
+      auto rit2 =
+          std::find_if(sum_of_col.rbegin(), sum_of_col.rend(), is_occupied);
+      int x2 = std::distance(rit2, sum_of_col.rend());
+
+      result_item.rect = {x1, y1, x2, y2};
+      result_item.mask = global_mask;
+
+      result->push_back(result_item);
+      valid_bbox_count++;
+    }
+    bbox_num->push_back(valid_bbox_count);
+  }
+}
+
+// Run the full detection pipeline (preprocess, inference, postprocess) on a
+// batch of images.
+//
+// imgs       batch of input images.
+// threshold  score threshold; only forwarded to the SOLOv2 post-processing.
+// warmup     number of untimed predictor runs executed first.
+// repeats    number of timed predictor runs; the last one provides the
+//            outputs that are post-processed.
+// result     cleared, then filled with the detections of the whole batch.
+// bbox_num   cleared, then filled with detection counts (a single total for
+//            PicoDet, one count per image otherwise).
+// times      three values in milliseconds are appended: preprocessing,
+//            average inference time per repeat, post-processing.
+void ObjectDetector::Predict(const std::vector<cv::Mat> imgs,
+                             const double threshold, const int warmup,
+                             const int repeats,
+                             std::vector<PaddleDetection::ObjectResult> *result,
+                             std::vector<int> *bbox_num,
+                             std::vector<double> *times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  int batch_size = imgs.size();
+
+  // in_data_batch
+  std::vector<float> in_data_all;
+  std::vector<float> im_shape_all(batch_size * 2);
+  std::vector<float> scale_factor_all(batch_size * 2);
+  std::vector<const float *> output_data_list_;
+  std::vector<int> out_bbox_num_data_;
+  std::vector<int> out_mask_data_;
+
+  // these parameters are for SOLOv2 output
+  std::vector<float> out_score_data_;
+  std::vector<uint8_t> out_global_mask_data_;
+  std::vector<int64_t> out_label_data_;
+
+  // in_net img for each batch
+  std::vector<cv::Mat> in_net_img_all(batch_size);
+
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    cv::Mat im = imgs.at(bs_idx);
+    Preprocess(im);
+    im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
+    im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
+
+    scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
+    scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
+
+    in_data_all.insert(in_data_all.end(), inputs_.im_data_.begin(),
+                       inputs_.im_data_.end());
+
+    // collect in_net img
+    in_net_img_all[bs_idx] = inputs_.in_net_im_;
+  }
+
+  // Pad Batch if batch size > 1
+  if (batch_size > 1 && CheckDynamicInput(in_net_img_all)) {
+    in_data_all.clear();
+    std::vector<cv::Mat> pad_img_all = PadBatch(in_net_img_all);
+    int rh = pad_img_all[0].rows;
+    int rw = pad_img_all[0].cols;
+    int rc = pad_img_all[0].channels();
+
+    for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+      cv::Mat pad_img = pad_img_all[bs_idx];
+      pad_img.convertTo(pad_img, CV_32FC3);
+      std::vector<float> pad_data;
+      pad_data.resize(rc * rh * rw);
+      float *base = pad_data.data();
+      // Write each channel into its own plane of pad_data (planar layout).
+      for (int i = 0; i < rc; ++i) {
+        cv::extractChannel(pad_img,
+                           cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i);
+      }
+      in_data_all.insert(in_data_all.end(), pad_data.begin(), pad_data.end());
+    }
+    // update in_net_shape
+    inputs_.in_net_shape_ = {static_cast<float>(rh), static_cast<float>(rw)};
+  }
+
+  auto preprocess_end = std::chrono::steady_clock::now();
+  // Prepare input tensor
+  auto input_names = predictor_->GetInputNames();
+  for (const auto &tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputHandle(tensor_name);
+    if (tensor_name == "image") {
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Reshape({batch_size, 3, rh, rw});
+      in_tensor->CopyFromCpu(in_data_all.data());
+    } else if (tensor_name == "im_shape") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(im_shape_all.data());
+    } else if (tensor_name == "scale_factor") {
+      in_tensor->Reshape({batch_size, 2});
+      in_tensor->CopyFromCpu(scale_factor_all.data());
+    }
+  }
+
+  // Run predictor
+  std::vector<std::vector<float>> out_tensor_list;
+  std::vector<std::vector<int>> output_shape_list;
+  bool is_rbox = false;
+  int reg_max = 7;
+  int num_class = 80;
+  const bool is_solov2 = config_.arch_ == "SOLOv2";
+
+  // Runs the predictor once and copies every output tensor to the host.
+  // Buffers are reset on each call, so they always describe the latest run.
+  auto run_and_fetch = [&]() {
+    predictor_->Run();
+    // Get output tensor
+    out_tensor_list.clear();
+    output_shape_list.clear();
+    auto output_names = predictor_->GetOutputNames();
+    for (size_t j = 0; j < output_names.size(); j++) {
+      auto output_tensor = predictor_->GetOutputHandle(output_names[j]);
+      std::vector<int> output_shape = output_tensor->shape();
+      int out_num = std::accumulate(output_shape.begin(), output_shape.end(),
+                                    1, std::multiplies<int>());
+      output_shape_list.push_back(output_shape);
+      if (is_solov2) {
+        // SOLOv2 outputs are identified by position:
+        // 0 = instance counts, 1 = labels, 2 = scores, 3 = masks.
+        if (j == 0) {
+          out_bbox_num_data_.resize(out_num);
+          output_tensor->CopyToCpu(out_bbox_num_data_.data());
+        } else if (j == 1) {
+          out_label_data_.resize(out_num);
+          output_tensor->CopyToCpu(out_label_data_.data());
+        } else if (j == 2) {
+          out_score_data_.resize(out_num);
+          output_tensor->CopyToCpu(out_score_data_.data());
+        } else if (config_.mask_ && (j == 3)) {
+          out_global_mask_data_.resize(out_num);
+          output_tensor->CopyToCpu(out_global_mask_data_.data());
+        }
+      } else if (config_.mask_ && (j == 2)) {
+        out_mask_data_.resize(out_num);
+        output_tensor->CopyToCpu(out_mask_data_.data());
+      } else if (output_tensor->type() == paddle_infer::DataType::INT32) {
+        out_bbox_num_data_.resize(out_num);
+        output_tensor->CopyToCpu(out_bbox_num_data_.data());
+      } else {
+        std::vector<float> out_data;
+        out_data.resize(out_num);
+        output_tensor->CopyToCpu(out_data.data());
+        out_tensor_list.push_back(out_data);
+      }
+    }
+  };
+
+  // warmup (not timed)
+  for (int i = 0; i < warmup; i++) {
+    run_and_fetch();
+  }
+
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    run_and_fetch();
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Postprocessing result
+  result->clear();
+  bbox_num->clear();
+  if (config_.arch_ == "PicoDet") {
+    for (size_t i = 0; i < out_tensor_list.size(); i++) {
+      if (i == 0) {
+        num_class = output_shape_list[i][2];
+      }
+      if (i == config_.fpn_stride_.size()) {
+        reg_max = output_shape_list[i][2] / 4 - 1;
+      }
+      // out_tensor_list outlives the post-processing call, so its buffers
+      // are handed over directly; no heap copy that nobody would free.
+      output_data_list_.push_back(out_tensor_list[i].data());
+    }
+    PaddleDetection::PicoDetPostProcess(
+        result, output_data_list_, config_.fpn_stride_, inputs_.im_shape_,
+        inputs_.scale_factor_, config_.nms_info_["score_threshold"].as<float>(),
+        config_.nms_info_["nms_threshold"].as<float>(), num_class, reg_max);
+    bbox_num->push_back(result->size());
+  } else if (is_solov2) {
+    SOLOv2Postprocess(imgs, result, bbox_num, out_bbox_num_data_,
+                      out_label_data_, out_score_data_, out_global_mask_data_,
+                      threshold);
+  } else {
+    // Nothing to decode when no float output was fetched (e.g. the predictor
+    // never ran because warmup and repeats are both 0).
+    if (!out_tensor_list.empty() && !output_shape_list.empty() &&
+        !output_shape_list[0].empty()) {
+      // A last dimension that is a multiple of 10 marks rotated-box output.
+      is_rbox = output_shape_list[0].back() % 10 == 0;
+      Postprocess(imgs, result, out_bbox_num_data_, out_tensor_list[0],
+                  out_mask_data_, is_rbox);
+    }
+    for (size_t k = 0; k < out_bbox_num_data_.size(); k++) {
+      int tmp = out_bbox_num_data_[k];
+      bbox_num->push_back(tmp);
+    }
+  }
+
+  auto postprocess_end = std::chrono::steady_clock::now();
+
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  times->push_back(static_cast<double>(preprocess_diff.count() * 1000));
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  // Average over the timed runs; report 0 when there was none.
+  const double inference_ms =
+      repeats > 0
+          ? static_cast<double>(inference_diff.count() / repeats * 1000)
+          : 0.0;
+  times->push_back(inference_ms);
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  times->push_back(static_cast<double>(postprocess_diff.count() * 1000));
+}
+
+// Build a flat colour table with three consecutive entries per class.
+//
+// The bits of the class index are consumed three at a time; each group
+// contributes one bit to each of the three channels, starting at the most
+// significant bit (bit 7) and moving down one position per group.
+std::vector<int> GenerateColorMap(int num_class) {
+  std::vector<int> palette(3 * num_class, 0);
+  for (int cls = 0; cls < num_class; ++cls) {
+    int *channels = &palette[cls * 3];
+    int shift = 7;
+    for (int bits = cls; bits != 0; bits >>= 3, --shift) {
+      channels[0] |= ((bits >> 0) & 1) << shift;
+      channels[1] |= ((bits >> 1) & 1) << shift;
+      channels[2] |= ((bits >> 2) & 1) << shift;
+    }
+  }
+  return palette;
+}
+
+} // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/picodet_postprocess.cc
+++ b/paddle_detection/deploy/cpp/src/picodet_postprocess.cc
@@ -0,0 +1,128 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The code is based on:
+// https://github.com/RangiLyu/nanodet/blob/main/demo_mnn/nanodet_mnn.cpp
+
+#include "include/picodet_postprocess.h"
+
+namespace PaddleDetection {
+
+// Fast approximation of exp(x).
+//
+// The value (1 << 23) * (x * log2(e) + bias) is written straight into the bit
+// pattern of an IEEE-754 single, so the integer part lands in the exponent
+// field and the fractional part in the mantissa.
+//
+// The raw formula is only meaningful while that value fits a non-negative
+// 32-bit pattern below the infinity encoding. Outside that window the
+// double -> uint32_t conversion is undefined behaviour (this happens for
+// x below roughly -88, which softmax produces routinely), so the result is
+// saturated explicitly: 0 for underflow, +inf for overflow, NaN is propagated.
+float fast_exp(float x) {
+  union {
+    uint32_t i;
+    float f;
+  } v{};
+  if (x != x) {
+    return x;  // NaN in, NaN out
+  }
+  const double bits = (1 << 23) * (1.4426950409 * x + 126.93490512f);
+  if (bits <= 0.0) {
+    return 0.0f;  // exp underflows; avoid converting a negative value
+  }
+  if (bits >= 2139095040.0) {  // 0x7F800000: bit pattern of +infinity
+    v.i = 0x7F800000u;
+    return v.f;
+  }
+  v.i = static_cast<uint32_t>(bits);
+  return v.f;
+}
+
+// Writes the softmax of src[0..length) into dst[0..length).
+//
+// The maximum element is subtracted before exponentiation so every argument
+// passed to fast_exp is <= 0. Exponentials come from fast_exp, so the result
+// is an approximation. src and dst must each hold at least `length` elements.
+// An empty or negative length is a no-op. Always returns 0.
+template <typename T>
+int activation_function_softmax(const T *src, T *dst, int length) {
+  if (length <= 0) {
+    return 0;  // nothing to normalise; avoids dereferencing an empty range
+  }
+
+  const T peak = *std::max_element(src, src + length);
+  T total{0};
+
+  for (int i = 0; i < length; ++i) {
+    dst[i] = fast_exp(src[i] - peak);
+    total += dst[i];
+  }
+
+  for (int i = 0; i < length; ++i) {
+    dst[i] /= total;
+  }
+
+  return 0;
+}
+
+// PicoDet decode
+//
+// Turns the distribution-based box regression of one feature-map cell into an
+// axis-aligned box.
+//   dfl_det  - 4 * (reg_max + 1) logits: one discrete distribution per side,
+//              read in the order left, top, right, bottom.
+//   label    - class id stored in the result.
+//   score    - confidence stored in the result.
+//   x, y     - column / row of the cell on the feature map.
+//   stride   - stride of that feature map relative to the network input.
+//   im_shape - {height, width} used to clamp the box (same layout the caller
+//              reads as in_h = im_shape[0], in_w = im_shape[1]).
+//   reg_max  - largest bin index of each distribution.
+// Each side distance is the softmax expectation over the bins times stride.
+// Returns an ObjectResult whose rect is {xmin, ymin, xmax, ymax}.
+PaddleDetection::ObjectResult
+disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y,
+             int stride, std::vector<float> im_shape, int reg_max) {
+  const int bins = reg_max + 1;
+  float ct_x = (x + 0.5) * stride;
+  float ct_y = (y + 0.5) * stride;
+  std::vector<float> dis_pred(4);
+  // One scratch buffer reused for all four sides instead of new[]/delete[]
+  // per iteration.
+  std::vector<float> dis_after_sm(bins);
+  for (int i = 0; i < 4; i++) {
+    activation_function_softmax(dfl_det + i * bins, dis_after_sm.data(), bins);
+    float dis = 0;
+    for (int j = 0; j < bins; j++) {
+      dis += j * dis_after_sm[j];
+    }
+    dis *= stride;
+    dis_pred[i] = dis;
+  }
+  int xmin = (int)(std::max)(ct_x - dis_pred[0], .0f);
+  int ymin = (int)(std::max)(ct_y - dis_pred[1], .0f);
+  // x is bounded by the width (im_shape[1]) and y by the height (im_shape[0]);
+  // the two were previously swapped, which mis-clamps non-square inputs.
+  int xmax = (int)(std::min)(ct_x + dis_pred[2], (float)im_shape[1]);
+  int ymax = (int)(std::min)(ct_y + dis_pred[3], (float)im_shape[0]);
+
+  PaddleDetection::ObjectResult result_item;
+  result_item.rect = {xmin, ymin, xmax, ymax};
+  result_item.class_id = label;
+  result_item.confidence = score;
+
+  return result_item;
+}
+
+// Decodes PicoDet head outputs into detections appended to *results.
+//   outs            - for N = fpn_stride.size() levels, outs[0..N) are the
+//                     per-cell class scores (num_class per cell) and
+//                     outs[N..2N) the matching box distributions
+//                     (4 * (reg_max + 1) values per cell).
+//   im_shape        - {height, width} of the network input.
+//   scale_factor    - {scale_y, scale_x}; boxes are divided by it at the end.
+//   score_threshold - a cell is kept only if its best class score exceeds it.
+//   nms_threshold   - forwarded to PaddleDetection::nms, run per class.
+void PicoDetPostProcess(std::vector<PaddleDetection::ObjectResult> *results,
+                        std::vector<const float *> outs,
+                        std::vector<int> fpn_stride,
+                        std::vector<float> im_shape,
+                        std::vector<float> scale_factor, float score_threshold,
+                        float nms_threshold, int num_class, int reg_max) {
+  std::vector<std::vector<PaddleDetection::ObjectResult>> per_class(num_class);
+  const int in_h = im_shape[0];
+  const int in_w = im_shape[1];
+  const size_t num_levels = fpn_stride.size();
+
+  for (size_t level = 0; level < num_levels; ++level) {
+    const int stride = fpn_stride[level];
+    const int feature_h = std::ceil((float)in_h / stride);
+    const int feature_w = std::ceil((float)in_w / stride);
+    const int num_cells = feature_h * feature_w;
+
+    for (int cell = 0; cell < num_cells; ++cell) {
+      const float *cell_scores = outs[level] + (cell * num_class);
+
+      // Highest-scoring class of this cell; stays (label 0, score 0) when no
+      // score is positive.
+      float best_score = 0;
+      int best_label = 0;
+      for (int label = 0; label < num_class; ++label) {
+        if (cell_scores[label] > best_score) {
+          best_score = cell_scores[label];
+          best_label = label;
+        }
+      }
+      if (!(best_score > score_threshold)) {
+        continue;
+      }
+
+      const float *bbox_pred =
+          outs[level + num_levels] + (cell * 4 * (reg_max + 1));
+      const int col = cell % feature_w;
+      const int row = cell / feature_w;
+      per_class[best_label].push_back(disPred2Bbox(
+          bbox_pred, best_label, best_score, col, row, stride, im_shape,
+          reg_max));
+    }
+  }
+
+  for (auto &class_boxes : per_class) {
+    PaddleDetection::nms(class_boxes, nms_threshold);
+
+    // Work on a copy so the per-class list keeps network-input coordinates.
+    for (auto box : class_boxes) {
+      box.rect[0] /= scale_factor[1];
+      box.rect[2] /= scale_factor[1];
+      box.rect[1] /= scale_factor[0];
+      box.rect[3] /= scale_factor[0];
+      results->push_back(box);
+    }
+  }
+}
+
+} // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/preprocess_op.cc
+++ b/paddle_detection/deploy/cpp/src/preprocess_op.cc
@@ -0,0 +1,355 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "include/preprocess_op.h"
+
+namespace PaddleDetection {
+
+// Seeds the blob metadata from the untouched image: im_shape_ and
+// in_net_shape_ both become {rows, cols}, and scale_factor_ is reset to 1.
+void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
+  const float height = static_cast<float>(im->rows);
+  const float width = static_cast<float>(im->cols);
+  data->im_shape_ = {height, width};
+  data->scale_factor_ = {1., 1.};
+  data->in_net_shape_ = {height, width};
+}
+
+// Converts the image to 32-bit float (3 channels), dividing by 255 when
+// is_scale_ is set. When norm_type_ is "mean_std", every pixel is then
+// normalised per channel as (value - mean_[c]) / scale_[c].
+void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) {
+  const double factor = is_scale_ ? 1.0 / 255.0 : 1.0;
+  im->convertTo(*im, CV_32FC3, factor);
+  if (norm_type_ == "mean_std") {
+    for (int row = 0; row < im->rows; ++row) {
+      cv::Vec3f* pixels = im->ptr<cv::Vec3f>(row);
+      for (int col = 0; col < im->cols; ++col) {
+        cv::Vec3f& px = pixels[col];
+        for (int c = 0; c < 3; ++c) {
+          px[c] = (px[c] - mean_[c]) / scale_[c];
+        }
+      }
+    }
+  }
+}
+
+// Repacks the interleaved (HWC) float image into planar (CHW) order inside
+// data->im_data_: channel c occupies the c-th block of rows * cols floats.
+void Permute::Run(cv::Mat* im, ImageBlob* data) {
+  im->convertTo(*im, CV_32FC3);
+  const int height = im->rows;
+  const int width = im->cols;
+  const int channels = im->channels();
+  const int plane = height * width;
+
+  auto& buffer = data->im_data_;
+  buffer.resize(channels * plane);
+
+  // Each destination plane is a Mat header over the output buffer, so
+  // extractChannel writes straight into im_data_.
+  float* plane_begin = buffer.data();
+  for (int c = 0; c < channels; ++c, plane_begin += plane) {
+    cv::extractChannel(*im, cv::Mat(height, width, CV_32FC1, plane_begin), c);
+  }
+}
+
+// Resizes the image in place by the factors from GenerateScale, then records
+// the new {rows, cols} in both in_net_shape_ and im_shape_ and stores the
+// factors as scale_factor_ = {scale_y, scale_x}.
+void Resize::Run(cv::Mat* im, ImageBlob* data) {
+  const std::pair<float, float> scale = GenerateScale(*im);
+  const float scale_x = scale.first;
+  const float scale_y = scale.second;
+  cv::resize(*im, *im, cv::Size(), scale_x, scale_y, interp_);
+
+  const float out_h = static_cast<float>(im->rows);
+  const float out_w = static_cast<float>(im->cols);
+  data->in_net_shape_ = {out_h, out_w};
+  data->im_shape_ = {out_h, out_w};
+  data->scale_factor_ = {scale_y, scale_x};
+}
+
+// Returns the resize factors as {scale_x, scale_y}.
+// Without keep_ratio_ the image is stretched to target_size_
+// ({height, width}) independently per axis. With keep_ratio_ both axes share
+// the largest factor for which the short side does not exceed the smaller
+// target and the long side does not exceed the larger target.
+std::pair<float, float> Resize::GenerateScale(const cv::Mat& im) {
+  const int src_w = im.cols;
+  const int src_h = im.rows;
+
+  if (!keep_ratio_) {
+    const float scale_x =
+        static_cast<float>(target_size_[1]) / static_cast<float>(src_w);
+    const float scale_y =
+        static_cast<float>(target_size_[0]) / static_cast<float>(src_h);
+    return {scale_x, scale_y};
+  }
+
+  const int src_long = std::max(src_w, src_h);
+  const int src_short = std::min(src_w, src_h);
+  const int target_long =
+      *std::max_element(target_size_.begin(), target_size_.end());
+  const int target_short =
+      *std::min_element(target_size_.begin(), target_size_.end());
+
+  const float fit_short =
+      static_cast<float>(target_short) / static_cast<float>(src_short);
+  const float fit_long =
+      static_cast<float>(target_long) / static_cast<float>(src_long);
+  const float uniform = std::min(fit_short, fit_long);
+  return {uniform, uniform};
+}
+
+// Letterbox resize: scales the image uniformly so it fits inside
+// target_size_ ({height, width}), then pads it to exactly that size with a
+// constant gray border centred around the image.
+//
+// Blob fields written:
+//   im_shape_     - {rows, cols} of the resized image before padding.
+//   in_net_shape_ - {rows, cols} after padding.
+//   scale_factor_ - the uniform scale, stored for both axes.
+void LetterBoxResize::Run(cv::Mat* im, ImageBlob* data) {
+  float resize_scale = GenerateScale(*im);
+  int new_shape_w = std::round(im->cols * resize_scale);
+  int new_shape_h = std::round(im->rows * resize_scale);
+  data->im_shape_ = {static_cast<float>(new_shape_h),
+                     static_cast<float>(new_shape_w)};
+  float padw = (target_size_[1] - new_shape_w) / 2.;
+  float padh = (target_size_[0] - new_shape_h) / 2.;
+
+  // The -0.1 / +0.1 nudges split an odd total padding so that the extra pixel
+  // goes to the bottom / right side.
+  int top = std::round(padh - 0.1);
+  int bottom = std::round(padh + 0.1);
+  int left = std::round(padw - 0.1);
+  int right = std::round(padw + 0.1);
+
+  cv::resize(
+      *im, *im, cv::Size(new_shape_w, new_shape_h), 0, 0, cv::INTER_AREA);
+
+  // cv::Scalar(127.5) would only set channel 0 and leave the others at 0;
+  // all three channels are given explicitly so the border is uniform gray.
+  cv::copyMakeBorder(*im,
+                     *im,
+                     top,
+                     bottom,
+                     left,
+                     right,
+                     cv::BORDER_CONSTANT,
+                     cv::Scalar(127.5, 127.5, 127.5));
+
+  data->in_net_shape_ = {
+      static_cast<float>(im->rows), static_cast<float>(im->cols),
+  };
+
+  data->scale_factor_ = {
+      resize_scale, resize_scale,
+  };
+}
+
+// Returns the single scale factor that makes the image fit inside
+// target_size_ ({height, width}) while keeping its aspect ratio: the smaller
+// of the per-axis target/source ratios.
+float LetterBoxResize::GenerateScale(const cv::Mat& im) {
+  const int src_h = im.rows;
+  const int src_w = im.cols;
+  const int dst_h = target_size_[0];
+  const int dst_w = target_size_[1];
+
+  const float fit_h = static_cast<float>(dst_h) / static_cast<float>(src_h);
+  const float fit_w = static_cast<float>(dst_w) / static_cast<float>(src_w);
+  return std::min(fit_h, fit_w);
+}
+
+// Pads the image with zeros on the bottom and right so that both dimensions
+// become multiples of stride_, then stores a copy in in_net_im_ and the
+// padded {rows, cols} in in_net_shape_. With a non-positive stride_ the image
+// is only copied to in_net_im_ and in_net_shape_ is left untouched.
+void PadStride::Run(cv::Mat* im, ImageBlob* data) {
+  if (stride_ > 0) {
+    const int height = im->rows;
+    const int width = im->cols;
+    // Round each dimension up to the next multiple of stride_.
+    const int padded_h =
+        (height / stride_ + (height % stride_ != 0 ? 1 : 0)) * stride_;
+    const int padded_w =
+        (width / stride_ + (width % stride_ != 0 ? 1 : 0)) * stride_;
+    cv::copyMakeBorder(*im,
+                       *im,
+                       0,
+                       padded_h - height,
+                       0,
+                       padded_w - width,
+                       cv::BORDER_CONSTANT,
+                       cv::Scalar(0));
+    data->in_net_shape_ = {
+        static_cast<float>(im->rows), static_cast<float>(im->cols),
+    };
+  }
+  data->in_net_im_ = im->clone();
+}
+
+// Resizes the image in place to trainsize_ (read as {width, height}) and
+// records the result as in_net_shape_ = {height, width}.
+void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) {
+  const int out_w = trainsize_[0];
+  const int out_h = trainsize_[1];
+  cv::resize(*im, *im, cv::Size(out_w, out_h), 0, 0, interp_);
+  // todo: Simd::ResizeBilinear();
+  data->in_net_shape_ = {
+      static_cast<float>(out_h), static_cast<float>(out_w),
+  };
+}
+
+// Computes the 2x3 affine transform that maps a square region of side
+// input_size.x centred on `center` onto an output of size
+// output_size (x = width, y = height), and stores it in *trans.
+//
+// Three point pairs define the transform on each side: the centre, a point
+// half a width straight above it, and a third point obtained by rotating that
+// offset by 90 degrees. Only input_size.x is used; input_size.y is ignored.
+void GetAffineTrans(const cv::Point2f center,
+                    const cv::Point2f input_size,
+                    const cv::Point2f output_size,
+                    cv::Mat* trans) {
+  const float src_w = input_size.x;
+  const float dst_w = output_size.x;
+  const float dst_h = output_size.y;
+
+  const cv::Point2f src_up(0, -0.5 * src_w);
+  const cv::Point2f dst_up(0, -0.5 * dst_w);
+
+  cv::Point2f src_pts[3];
+  src_pts[0] = center;
+  src_pts[1] = center + src_up;
+  const cv::Point2f src_delta = src_pts[0] - src_pts[1];
+  src_pts[2] = src_pts[1] + cv::Point2f(-src_delta.y, src_delta.x);
+
+  cv::Point2f dst_pts[3];
+  dst_pts[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
+  dst_pts[1] = cv::Point2f(dst_w * 0.5, dst_h * 0.5) + dst_up;
+  const cv::Point2f dst_delta = dst_pts[0] - dst_pts[1];
+  dst_pts[2] = dst_pts[1] + cv::Point2f(-dst_delta.y, dst_delta.x);
+
+  *trans = cv::getAffineTransform(src_pts, dst_pts);
+}
+
+// Swaps the R and B channels, then warps the image to input_w_ x input_h_
+// with the transform from GetAffineTrans, and records
+// in_net_shape_ = {input_h_, input_w_}.
+//
+// keep_res_ == true : the output size is derived from the image itself; each
+//                     dimension is OR-ed with pad_ and incremented, which
+//                     rounds it up to a multiple of (pad_ + 1) when pad_ is
+//                     of the form 2^k - 1 (presumably e.g. 31 -- confirm
+//                     against the model config).
+// keep_res_ == false: the configured input_w_ / input_h_ are used and the
+//                     source region is the square of side max(h, w) around
+//                     the image centre.
+void WarpAffine::Run(cv::Mat* im, ImageBlob* data) {
+  cv::cvtColor(*im, *im, cv::COLOR_RGB2BGR);
+  cv::Mat trans(2, 3, CV_32FC1);
+  cv::Point2f center;
+  cv::Point2f input_size;
+  int h = im->rows;
+  int w = im->cols;
+  if (keep_res_) {
+    input_h_ = (h | pad_) + 1;
+    // Width uses the same bitwise-OR rounding as height; the previous
+    // `w + pad_` did not produce an aligned width.
+    input_w_ = (w | pad_) + 1;
+    input_size = cv::Point2f(input_w_, input_h_);
+    center = cv::Point2f(w / 2, h / 2);
+  } else {
+    float s = std::max(h, w) * 1.0;
+    input_size = cv::Point2f(s, s);
+    center = cv::Point2f(w / 2., h / 2.);
+  }
+  cv::Point2f output_size(input_w_, input_h_);
+
+  GetAffineTrans(center, input_size, output_size, &trans);
+  cv::warpAffine(*im, *im, trans, cv::Size(input_w_, input_h_));
+  data->in_net_shape_ = {
+      static_cast<float>(input_h_), static_cast<float>(input_w_),
+  };
+}
+
+// Pads the image on the bottom and right up to size_ ({height, width}) with a
+// constant gray value, stores a copy in in_net_im_ and the padded
+// {rows, cols} in in_net_shape_. If the image already has the requested size
+// it is only copied to in_net_im_.
+// size_ is expected to be at least as large as the image in both dimensions;
+// a smaller size_ yields a negative border, which copyMakeBorder rejects.
+void Pad::Run(cv::Mat* im, ImageBlob* data) {
+  int h = size_[0];
+  int w = size_[1];
+  int rh = im->rows;
+  int rw = im->cols;
+  if (h == rh && w == rw){
+    data->in_net_im_ = im->clone();
+    return;
+  }
+  // cv::Scalar(114) would only fill channel 0 and leave the others at 0;
+  // every channel is set so the border is uniform.
+  cv::copyMakeBorder(*im,
+                     *im,
+                     0,
+                     h - rh,
+                     0,
+                     w - rw,
+                     cv::BORDER_CONSTANT,
+                     cv::Scalar(114, 114, 114));
+  data->in_net_im_ = im->clone();
+  data->in_net_shape_ = {
+      static_cast<float>(im->rows), static_cast<float>(im->cols),
+  };
+}
+
+// Preprocessor op running order.
+// Preprocessor::Run walks this list front to back and executes every op that
+// is present in ops_, so the position of a name here (not the order in which
+// ops were registered) decides when it runs.
+const std::vector<std::string> Preprocessor::RUN_ORDER = {"InitInfo",
+                                                          "TopDownEvalAffine",
+                                                          "Resize",
+                                                          "LetterBoxResize",
+                                                          "WarpAffine",
+                                                          "NormalizeImage",
+                                                          "PadStride",
+                                                          "Pad",
+                                                          "Permute"};
+
+// Applies the configured preprocessing ops to the image, in RUN_ORDER.
+// Names in RUN_ORDER that have no entry in ops_ are skipped.
+void Preprocessor::Run(cv::Mat* im, ImageBlob* data) {
+  for (const auto& op_name : RUN_ORDER) {
+    auto found = ops_.find(op_name);
+    if (found == ops_.end()) {
+      continue;
+    }
+    found->second->Run(im, data);
+  }
+}
+
+// Crops a region around the box `area` ({x1, y1, x2, y2}) out of img.
+//
+// The box is first clipped to the image, then its shorter half-extent is
+// enlarged so that half_w : half_h is 3 : 4, and finally both half-extents
+// are grown by (1 + expandratio) and clipped to the image again.
+// Outputs:
+//   crop_img - view into img covering the final region (inclusive bounds).
+//   center   - {x, y} midpoint of the final region (integer division).
+//   scale    - {width, height} of the final region as x2 - x1, y2 - y1.
+void CropImg(cv::Mat& img,
+             cv::Mat& crop_img,
+             std::vector<int>& area,
+             std::vector<float>& center,
+             std::vector<float>& scale,
+             float expandratio) {
+  const int last_col = img.cols - 1;
+  const int last_row = img.rows - 1;
+
+  // Box clipped to the image.
+  int x1 = std::max(0, area[0]);
+  int y1 = std::max(0, area[1]);
+  int x2 = std::min(last_col, area[2]);
+  int y2 = std::min(last_row, area[3]);
+
+  const int mid_x = (x1 + x2) / 2.;
+  const int mid_y = (y1 + y2) / 2.;
+  int half_h = (y2 - y1) / 2.;
+  int half_w = (x2 - x1) / 2.;
+
+  // adjust h or w to keep image ratio, expand the shorter edge
+  if (half_h * 3 > half_w * 4) {
+    half_w = static_cast<int>(half_h * 0.75);
+  } else {
+    half_h = static_cast<int>(half_w * 4 / 3);
+  }
+
+  x1 = std::max(0, mid_x - static_cast<int>(half_w * (1 + expandratio)));
+  y1 = std::max(0, mid_y - static_cast<int>(half_h * (1 + expandratio)));
+  x2 = std::min(last_col,
+                static_cast<int>(mid_x + half_w * (1 + expandratio)));
+  y2 = std::min(last_row,
+                static_cast<int>(mid_y + half_h * (1 + expandratio)));
+  crop_img = img(cv::Range(y1, y2 + 1), cv::Range(x1, x2 + 1));
+
+  center.clear();
+  center.emplace_back((x1 + x2) / 2);
+  center.emplace_back((y1 + y2) / 2);
+
+  scale.clear();
+  scale.emplace_back(x2 - x1);
+  scale.emplace_back(y2 - y1);
+}
+
+// Returns true when the batch contains images of differing sizes, i.e. when
+// at least one image's rows or cols differ from those of the first image.
+// A batch with zero or one image is never dynamic.
+bool CheckDynamicInput(const std::vector<cv::Mat>& imgs) {
+  // <= 1 also covers the empty batch, which previously reached imgs.at(0)
+  // and threw std::out_of_range.
+  if (imgs.size() <= 1) return false;
+
+  const int h = imgs.front().rows;
+  const int w = imgs.front().cols;
+  for (size_t i = 1; i < imgs.size(); ++i) {
+    if (imgs[i].rows != h || imgs[i].cols != w) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Pads every image of the batch with zeros on the bottom and right so that
+// all returned images share the largest height and largest width found in
+// the batch. The returned vector has one entry per input image, in order.
+std::vector<cv::Mat> PadBatch(const std::vector<cv::Mat>& imgs) {
+  // find max_h and max_w in batch
+  int batch_h = 0;
+  int batch_w = 0;
+  for (const cv::Mat& img : imgs) {
+    batch_h = std::max(batch_h, img.rows);
+    batch_w = std::max(batch_w, img.cols);
+  }
+
+  std::vector<cv::Mat> padded;
+  for (const cv::Mat& img : imgs) {
+    cv::Mat canvas = img;  // copy of the Mat header, used as source and dest
+    cv::copyMakeBorder(canvas,
+                       canvas,
+                       0,
+                       batch_h - img.rows,
+                       0,
+                       batch_w - img.cols,
+                       cv::BORDER_CONSTANT,
+                       cv::Scalar(0));
+    padded.push_back(canvas);
+  }
+  return padded;
+}
+
+}  // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/tracker.cc
+++ b/paddle_detection/deploy/cpp/src/tracker.cc
@@ -0,0 +1,333 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The code is based on:
+// https://github.com/CnybTseng/JDE/blob/master/platforms/common/jdetracker.cpp
+// The copyright of CnybTseng/JDE is as follows:
+// MIT License 
+
+#include <map>
+#include <stdio.h>
+#include <limits.h>
+#include <algorithm>
+
+#include "include/lapjv.h"
+#include "include/tracker.h"
+
+// Builds a cv::Vec4f from the four float elements (0,0)..(0,3) of matrix m.
+// m is pasted unparenthesised, so pass a plain identifier, not an expression.
+#define mat2vec4f(m) cv::Vec4f(*m.ptr<float>(0,0), *m.ptr<float>(0,1), *m.ptr<float>(0,2), *m.ptr<float>(0,3))
+
+namespace PaddleDetection {
+
+// Lookup table keyed by an integer 1..9. The values are the 0.95 quantiles of
+// the chi-square distribution for that many degrees of freedom.
+// motion_distance uses entry 4 as its gating threshold on the Mahalanobis
+// distance.
+static std::map<int, float> chi2inv95 = {
+    {1,  3.841459f},
+    {2,  5.991465f},
+    {3,  7.814728f},
+    {4,  9.487729f},
+    {5, 11.070498f},
+    {6, 12.591587f},
+    {7, 14.067140f},
+    {8, 15.507313f},
+    {9, 16.918978f}
+};
+
+// The single shared tracker object, heap-allocated during static
+// initialisation. Nothing in this file deletes it.
+JDETracker *JDETracker::me = new JDETracker;
+
+// Returns the shared tracker instance.
+JDETracker *JDETracker::instance(void)
+{
+    return me;
+}
+
+// Default tracker settings: frame counter starts at 0, lost trajectories are
+// dropped once they are more than 30 frames old, the embedding distance gets
+// weight 0.98 in the fused cost, and unmatched detections need a score of at
+// least 0.3 to start a new trajectory.
+JDETracker::JDETracker(void)
+    : timestamp(0),
+      max_lost_time(30),
+      lambda(0.98f),
+      det_thresh(0.3f)
+{
+}
+
+// Advances the tracker by one frame.
+//   dets   - one detection per row; column 1 is read as the score and columns
+//            2..5 as the box (ltrb). Other columns are not used here.
+//   emb    - one embedding per row, row i belonging to detection i.
+//   tracks - cleared, then filled with {id, score, ltrb} of every activated
+//            trajectory that is tracked after this frame.
+// The detections are associated with existing trajectories in three passes
+// (fused motion/embedding cost, then IoU for leftovers, then IoU against
+// unconfirmed trajectories); afterwards the tracked / lost / removed pools
+// are updated.
+// NOTE(review): always returns 0 (false); callers should not treat the return
+// value as a success flag.
+bool JDETracker::update(const cv::Mat &dets, const cv::Mat &emb, std::vector<Track> &tracks)
+{
+    ++timestamp;
+    // Wrap every detection row into a candidate Trajectory.
+    TrajectoryPool candidates(dets.rows);
+    for (int i = 0; i < dets.rows; ++i)
+    {
+        float score = *dets.ptr<float>(i, 1);
+        const cv::Mat &ltrb_ = dets(cv::Rect(2, i, 4, 1));
+        cv::Vec4f ltrb = mat2vec4f(ltrb_);
+        const cv::Mat &embedding = emb(cv::Rect(0, i, emb.cols, 1));
+        candidates[i] = Trajectory(ltrb, score, embedding);
+    }
+
+    
+    // Split the currently tracked trajectories into activated and not yet
+    // activated ("unconfirmed") ones. These local pools hold pointers into
+    // this->tracked_trajectories and shadow the members of the same name.
+    TrajectoryPtrPool tracked_trajectories;
+    TrajectoryPtrPool unconfirmed_trajectories;
+    for (size_t i = 0; i < this->tracked_trajectories.size(); ++i)
+    {
+        if (this->tracked_trajectories[i].is_activated)
+            tracked_trajectories.push_back(&this->tracked_trajectories[i]);
+        else
+            unconfirmed_trajectories.push_back(&this->tracked_trajectories[i]);
+    }
+    
+    
+    // Pass 1 pool: activated tracked trajectories plus the lost ones.
+    TrajectoryPtrPool trajectory_pool = tracked_trajectories + this->lost_trajectories;
+    
+    for (size_t i = 0; i < trajectory_pool.size(); ++i)
+        trajectory_pool[i]->predict();   
+    
+    Match matches;
+    std::vector<int> mismatch_row;
+    std::vector<int> mismatch_col;
+
+    // Pass 1: match the pool against all candidates with the fused cost from
+    // motion_distance, cost limit 0.7.
+    cv::Mat cost = motion_distance(trajectory_pool, candidates);
+    linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col);
+    
+    MatchIterator miter;
+    TrajectoryPtrPool activated_trajectories;
+    TrajectoryPtrPool retrieved_trajectories;    
+
+    
+    // Matched trajectories: Tracked ones are updated, all others (taken from
+    // the lost pool) are reactivated.
+    for (miter = matches.begin(); miter != matches.end(); miter++)
+    {
+        Trajectory *pt = trajectory_pool[miter->first];
+        Trajectory &ct = candidates[miter->second];
+        if (pt->state == Tracked)
+        {
+            pt->update(ct, timestamp);
+            activated_trajectories.push_back(pt);
+        }
+        else
+        {
+            pt->reactivate(ct, timestamp);
+            retrieved_trajectories.push_back(pt);
+        }
+    }
+    
+    // Candidates left unmatched by pass 1.
+    TrajectoryPtrPool next_candidates(mismatch_col.size());
+    for (size_t i = 0; i < mismatch_col.size(); ++i)
+        next_candidates[i] = &candidates[mismatch_col[i]];
+    
+    // Trajectories left unmatched by pass 1; only those in state Tracked get
+    // a second chance.
+    TrajectoryPtrPool next_trajectory_pool;
+    for (size_t i = 0; i < mismatch_row.size(); ++i)
+    {
+        int j = mismatch_row[i];
+        if (trajectory_pool[j]->state == Tracked)
+            next_trajectory_pool.push_back(trajectory_pool[j]);
+    }
+    
+    // Pass 2: match the leftovers by IoU distance, cost limit 0.5.
+    cost = iou_distance(next_trajectory_pool, next_candidates);
+    linear_assignment(cost, 0.5f, matches, mismatch_row, mismatch_col);
+    
+    for (miter = matches.begin(); miter != matches.end(); miter++)
+    {
+        Trajectory *pt = next_trajectory_pool[miter->first];
+        Trajectory *ct = next_candidates[miter->second];
+        if (pt->state == Tracked)
+        {
+            pt->update(*ct, timestamp);
+            activated_trajectories.push_back(pt);
+        }
+        else
+        {
+            pt->reactivate(*ct, timestamp);
+            retrieved_trajectories.push_back(pt);
+        }
+    }
+    
+    // Trajectories that failed both passes are marked lost. This local pool
+    // shadows this->lost_trajectories.
+    TrajectoryPtrPool lost_trajectories;
+    for (size_t i = 0; i < mismatch_row.size(); ++i)
+    {
+        Trajectory *pt = next_trajectory_pool[mismatch_row[i]];
+        if (pt->state != Lost)
+        {
+            pt->mark_lost();
+            lost_trajectories.push_back(pt);
+        }
+    }
+    
+    // Pass 3: candidates still unmatched are compared with the unconfirmed
+    // trajectories by IoU distance, cost limit 0.7.
+    TrajectoryPtrPool nnext_candidates(mismatch_col.size());
+    for (size_t i = 0; i < mismatch_col.size(); ++i)
+        nnext_candidates[i] = next_candidates[mismatch_col[i]];
+    cost = iou_distance(unconfirmed_trajectories, nnext_candidates);
+    linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col);
+    
+    for (miter = matches.begin(); miter != matches.end(); miter++)
+    {
+        unconfirmed_trajectories[miter->first]->update(*nnext_candidates[miter->second], timestamp);
+        activated_trajectories.push_back(unconfirmed_trajectories[miter->first]);
+    }
+    
+    // Local pool; shadows this->removed_trajectories.
+    TrajectoryPtrPool removed_trajectories;
+
+    // Unconfirmed trajectories without a match are removed.
+    for (size_t i = 0; i < mismatch_row.size(); ++i)
+    {
+        unconfirmed_trajectories[mismatch_row[i]]->mark_removed();
+        removed_trajectories.push_back(unconfirmed_trajectories[mismatch_row[i]]);
+    }
+    
+    // Candidates that matched nothing start a new trajectory, provided their
+    // score reaches det_thresh.
+    for (size_t i = 0; i < mismatch_col.size(); ++i)
+    {
+        if (nnext_candidates[mismatch_col[i]]->score < det_thresh) continue;
+        nnext_candidates[mismatch_col[i]]->activate(timestamp);
+        activated_trajectories.push_back(nnext_candidates[mismatch_col[i]]);
+    }
+    
+    // Lost trajectories not seen for more than max_lost_time frames are
+    // removed.
+    for (size_t i = 0; i < this->lost_trajectories.size(); ++i)
+    {
+        Trajectory &lt = this->lost_trajectories[i];
+        if (timestamp - lt.timestamp > max_lost_time)
+        {
+            lt.mark_removed();
+            removed_trajectories.push_back(&lt);
+        }
+    }
+    
+    // Drop everything from the tracked pool that is no longer in state
+    // Tracked.
+    // NOTE(review): the pointer pools built above point into this container;
+    // erasing shifts its elements -- confirm the pool operators used below
+    // tolerate that.
+    TrajectoryPoolIterator piter;
+    for (piter = this->tracked_trajectories.begin(); piter != this->tracked_trajectories.end(); )
+    {
+        if (piter->state != Tracked)
+            piter = this->tracked_trajectories.erase(piter);
+        else
+            ++piter;
+    }
+    
+    // Merge this frame's results into the member pools. The += / -= pool
+    // operators are defined outside this file.
+    this->tracked_trajectories += activated_trajectories;
+    this->tracked_trajectories += retrieved_trajectories;
+
+    this->lost_trajectories -= this->tracked_trajectories;
+    this->lost_trajectories += lost_trajectories;
+    this->lost_trajectories -= this->removed_trajectories;
+    this->removed_trajectories += removed_trajectories;
+    remove_duplicate_trajectory(this->tracked_trajectories, this->lost_trajectories);
+    
+    // Report every activated tracked trajectory to the caller.
+    tracks.clear();
+    for (size_t i = 0; i < this->tracked_trajectories.size(); ++i)
+    {
+        if (this->tracked_trajectories[i].is_activated)
+        {
+            Track track = {
+                .id = this->tracked_trajectories[i].id,
+                .score = this->tracked_trajectories[i].score,
+                .ltrb = this->tracked_trajectories[i].ltrb};
+            tracks.push_back(track);
+        }
+    }
+    return 0;
+}
+
+
+// Builds the pass-1 cost matrix (rows: trajectories in a, columns: candidates
+// in b) as lambda * embedding distance + (1 - lambda) * Mahalanobis distance.
+// Pairs whose Mahalanobis distance exceeds chi2inv95[4] are gated out by
+// setting their cost to FLT_MAX. If either pool is empty, a matrix with the
+// corresponding zero dimension is returned.
+cv::Mat JDETracker::motion_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
+{
+    if (a.size() == 0 || b.size() == 0)
+        return cv::Mat(a.size(), b.size(), CV_32F);
+
+    cv::Mat appearance = embedding_distance(a, b);
+    cv::Mat motion = mahalanobis_distance(a, b);
+    cv::Mat fused = lambda * appearance + (1 - lambda) * motion;
+
+    const float gate = chi2inv95[4];
+    for (int r = 0; r < fused.rows; ++r)
+    {
+        const float *motion_row = motion.ptr<float>(r);
+        float *fused_row = fused.ptr<float>(r);
+        for (int c = 0; c < fused.cols; ++c)
+        {
+            if (motion_row[c] > gate)
+                fused_row[c] = FLT_MAX;
+        }
+    }
+
+    return fused;
+}
+
+// Solves the assignment problem for `cost` with lapjv_internal, using
+// cost_limit as the upper bound for an accepted pairing.
+// Outputs (all cleared first):
+//   matches      - row index -> column index for every assigned row.
+//   mismatch_row - rows without an assignment.
+//   mismatch_col - columns without an assignment.
+// For an empty cost matrix every row and column index is reported as
+// unmatched and the solver is not called.
+void JDETracker::linear_assignment(const cv::Mat &cost, float cost_limit, Match &matches,
+    std::vector<int> &mismatch_row, std::vector<int> &mismatch_col)
+{
+    matches.clear();
+    mismatch_row.clear();
+    mismatch_col.clear();
+
+    if (cost.empty())
+    {
+        for (int r = 0; r < cost.rows; ++r)
+            mismatch_row.push_back(r);
+        for (int c = 0; c < cost.cols; ++c)
+            mismatch_col.push_back(c);
+        return;
+    }
+
+    // Solver output: assigned column per row and assigned row per column;
+    // negative entries mean "unassigned".
+    cv::Mat row_to_col(cost.rows, 1, CV_32S);
+    cv::Mat col_to_row(cost.cols, 1, CV_32S);
+
+    lapjv_internal(cost, true, cost_limit,
+        (int *)row_to_col.data, (int *)col_to_row.data);
+
+    for (int r = 0; r < row_to_col.rows; ++r)
+    {
+        const int c = *row_to_col.ptr<int>(r);
+        if (c < 0)
+            mismatch_row.push_back(r);
+        else
+            matches.insert({r, c});
+    }
+
+    for (int c = 0; c < col_to_row.rows; ++c)
+    {
+        if (*col_to_row.ptr<int>(c) < 0)
+            mismatch_col.push_back(c);
+    }
+}
+
+// Removes trajectories that appear in both pools as near-duplicates.
+// Every pair (a[i], b[j]) whose IoU distance is below iou_thresh counts as a
+// duplicate; of the two, the one with the shorter lifetime
+// (timestamp - starttime) is erased from its pool, and on a tie the entry of
+// a is erased. Does nothing when either pool is empty.
+void JDETracker::remove_duplicate_trajectory(TrajectoryPool &a, TrajectoryPool &b, float iou_thresh)
+{
+    if (a.size() == 0 || b.size() == 0)
+        return;
+
+    cv::Mat dist = iou_distance(a, b);
+    cv::Mat close_pairs = dist < iou_thresh;
+    std::vector<cv::Point> hits;
+    cv::findNonZero(close_pairs, hits);
+
+    // Indices to drop from a and from b (point.y is the row / index into a,
+    // point.x the column / index into b).
+    std::vector<int> drop_a;
+    std::vector<int> drop_b;
+    for (size_t k = 0; k < hits.size(); ++k)
+    {
+        const int ia = hits[k].y;
+        const int ib = hits[k].x;
+        int age_a = a[ia].timestamp - a[ia].starttime;
+        int age_b = b[ib].timestamp - b[ib].starttime;
+        if (age_a > age_b)
+            drop_b.push_back(ib);
+        else
+            drop_a.push_back(ia);
+    }
+
+    // Erase by original position: `slot` counts positions in the pool as it
+    // was before any erase.
+    TrajectoryPoolIterator piter;
+    int slot = 0;
+    for (piter = a.begin(); piter != a.end(); ++slot)
+    {
+        if (std::find(drop_a.begin(), drop_a.end(), slot) != drop_a.end())
+            piter = a.erase(piter);
+        else
+            ++piter;
+    }
+
+    slot = 0;
+    for (piter = b.begin(); piter != b.end(); ++slot)
+    {
+        if (std::find(drop_b.begin(), drop_b.end(), slot) != drop_b.end())
+            piter = b.erase(piter);
+        else
+            ++piter;
+    }
+}
+
+}   // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/trajectory.cc
+++ b/paddle_detection/deploy/cpp/src/trajectory.cc
@@ -0,0 +1,584 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The code is based on:
+// https://github.com/CnybTseng/JDE/blob/master/platforms/common/trajectory.cpp
+// The copyright of CnybTseng/JDE is as follows:
+// MIT License
+
+#include <algorithm>
+
+#include "include/trajectory.h"
+
+namespace PaddleDetection {
+
+// Initializes the filter from a first measurement.
+//
+// The measurement is copied into rows 0-3 of the posterior state and rows 4-7
+// are zeroed. The posterior error covariance becomes a diagonal matrix whose
+// entries scale with measurement component 3 (the "h" of the xyah vector that
+// callers pass in), except for the fixed constants at indices 2 and 6.
+// The predicted state and covariance are then set to copies of the posterior.
+void TKalmanFilter::init(const cv::Mat &measurement)
+{
+    // State: measured part on top, remaining four components start at zero.
+    measurement.copyTo(statePost(cv::Rect(0, 0, 1, 4)));
+    statePost(cv::Rect(0, 4, 1, 4)).setTo(0);
+    statePost.copyTo(statePre);
+
+    // Standard deviations proportional to component 3, then squared.
+    const float h = measurement.at<float>(3, 0);
+    const float pos_std = 2 * std_weight_position * h;
+    const float vel_std = 10 * std_weight_velocity * h;
+    const float varpos = pos_std * pos_std;
+    const float varvel = vel_std * vel_std;
+
+    const float diag[8] = {varpos, varpos, 1e-4f, varpos,
+                           varvel, varvel, 1e-10f, varvel};
+    errorCovPost.setTo(0);
+    for (int k = 0; k < 8; ++k)
+        errorCovPost.at<float>(k, k) = diag[k];
+    errorCovPost.copyTo(errorCovPre);
+}
+
+// Runs the Kalman prediction step with a state-dependent process noise.
+//
+// The process-noise covariance is rebuilt as a diagonal matrix on every call:
+// the position and velocity variances scale with component 3 of statePre
+// (read before the base-class predict() runs), while entries 2 and 6 are
+// fixed constants. Returns the result of cv::KalmanFilter::predict().
+const cv::Mat &TKalmanFilter::predict()
+{
+    const float h = statePre.at<float>(3, 0);
+    const float pos_std = std_weight_position * h;
+    const float vel_std = std_weight_velocity * h;
+    const float varpos = pos_std * pos_std;
+    const float varvel = vel_std * vel_std;
+
+    const float diag[8] = {varpos, varpos, 1e-4f, varpos,
+                           varvel, varvel, 1e-10f, varvel};
+    processNoiseCov.setTo(0);
+    for (int k = 0; k < 8; ++k)
+        processNoiseCov.at<float>(k, k) = diag[k];
+
+    return cv::KalmanFilter::predict();
+}
+
+// Runs the Kalman correction step for `measurement`.
+//
+// The measurement-noise covariance is rebuilt as a diagonal matrix whose
+// entries 0, 1 and 3 scale with measurement component 3; entry 2 is a fixed
+// constant. Returns the result of cv::KalmanFilter::correct().
+const cv::Mat &TKalmanFilter::correct(const cv::Mat &measurement)
+{
+    const float pos_std = std_weight_position * measurement.at<float>(3, 0);
+    const float varpos = pos_std * pos_std;
+
+    const float diag[4] = {varpos, varpos, 1e-2f, varpos};
+    measurementNoiseCov.setTo(0);
+    for (int k = 0; k < 4; ++k)
+        measurementNoiseCov.at<float>(k, k) = diag[k];
+
+    return cv::KalmanFilter::correct(measurement);
+}
+
+// Projects the posterior state distribution into measurement space.
+//
+// mean       <- measurementMatrix * statePost
+// covariance <- measurementMatrix * errorCovPost * measurementMatrix^T + R
+// where R is a 4x4 diagonal noise matrix built from component 3 of statePost
+// (entry 2 is a fixed constant). The filter itself is not modified.
+void TKalmanFilter::project(cv::Mat &mean, cv::Mat &covariance) const
+{
+    const float pos_std = std_weight_position * statePost.at<float>(3, 0);
+    const float varpos = pos_std * pos_std;
+
+    // Local noise matrix; the member measurementNoiseCov is left untouched.
+    const float diag[4] = {varpos, varpos, 1e-2f, varpos};
+    cv::Mat noise = cv::Mat::zeros(4, 4, CV_32F);
+    for (int k = 0; k < 4; ++k)
+        noise.at<float>(k, k) = diag[k];
+
+    mean = measurementMatrix * statePost;
+    const cv::Mat projected = measurementMatrix * errorCovPost;
+    cv::gemm(projected, measurementMatrix, 1, noise, 1, covariance, cv::GEMM_2_T);
+}
+
+int Trajectory::count = 0;  // static member definition, starts at 0; presumably backs next_id() -- confirm in trajectory.h
+
+// Advances the trajectory's Kalman state by one step.
+//
+// For a trajectory that is not in the Tracked state, posterior state
+// component 7 is reset to zero before predicting. Returns the result of
+// TKalmanFilter::predict().
+const cv::Mat &Trajectory::predict(void)
+{
+    const bool tracked = (state == Tracked);
+    if (!tracked)
+    {
+        cv::KalmanFilter::statePost.at<float>(7, 0) = 0;
+    }
+    return TKalmanFilter::predict();
+}
+
+// Updates this trajectory with a matched trajectory `traj` at `timestamp_`.
+//
+// Copies the box (ltrb, xyah) and score from `traj`, runs a Kalman correction
+// with traj.xyah, increments the track length and marks the trajectory as
+// Tracked and activated. The appearance embedding is refreshed from
+// traj.current_embedding only when `update_embedding_` is true.
+void Trajectory::update(Trajectory &traj, int timestamp_, bool update_embedding_)
+{
+    // Bookkeeping and geometry taken over from the match.
+    ltrb = traj.ltrb;
+    xyah = traj.xyah;
+    timestamp = timestamp_;
+    ++length;
+
+    TKalmanFilter::correct(cv::Mat(traj.xyah));
+
+    score = traj.score;
+    is_activated = true;
+    state = Tracked;
+
+    if (update_embedding_)
+    {
+        update_embedding(traj.current_embedding);
+    }
+}
+
+// Starts a new track from this trajectory at `timestamp_`.
+//
+// Assigns a fresh id via next_id(), initializes the Kalman filter from the
+// trajectory's own xyah, resets the length and records `timestamp_` as both
+// the current timestamp and the start time. is_activated is set only when
+// `timestamp_` equals 1; otherwise it keeps its current value.
+void Trajectory::activate(int timestamp_)
+{
+    id = next_id();
+    TKalmanFilter::init(cv::Mat(xyah));
+
+    state = Tracked;
+    length = 0;
+    starttime = timestamp_;
+    timestamp = timestamp_;
+
+    if (1 == timestamp_)
+        is_activated = true;
+}
+
+// Brings this trajectory back to the Tracked state using the match `traj`.
+//
+// Runs a Kalman correction with traj.xyah, refreshes the embedding from
+// traj.current_embedding, resets the length and stamps the trajectory with
+// `timestamp_`. A new id is drawn from next_id() only when `newid` is true.
+void Trajectory::reactivate(Trajectory &traj, int timestamp_, bool newid)
+{
+    TKalmanFilter::correct(cv::Mat(traj.xyah));
+    update_embedding(traj.current_embedding);
+
+    timestamp = timestamp_;
+    is_activated = true;
+    state = Tracked;
+    length = 0;
+
+    if (newid)
+    {
+        id = next_id();
+    }
+}
+
+// Stores the L2-normalized `embedding` as current_embedding and folds it into
+// the smoothed embedding:
+//     smooth = normalize(eta * smooth + (1 - eta) * current)
+// The first usable embedding initializes the smoothed one directly.
+//
+// An empty or zero-norm `embedding` is ignored: dividing by a zero norm would
+// fill the vectors with NaN, and every later smoothing step would keep them
+// NaN.
+void Trajectory::update_embedding(const cv::Mat &embedding)
+{
+    const double norm = cv::norm(embedding);
+    if (!(norm > 0))    // also rejects a NaN norm
+        return;
+
+    current_embedding = embedding / norm;
+    if (smooth_embedding.empty())
+    {
+        // clone(): plain cv::Mat assignment only shares the buffer, so a later
+        // in-place write to current_embedding would also overwrite the
+        // smoothed vector.
+        smooth_embedding = current_embedding.clone();
+    }
+    else
+    {
+        smooth_embedding = eta * smooth_embedding + (1 - eta) * current_embedding;
+    }
+
+    const double smooth_norm = cv::norm(smooth_embedding);
+    if (smooth_norm > 0)
+        smooth_embedding = smooth_embedding / smooth_norm;
+    else
+        smooth_embedding = current_embedding.clone();   // blend cancelled out
+}
+
+// Union of two pools by trajectory id.
+//
+// The result holds every element of `a` in its original order, followed by
+// each element of `b` whose id has not been seen yet (neither in `a` nor
+// earlier in `b`).
+TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPool &b)
+{
+    TrajectoryPool merged(a);
+
+    std::vector<int> seen;
+    seen.reserve(a.size() + b.size());
+    for (size_t k = 0; k < a.size(); ++k)
+        seen.push_back(a[k].id);
+
+    for (const auto &traj : b)
+    {
+        if (std::find(seen.begin(), seen.end(), traj.id) != seen.end())
+            continue;
+        merged.push_back(traj);
+        seen.push_back(traj.id);
+    }
+
+    return merged;
+}
+
+// Union of a value pool and a pointer pool by trajectory id.
+//
+// The result holds every element of `a` in its original order, followed by a
+// copy of each trajectory pointed to by `b` whose id has not been seen yet.
+TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPtrPool &b)
+{
+    TrajectoryPool merged(a);
+
+    std::vector<int> seen;
+    seen.reserve(a.size() + b.size());
+    for (size_t k = 0; k < a.size(); ++k)
+        seen.push_back(a[k].id);
+
+    for (const auto &traj : b)
+    {
+        if (std::find(seen.begin(), seen.end(), traj->id) != seen.end())
+            continue;
+        merged.push_back(*traj);
+        seen.push_back(traj->id);
+    }
+
+    return merged;
+}
+
+// Appends to `a` a copy of each trajectory pointed to by `b` whose id is not
+// already present. Trajectories with an empty smooth_embedding are skipped.
+// Returns `a`.
+TrajectoryPool &operator+=(TrajectoryPool &a, const TrajectoryPtrPool &b)
+{
+    std::vector<int> seen;
+    seen.reserve(a.size() + b.size());
+    for (size_t k = 0; k < a.size(); ++k)
+        seen.push_back(a[k].id);
+
+    for (const auto &traj : b)
+    {
+        if (traj->smooth_embedding.empty())
+            continue;
+        if (std::find(seen.begin(), seen.end(), traj->id) != seen.end())
+            continue;
+        a.push_back(*traj);
+        seen.push_back(traj->id);
+    }
+
+    return a;
+}
+
+// Set difference by trajectory id: returns the elements of `a`, in order,
+// whose id does not occur in `b`.
+TrajectoryPool operator-(const TrajectoryPool &a, const TrajectoryPool &b)
+{
+    std::vector<int> excluded;
+    excluded.reserve(b.size());
+    for (size_t k = 0; k < b.size(); ++k)
+        excluded.push_back(b[k].id);
+
+    TrajectoryPool remaining;
+    for (const auto &traj : a)
+    {
+        const bool listed =
+            std::find(excluded.begin(), excluded.end(), traj.id) != excluded.end();
+        if (!listed)
+            remaining.push_back(traj);
+    }
+
+    return remaining;
+}
+
+// Removes from `a`, in place, every trajectory whose id occurs in `b`.
+// The relative order of the remaining elements is preserved. Returns `a`.
+TrajectoryPool &operator-=(TrajectoryPool &a, const TrajectoryPool &b)
+{
+    std::vector<int> excluded;
+    excluded.reserve(b.size());
+    for (size_t k = 0; k < b.size(); ++k)
+        excluded.push_back(b[k].id);
+
+    // Erase-remove: compact the survivors to the front, then cut the tail.
+    a.erase(std::remove_if(a.begin(), a.end(),
+                           [&excluded](const Trajectory &traj) {
+                               return std::find(excluded.begin(), excluded.end(),
+                                                traj.id) != excluded.end();
+                           }),
+            a.end());
+
+    return a;
+}
+
+// Union of two pointer pools by trajectory id.
+//
+// The result holds every pointer of `a` in its original order, followed by
+// each pointer of `b` whose trajectory id has not been seen yet. Only the
+// pointers are copied, not the trajectories.
+TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
+{
+    TrajectoryPtrPool merged(a);
+
+    std::vector<int> seen;
+    seen.reserve(a.size() + b.size());
+    for (size_t k = 0; k < a.size(); ++k)
+        seen.push_back(a[k]->id);
+
+    for (const auto &traj : b)
+    {
+        if (std::find(seen.begin(), seen.end(), traj->id) != seen.end())
+            continue;
+        merged.push_back(traj);
+        seen.push_back(traj->id);
+    }
+
+    return merged;
+}
+
+// Union of a pointer pool and a value pool by trajectory id.
+//
+// The result holds every pointer of `a`, followed by the address of each
+// element of `b` whose id has not been seen yet. The returned pointers refer
+// to the elements stored inside `b`, so they are only valid while `b` is
+// alive and its storage is not reallocated.
+TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, TrajectoryPool &b)
+{
+    TrajectoryPtrPool merged(a);
+
+    std::vector<int> seen;
+    seen.reserve(a.size() + b.size());
+    for (size_t k = 0; k < a.size(); ++k)
+        seen.push_back(a[k]->id);
+
+    for (auto &traj : b)
+    {
+        if (std::find(seen.begin(), seen.end(), traj.id) != seen.end())
+            continue;
+        merged.push_back(&traj);
+        seen.push_back(traj.id);
+    }
+
+    return merged;
+}
+
+// Set difference by trajectory id for pointer pools: returns the pointers of
+// `a`, in order, whose trajectory id does not occur in `b`.
+TrajectoryPtrPool operator-(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
+{
+    std::vector<int> excluded;
+    excluded.reserve(b.size());
+    for (size_t k = 0; k < b.size(); ++k)
+        excluded.push_back(b[k]->id);
+
+    TrajectoryPtrPool remaining;
+    for (const auto &traj : a)
+    {
+        const bool listed =
+            std::find(excluded.begin(), excluded.end(), traj->id) != excluded.end();
+        if (!listed)
+            remaining.push_back(traj);
+    }
+
+    return remaining;
+}
+
+// Pairwise cosine distance between the smoothed embeddings of two pools.
+//
+// Returns an a.size() x b.size() CV_32F matrix whose entry (i, j) is
+// |1 - cos(a[i], b[j])| clamped to the range [0, 2].
+cv::Mat embedding_distance(const TrajectoryPool &a, const TrajectoryPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // The row embedding and its squared norm do not depend on the column.
+        const cv::Mat &u = a[row].smooth_embedding;
+        const double uu = u.dot(u);
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat &v = b[col].smooth_embedding;
+            const double uv = u.dot(v);
+            const double vv = v.dot(v);
+            const double dist = std::abs(1. - uv / std::sqrt(uu * vv));
+            out[col] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
+        }
+    }
+    return dists;
+}
+
+// Pairwise cosine distance between the smoothed embeddings of two pointer
+// pools.
+//
+// Returns an a.size() x b.size() CV_32F matrix whose entry (i, j) is
+// |1 - cos(a[i], b[j])| clamped to the range [0, 2].
+cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // The row embedding and its squared norm do not depend on the column.
+        const cv::Mat &u = a[row]->smooth_embedding;
+        const double uu = u.dot(u);
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat &v = b[col]->smooth_embedding;
+            const double uv = u.dot(v);
+            const double vv = v.dot(v);
+            const double dist = std::abs(1. - uv / std::sqrt(uu * vv));
+            out[col] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise cosine distance between the smoothed embeddings of a pointer pool
+// (rows) and a value pool (columns).
+//
+// Returns an a.size() x b.size() CV_32F matrix whose entry (i, j) is
+// |1 - cos(a[i], b[j])| clamped to the range [0, 2].
+cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // The row embedding and its squared norm do not depend on the column.
+        const cv::Mat &u = a[row]->smooth_embedding;
+        const double uu = u.dot(u);
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat &v = b[col].smooth_embedding;
+            const double uv = u.dot(v);
+            const double vv = v.dot(v);
+            const double dist = std::abs(1. - uv / std::sqrt(uu * vv));
+            out[col] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise squared Mahalanobis distance between the projected filter state of
+// each trajectory in `a` and the xyah vector of each trajectory in `b`.
+//
+// Returns an a.size() x b.size() CV_32F matrix.
+cv::Mat mahalanobis_distance(const TrajectoryPool &a, const TrajectoryPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // Measurement-space mean and inverse covariance of this row's track.
+        cv::Mat mean;
+        cv::Mat covariance;
+        cv::Mat icovariance;
+        a[row].project(mean, covariance);
+        cv::invert(covariance, icovariance);
+
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat x(b[col].xyah);
+            const float d = static_cast<float>(cv::Mahalanobis(x, mean, icovariance));
+            out[col] = d * d;
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise squared Mahalanobis distance between the projected filter state of
+// each trajectory pointed to by `a` and the xyah vector of each trajectory
+// pointed to by `b`.
+//
+// Returns an a.size() x b.size() CV_32F matrix.
+cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // Measurement-space mean and inverse covariance of this row's track.
+        cv::Mat mean;
+        cv::Mat covariance;
+        cv::Mat icovariance;
+        a[row]->project(mean, covariance);
+        cv::invert(covariance, icovariance);
+
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat x(b[col]->xyah);
+            const float d = static_cast<float>(cv::Mahalanobis(x, mean, icovariance));
+            out[col] = d * d;
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise squared Mahalanobis distance between the projected filter state of
+// each trajectory pointed to by `a` and the xyah vector of each trajectory in
+// `b`.
+//
+// Returns an a.size() x b.size() CV_32F matrix.
+cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
+{
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t row = 0; row < a.size(); ++row)
+    {
+        // Measurement-space mean and inverse covariance of this row's track.
+        cv::Mat mean;
+        cv::Mat covariance;
+        cv::Mat icovariance;
+        a[row]->project(mean, covariance);
+        cv::invert(covariance, icovariance);
+
+        float *out = dists.ptr<float>(row);
+        for (size_t col = 0; col < b.size(); ++col)
+        {
+            const cv::Mat x(b[col].xyah);
+            const float d = static_cast<float>(cv::Mahalanobis(x, mean, icovariance));
+            out[col] = d * d;
+        }
+    }
+
+    return dists;
+}
+
+// Intersection area of two boxes given as (left, top, right, bottom).
+// Returns 0 when the boxes are separated along either axis.
+static inline float calc_inter_area(const cv::Vec4f &a, const cv::Vec4f &b)
+{
+    const bool apart_x = a[2] < b[0] || a[0] > b[2];
+    const bool apart_y = a[3] < b[1] || a[1] > b[3];
+    if (apart_x || apart_y)
+    {
+        return 0.f;
+    }
+
+    const float left = std::max(a[0], b[0]);
+    const float top = std::max(a[1], b[1]);
+    const float right = std::min(a[2], b[2]);
+    const float bottom = std::min(a[3], b[3]);
+    const float w = right - left;
+    const float h = bottom - top;
+    return w * h;
+}
+
+// Pairwise IoU distance (1 - IoU) between the ltrb boxes of two pools.
+//
+// Returns an a.size() x b.size() CV_32F matrix. A pair whose union area is
+// not positive (degenerate boxes) gets distance 1 instead of the NaN that the
+// division by a zero union would produce.
+cv::Mat iou_distance(const TrajectoryPool &a, const TrajectoryPool &b)
+{
+    // Box areas are loop-invariant, so each one is computed once up front.
+    std::vector<float> areaa(a.size());
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &box = a[i].ltrb;
+        areaa[i] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    std::vector<float> areab(b.size());
+    for (size_t j = 0; j < b.size(); ++j)
+    {
+        const cv::Vec4f &box = b[j].ltrb;
+        areab[j] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &boxa = a[i].ltrb;
+        float *distsi = dists.ptr<float>(i);
+        for (size_t j = 0; j < b.size(); ++j)
+        {
+            const cv::Vec4f &boxb = b[j].ltrb;
+            const float inters = calc_inter_area(boxa, boxb);
+            const float uni = areaa[i] + areab[j] - inters;
+            // uni > 0 is also false for NaN, so bad boxes never leak NaN.
+            distsi[j] = (uni > 0.f) ? 1.f - inters / uni : 1.f;
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise IoU distance (1 - IoU) between the ltrb boxes of two pointer
+// pools.
+//
+// Returns an a.size() x b.size() CV_32F matrix. A pair whose union area is
+// not positive (degenerate boxes) gets distance 1 instead of the NaN that the
+// division by a zero union would produce.
+cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
+{
+    // Box areas are loop-invariant, so each one is computed once up front.
+    std::vector<float> areaa(a.size());
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &box = a[i]->ltrb;
+        areaa[i] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    std::vector<float> areab(b.size());
+    for (size_t j = 0; j < b.size(); ++j)
+    {
+        const cv::Vec4f &box = b[j]->ltrb;
+        areab[j] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &boxa = a[i]->ltrb;
+        float *distsi = dists.ptr<float>(i);
+        for (size_t j = 0; j < b.size(); ++j)
+        {
+            const cv::Vec4f &boxb = b[j]->ltrb;
+            const float inters = calc_inter_area(boxa, boxb);
+            const float uni = areaa[i] + areab[j] - inters;
+            // uni > 0 is also false for NaN, so bad boxes never leak NaN.
+            distsi[j] = (uni > 0.f) ? 1.f - inters / uni : 1.f;
+        }
+    }
+
+    return dists;
+}
+
+// Pairwise IoU distance (1 - IoU) between the ltrb boxes of a pointer pool
+// (rows) and a value pool (columns).
+//
+// Returns an a.size() x b.size() CV_32F matrix. A pair whose union area is
+// not positive (degenerate boxes) gets distance 1 instead of the NaN that the
+// division by a zero union would produce.
+cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
+{
+    // Box areas are loop-invariant, so each one is computed once up front.
+    std::vector<float> areaa(a.size());
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &box = a[i]->ltrb;
+        areaa[i] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    std::vector<float> areab(b.size());
+    for (size_t j = 0; j < b.size(); ++j)
+    {
+        const cv::Vec4f &box = b[j].ltrb;
+        areab[j] = (box[2] - box[0]) * (box[3] - box[1]);
+    }
+
+    cv::Mat dists(a.size(), b.size(), CV_32F);
+    for (size_t i = 0; i < a.size(); ++i)
+    {
+        const cv::Vec4f &boxa = a[i]->ltrb;
+        float *distsi = dists.ptr<float>(i);
+        for (size_t j = 0; j < b.size(); ++j)
+        {
+            const cv::Vec4f &boxb = b[j].ltrb;
+            const float inters = calc_inter_area(boxa, boxb);
+            const float uni = areaa[i] + areab[j] - inters;
+            // uni > 0 is also false for NaN, so bad boxes never leak NaN.
+            distsi[j] = (uni > 0.f) ? 1.f - inters / uni : 1.f;
+        }
+    }
+
+    return dists;
+}
+
+}   // namespace PaddleDetection
--- a/paddle_detection/deploy/cpp/src/utils.cc
+++ b/paddle_detection/deploy/cpp/src/utils.cc
@@ -0,0 +1,49 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/utils.h"
+
+namespace PaddleDetection {
+
+// Greedy non-maximum suppression, performed in place.
+//
+// `input_boxes` is sorted by descending confidence; then, for every kept box,
+// each lower-ranked box whose IoU with it is >= `nms_threshold` is erased.
+// rect is read as [x1, y1, x2, y2], and areas use the inclusive-pixel
+// convention (width = x2 - x1 + 1).
+void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold) {
+  // Compare by const reference: taking the elements by value copied two
+  // ObjectResult objects for every comparison made by the sort.
+  std::sort(input_boxes.begin(),
+            input_boxes.end(),
+            [](const ObjectResult &a, const ObjectResult &b) {
+              return a.confidence > b.confidence;
+            });
+  std::vector<float> vArea(input_boxes.size());
+  for (size_t i = 0; i < input_boxes.size(); ++i) {
+    vArea[i] = (input_boxes[i].rect[2] - input_boxes[i].rect[0] + 1)
+            * (input_boxes[i].rect[3] - input_boxes[i].rect[1] + 1);
+  }
+  for (size_t i = 0; i < input_boxes.size(); ++i) {
+    // j only advances when box j survives; after an erase the next candidate
+    // has already moved into slot j.
+    for (size_t j = i + 1; j < input_boxes.size();) {
+      float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]);
+      float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]);
+      float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]);
+      float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]);
+      float w = (std::max)(float(0), xx2 - xx1 + 1);
+      float h = (std::max)(float(0), yy2 - yy1 + 1);
+      float inter = w * h;
+      float ovr = inter / (vArea[i] + vArea[j] - inter);
+      if (ovr >= nms_threshold) {
+        // Keep the area cache aligned with the box list.
+        input_boxes.erase(input_boxes.begin() + j);
+        vArea.erase(vArea.begin() + j);
+      } else {
+        ++j;
+      }
+    }
+  }
+}
+
+}  // namespace PaddleDetection