I was implementing a YOLOv8 pose estimation model for detecting keypoints (not human pose), with both Python ONNX and C++ TensorRT implementations. The model should detect 10 objects with 4 keypoints each (40 keypoints in total).
Issue: The C++ TensorRT implementation was producing all 40 keypoints arranged in a diagonal line from top-left to bottom-right, while the Python ONNX implementation worked perfectly with keypoints properly distributed across the image.
Environment
- Model: YOLOv8m-pose trained for pillar keypoint detection
- Input: 1280×1280 (model), Original images: 1920×1080
- Output: [1, 26, 33600] - 26 values per prediction (4 bbox + 10 class scores + 12 keypoint values, i.e. 4 keypoints × (x, y, conf)); see the indexing sketch after this list
- Python: ONNX Runtime
- C++: TensorRT with CUDA
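For reference, this is how I understand the memory layout of that output; it is only an indexing sketch, assuming the buffer is a plain row-major (C-contiguous) copy of the [1, 26, 33600] tensor, and the GetAttr helper name is just for illustration:
// Indexing sketch (assumption: row-major copy of a [1, 26, 33600] tensor).
// With that shape the 26 attributes vary slowest, so attribute a of
// prediction i sits at buffer[a * 33600 + i] (attribute-major).
// A [1, 33600, 26] tensor would instead put it at buffer[i * 26 + a].
inline float GetAttr(const float* buffer, int num_predictions, int a, int i)
{
    return buffer[a * num_predictions + i]; // attribute-major indexing
}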
Symptoms
- Python ONNX: 40 keypoints properly distributed
- C++ TensorRT: 40 keypoints in a distorted diagonal line
- Same model, same preprocessing, same input image
- All keypoints had reasonable confidence scores
- NMS was implemented manually in Python and worked perfectly; in C++ I used cv::dnn::NMSBoxes()
My main confusion is with the preprocessing applied to the image and the postprocessing applied to the output.
Code snippets:
Preprocessing:
#include <opencv2/opencv.hpp>
#include <vector>

std::vector<float> PreProcessCPU(const cv::Mat& image, int target_size)
{
    // Resize directly to target_size x target_size (no padding, aspect ratio not preserved)
    cv::Mat resized;
    cv::resize(image, resized, cv::Size(target_size, target_size), 0, 0, cv::INTER_LINEAR);

    // Convert BGR to RGB
    cv::Mat rgb;
    cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);

    // Convert to float and normalize to [0, 1]
    cv::Mat float_img;
    rgb.convertTo(float_img, CV_32F, 1.0f / 255.0f);

    // Prepare vector for CHW format
    std::vector<float> chw;
    chw.reserve(3 * target_size * target_size);

    // HWC to CHW
    for (int c = 0; c < 3; ++c)
    {
        for (int y = 0; y < target_size; ++y)
        {
            for (int x = 0; x < target_size; ++x)
            {
                chw.push_back(float_img.at<cv::Vec3f>(y, x)[c]);
            }
        }
    }
    return chw;
}
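For context, the returned CHW buffer is then fed to TensorRT roughly like this; names such as d_input_, d_output_, bindings_ and context_ are placeholders for my members, so treat this as a sketch rather than the exact code:
std::vector<float> chw = PreProcessCPU(image, 1280);

// Copy the host CHW buffer into the pre-allocated input binding
cudaMemcpy(d_input_, chw.data(), chw.size() * sizeof(float), cudaMemcpyHostToDevice);

// Run inference synchronously (bindings_[0] = input, bindings_[1] = output)
context_->executeV2(bindings_.data());

// Copy the raw output tensor back into the host-side output_buffer
cudaMemcpy(output_buffer.data(), d_output_,
           output_buffer.size() * sizeof(float), cudaMemcpyDeviceToHost);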
Postprocessing:
const float scale_x = static_cast<float>(inputImage.cols) / static_cast<float>(inputDimensions_.width);
const float scale_y = static_cast<float>(inputImage.rows) / static_cast<float>(inputDimensions_.height);
std::cout << "Original image size: " << inputImage.cols << "x" << inputImage.rows << std::endl;
std::cout << "Input size: " << inputDimensions_.width << "x" << inputDimensions_.height << std::endl;
std::cout << "Scale factors: " << scale_x << ", " << scale_y << std::endl;
// Wrap the raw output buffer as a (num_predictions x num_attrs) matrix
cv::Mat output_mat(num_predictions, num_attrs, CV_32F, output_buffer.data());
std::vector<cv::Rect> boxes;
std::vector<float> scores;
std::vector<int> class_ids;
std::vector<std::vector<cv::Point3f>> all_keypoints;
// First pass: extract all detections with confidence filtering
for (int i = 0; i < num_predictions; ++i)
{
const float* row = output_mat.ptr<float>(i);
float x = row[0]; // center x
float y = row[1]; // center y
float w = row[2]; // width
float h = row[3]; // height
// Find class with max score
float max_score = -1.0f;
int best_class = -1;
for (int c = 0; c < num_classes; ++c)
{
float score = row[4 + c];
if (score > max_score)
{
max_score = score;
best_class = c;
}
}
// CORRECTED: Apply confidence threshold (matches ONNX)
if (max_score < conf_thresh)
continue;
// Convert box from center format to corner format and scale to original image
float x1 = (x - w / 2.0f) * scale_x;
float y1 = (y - h / 2.0f) * scale_y;
float x2 = (x + w / 2.0f) * scale_x;
float y2 = (y + h / 2.0f) * scale_y;
boxes.emplace_back(cv::Rect(cv::Point(static_cast<int>(x1), static_cast<int>(y1)),
cv::Point(static_cast<int>(x2), static_cast<int>(y2))));
scores.emplace_back(max_score);
class_ids.emplace_back(best_class);
// CORRECTED: Parse keypoints with proper scaling (matches ONNX exactly)
std::vector<cv::Point3f> kpts;
for (int k = 0; k < num_keypoints; ++k)
{
float kpt_x = row[14 + k * 3]; // keypoint x in model coordinates
float kpt_y = row[14 + k * 3 + 1]; // keypoint y in model coordinates
float kpt_conf = row[14 + k * 3 + 2]; // keypoint confidence
// CORRECTED: Scale to original image coordinates (matches ONNX)
float x_orig = kpt_x * scale_x;
float y_orig = kpt_y * scale_y;
kpts.emplace_back(cv::Point3f(x_orig, y_orig, kpt_conf));
}
all_keypoints.emplace_back(kpts);
}
// Apply NMS
std::vector<int> nms_indices;
cv::dnn::NMSBoxes(boxes, scores, conf_thresh, nms_thresh, nms_indices);
cOUT << "Detections after NMS: " << nms_indices.size();
// Store keypoints from NMS-filtered detections
for (int idx : nms_indices)
{
cOUT << "Detection " << idx << " - Class: " << class_ids[idx]
<< ", Score: " << scores[idx];
// Add all keypoints from this detection to the result
for (const auto& kp : all_keypoints[idx])
{
keypoints.emplace_back(kp.x, kp.y);
flatKeypoints.push_back(kp.x);
flatKeypoints.push_back(kp.y);
cOUT << "Keypoint: (" << kp.x << ", " << kp.y << ") conf: " << kp.z;
}
}
resultPtr->keypoints = keypoints;
resultPtr->flatKeypoints = flatKeypoints;
cOUT << "Total keypoints extracted: " << keypoints.size();
return { true, resultPtr };
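To narrow down where the two pipelines diverge, I have also been dumping the first raw prediction before any decoding so it can be compared value-for-value with the Python ONNX output; this is a minimal sketch, assuming output_buffer holds the copied-back tensor and num_attrs is 26:
// Print the first num_attrs raw floats of the output buffer for a 1:1
// comparison against the untouched ONNX Runtime output in Python
for (int a = 0; a < num_attrs; ++a)
{
    std::cout << output_buffer[a] << (a + 1 < num_attrs ? " " : "\n");
}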