[工程部署]Nvidia AGX Orin yolov5/11 部署教程

原创已于 2025-10-19 14:56:42 修改 · 693 阅读

10 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#YOLO

于 2025-10-19 14:48:40 首次发布

算法部署同时被 2 个专栏收录

10 篇文章

订阅专栏

Nvidia

1 篇文章

订阅专栏

官方部署

具体参考快速入门指南：Ultralytics YOLO11 与 NVIDIA Jetson

首先 PC 端 clone yolov11 仓库

git clone git@github.com:ultralytics/ultralytics.git

首先导出为 onnx model（PC or Orin 均可以，但是倾向于 PC）

from ultralytics import YOLO

# Load the checkpoint file "yolo11n.pt" through the Ultralytics YOLO wrapper
model = YOLO("yolo11n.pt")  # load an official model

# Export the loaded model in ONNX format (input of the trtexec conversion step)
model.export(format="onnx")

onnx 转 engine（必须在 Orin 端进行：TensorRT engine 与 GPU 架构及 TensorRT 版本绑定，PC 上生成的 engine 不能直接拿到板端使用）

trtexec   --onnx=./yolo11n.onnx --saveEngine=yolo11n.engine --workspace=4096 --explicitBatch --fp16

出现如下的警告是正常的，网上说是 TensorRT 的问题，一般转换时间 15 分钟左右
在这里插入图片描述
最终转换成功后显示

导出为 engine 格式后，编写推理代码

import os
import sys
os.chdir(sys.path[0])
from ultralytics import YOLO

# NOTE(review): the trtexec command shown earlier writes ./yolo11n.engine, while
# this script loads ./ENGINE/yolo11m.engine -- confirm the path matches the
# engine file that was actually generated.
trt_model = YOLO("./ENGINE/yolo11m.engine")

# A webcam is an endless source. With stream=False every frame's result is kept
# in a list, so memory grows until the process dies. stream=True returns a
# generator instead; it has to be iterated for inference to run.
results = trt_model.predict(
    source=0,        # camera input (0 = default camera)
    show=True,       # display the annotated frames in a window
    conf=0.25,       # confidence threshold
    iou=0.45,        # IoU threshold used by NMS
    stream=True      # yield results one frame at a time
)
for _ in results:
    pass             # per-frame custom processing can go here

在这里插入图片描述

官方部署几行代码即可搞定，适合快速上手，但是如果需要部署其他的 model，还是得自己去分配 buffer 和对输入和输出进行处理

自定义部署

大致的流程为：PC 端训练好模型-> 导出为 onnx model->pc 端编写 onnx 的推理代码-> 将 onnx model 放在板端导出为 engine model-> 板端调用 engine model 进行推理

[!TIP]
PC 端编写 onnx 的推理代码主要有两个目的：一是将输入的预处理和输出的后处理部分整理封装一遍，便于后面板端部署时复用；二是验证模型导出的完整性和准确性。

这里以 yolov5 为例进行演示，选择 yolov5n，参考 yolov5 教程

首先 PC 端 clone 仓库

git clone https://github.com/ultralytics/yolov5 # clone
cd yolov5
pip install -r requirements.txt # install

导出为 onnx

python export.py --weights yolov5n.pt --include onnx

编写 onnx 推理代码

import os
import cv2
import numpy as np
import onnxruntime
import time
import sys
os.chdir(sys.path[0])

CLASSES=['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush'] #coco80类别

class YOLOV5():
    """Thin wrapper around an onnxruntime session for a YOLOv5 ONNX model."""

    def __init__(self,onnxpath):
        """Create the inference session and cache its input/output names.

        Args:
            onnxpath: path to the ONNX model file.
        """
        self.onnx_session=onnxruntime.InferenceSession(onnxpath)
        self.input_name=self.get_input_name()
        self.output_name=self.get_output_name()

    def get_input_name(self):
        """Return the names of all session inputs as a list."""
        return [node.name for node in self.onnx_session.get_inputs()]

    def get_output_name(self):
        """Return the names of all session outputs as a list."""
        return [node.name for node in self.onnx_session.get_outputs()]

    def get_input_feed(self,img_tensor):
        """Build the feed dict, mapping every input name to ``img_tensor``."""
        return {name: img_tensor for name in self.input_name}

    def inference(self,img_path):
        """Run the model on one image file.

        Steps: read with OpenCV, resize to 640x640 (plain resize, no
        letterbox, so the aspect ratio is not preserved), BGR->RGB, HWC->CHW,
        scale to [0, 1], add a batch axis, run the session.

        Args:
            img_path: path of the image to read.

        Returns:
            (pred, or_img): the first session output and the resized 640x640
            BGR image, which is the image the predicted boxes refer to.

        Raises:
            ValueError: if the image cannot be read.
        """
        img=cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise ValueError(f"Can not load image: {img_path}")
        or_img=cv2.resize(img,(640,640))
        img=or_img[:,:,::-1].transpose(2,0,1)  # BGR2RGB and HWC2CHW
        img=img.astype(dtype=np.float32)
        img/=255.0
        img=np.expand_dims(img,axis=0)
        input_feed=self.get_input_feed(img)
        pred=self.onnx_session.run(None,input_feed)[0]
        return pred,or_img

#dets:  array [x,6] 6个值分别为x1,y1,x2,y2,score,class 
#thresh: 阈值
def nms(dets, thresh):
    """Greedy non-maximum suppression.

    Args:
        dets: 2-D array whose columns 0..4 are x1, y1, x2, y2, score.
        thresh: boxes whose IoU with an already kept box exceeds this value
            are discarded.

    Returns:
        List of row indices into ``dets`` that are kept, highest score first.
    """
    left, top, right, bottom = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    # Areas use the inclusive-pixel (+1) convention.
    box_area = (bottom - top + 1) * (right - left + 1)
    # Candidate indices sorted by descending score.
    order = dets[:, 4].argsort()[::-1]

    selected = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        selected.append(best)

        # Intersection rectangle between the best box and every remaining one;
        # width/height are clamped to 0 when the boxes do not overlap.
        inter_w = np.maximum(
            0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(
            0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)
        inter = inter_w * inter_h

        iou = inter / (box_area[best] + box_area[rest] - inter)
        # Only boxes that overlap the best box by at most `thresh` stay candidates.
        order = rest[iou <= thresh]
    return selected

def xywh2xyxy(x):
    """Convert rows of [cx, cy, w, h, ...] to [x1, y1, x2, y2, ...].

    Only the first four columns are rewritten; the input is left untouched
    and a converted copy is returned.
    """
    out = np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    out[:, 0] = x[:, 0] - half_w
    out[:, 1] = x[:, 1] - half_h
    out[:, 2] = x[:, 0] + half_w
    out[:, 3] = x[:, 1] + half_h
    return out

def filter_box(org_box,conf_thres,iou_thres):
    """Threshold raw YOLOv5 predictions and apply per-class NMS.

    Args:
        org_box: raw predictions whose last axis is
            [cx, cy, w, h, objectness, class scores...].
        conf_thres: rows whose objectness (column 4) is not above this value
            are dropped. The class score is not multiplied in.
        iou_thres: IoU threshold handed to ``nms``.

    Returns:
        Array of shape (N, 6) with rows [x1, y1, x2, y2, objectness, class id].
        When nothing survives the result has shape (0, 6), so callers can
        index columns without special-casing the empty case.
    """
    preds = np.asarray(org_box)
    if preds.size == 0:
        return np.empty((0, 6), dtype=preds.dtype)
    # Flatten every leading axis (batch, anchors) into one row axis.
    preds = preds.reshape(-1, preds.shape[-1])

    candidates = preds[preds[:, 4] > conf_thres]
    if candidates.shape[0] == 0:
        return np.empty((0, 6), dtype=preds.dtype)

    # The best class per row replaces the class-score columns.
    class_ids = np.argmax(candidates[:, 5:], axis=1)
    detections = candidates[:, :6].copy()
    detections[:, 5] = class_ids

    kept = []
    for cls_id in sorted(set(class_ids.tolist())):
        cls_boxes = xywh2xyxy(detections[class_ids == cls_id])
        keep = nms(cls_boxes, iou_thres)
        kept.append(cls_boxes[keep])
    return np.concatenate(kept, axis=0)

def draw(image,box_data):
    """Draw detection boxes and labels onto ``image`` in place.

    Args:
        image: BGR image the boxes refer to; modified in place.
        box_data: array with rows [x1, y1, x2, y2, score, class id].
            An empty array is accepted and draws nothing.
    """
    if box_data.size == 0:
        return

    # Integer pixel coordinates are needed for drawing.
    boxes=box_data[...,:4].astype(np.int32)
    scores=box_data[...,4]
    classes=box_data[...,5].astype(np.int32)

    for box, score, cl in zip(boxes, scores, classes):
        # Plain Python ints avoid OpenCV rejecting NumPy scalar types.
        x1, y1, x2, y2 = (int(v) for v in box)
        print('class: {}, score: {}'.format(CLASSES[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(x1, y1, x2, y2))

        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score),
                    (x1, y1),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)

if __name__=="__main__":
    # The export step above (--weights yolov5n.pt --include onnx) writes
    # yolov5n.onnx, so that is the file to load here.
    onnx_path='yolov5n.onnx'
    model=YOLOV5(onnx_path)
    output,or_img=model.inference('bus.jpg')
    outbox=filter_box(output,0.5,0.5)
    # Nothing to draw when no box passed the thresholds.
    if outbox.size:
        draw(or_img,outbox)
    cv2.imwrite('out.jpg',or_img)

推理结果
在这里插入图片描述

onnx2engine：将 onnx 文件复制到板端，编写如下脚本进行转换

#!/bin/bash

# ========== YOLO → TensorRT Engine Export Script ==========
# Converts an ONNX model to an FP16 TensorRT engine with trtexec.
# Exits non-zero if the ONNX file is missing or trtexec fails.
set -u

ONNX_PATH="./ONNX/yolov5n.onnx"

ENGINE_PATH="./ENGINE/yolov5n.engine"

# Workspace size in MB passed to trtexec.
WORKSPACE=4096

# Use trtexec from PATH when available, otherwise the standard install location.
if ! command -v trtexec &> /dev/null
then
    echo "[WARN] trtexec not found in /usr/bin/, excute from /usr/src/tensorrt/bin/trtexec "
    TRTEXEC="/usr/src/tensorrt/bin/trtexec"
else
    TRTEXEC="trtexec"
fi

if [ ! -f "$ONNX_PATH" ]; then
    echo "[ERROR] ONNX model not found: $ONNX_PATH" >&2
    exit 1
fi

# trtexec does not create the output directory itself.
mkdir -p "$(dirname "$ENGINE_PATH")"

echo "===================================="
echo " ONNX: $ONNX_PATH"
echo " ENGINE: $ENGINE_PATH"
echo " Workspace: ${WORKSPACE}MB"
echo "===================================="

# Report success only when trtexec really succeeded.
if ! "$TRTEXEC" \
    --onnx="$ONNX_PATH" \
    --saveEngine="$ENGINE_PATH" \
    --fp16 \
    --workspace="$WORKSPACE"
then
    echo "[ERROR] trtexec failed, engine was not generated" >&2
    exit 1
fi

echo "✅ Engine generate: $ENGINE_PATH"
echo "✅ Done!"

转换过程较长，出现如下的警告是正常的，网上说是 TensorRT 的问题，一般转换时间 15 分钟左右
在这里插入图片描述
最终转换成功后显示

编写 engine 推理代码
图片推理：

import os
import sys
os.chdir(sys.path[0])
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  
import time

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

CLASS_NAMES = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train',
               7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter',
               13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant',
               21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie',
               28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite',
               34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
               39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana',
               47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
               54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table',
               61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
               68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock',
               75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}

color_palette = np.random.uniform(0, 255, size=(len(CLASS_NAMES), 3))

"""load engine"""
def load_engine(engine_path):
    """Deserialize a TensorRT engine from ``engine_path``.

    Args:
        engine_path: path to a serialized ``.engine`` file.

    Returns:
        The deserialized engine object.

    Raises:
        OSError: if the file cannot be opened.
        RuntimeError: if TensorRT cannot deserialize the file.
    """
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # deserialize_cuda_engine reports failure by returning None (for example
    # when the engine was built by a different TensorRT version or device).
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")
    return engine

# 2. 分配输入输出内存
def allocate_buffers(engine):
    """Allocate page-locked host and device buffers for every engine I/O tensor.

    Uses the name-based tensor API when the engine provides it, which avoids
    the DeprecationWarnings of the binding API, and falls back to the binding
    API otherwise.

    NOTE(review): the tensor-API path sizes buffers from the tensor shape alone
    and therefore assumes an explicit-batch engine (what trtexec builds from
    ONNX) -- confirm before using it with implicit-batch engines.

    Args:
        engine: deserialized TensorRT engine.

    Returns:
        (inputs, outputs, bindings, stream): ``inputs``/``outputs`` are lists of
        ``{'host': ..., 'device': ...}`` dicts, ``bindings`` holds the device
        addresses in engine order, ``stream`` is a new CUDA stream.

    Raises:
        ValueError: if a tensor has a dynamic (negative) dimension.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    use_tensor_api = hasattr(engine, "num_io_tensors")
    if use_tensor_api:
        names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    else:
        names = list(engine)

    for name in names:
        if use_tensor_api:
            size = trt.volume(engine.get_tensor_shape(name))
            dtype = trt.nptype(engine.get_tensor_dtype(name))
            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
        else:
            size = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(name))
            is_input = engine.binding_is_input(name)
        if size < 0:
            raise ValueError(f"Tensor '{name}' has a dynamic shape; cannot size its buffer")

        # Page-locked host memory plus a device buffer of the same byte size.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        buffer_pair = {'host': host_mem, 'device': device_mem}
        if is_input:
            inputs.append(buffer_pair)
        else:
            outputs.append(buffer_pair)
    return inputs, outputs, bindings, stream

# 3. inference
def do_inference(context, inputs, outputs, bindings, stream):
    """Run one asynchronous inference and return the host output buffers.

    Args:
        context: TensorRT execution context.
        inputs: list of ``{'host', 'device'}`` dicts; the host buffers must
            already contain the input data.
        outputs: list of ``{'host', 'device'}`` dicts receiving the results.
        bindings: device addresses of all I/O tensors, in engine order.
        stream: CUDA stream used for the copies and the execution.

    Returns:
        List with the host buffer of every output. The buffers are reused,
        so each call overwrites the previous results.

    Raises:
        RuntimeError: if TensorRT reports that the inference could not be enqueued.
    """
    # Host -> device copies.
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)

    if hasattr(context, "execute_async_v2"):
        ok = context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    else:
        # Releases without execute_async_v2: register the addresses by tensor
        # name, then enqueue with execute_async_v3.
        engine = context.engine
        for index, address in enumerate(bindings):
            context.set_tensor_address(engine.get_tensor_name(index), address)
        ok = context.execute_async_v3(stream_handle=stream.handle)
    if not ok:
        raise RuntimeError("TensorRT failed to enqueue the inference")

    # Device -> host copies, then wait for everything queued on the stream.
    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
    stream.synchronize()
    return [out['host'] for out in outputs]

""""preprocess and post process"""
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True):
    """Resize ``img`` keeping its aspect ratio and pad it to ``new_shape``.

    Args:
        img: image array in (H, W, C) layout.
        new_shape: target (height, width), or a single int for a square.
        color: border colour used for the padding.
        auto, scaleFill: accepted but not used by this implementation.
        scaleup: when False the image is never enlarged, only shrunk.

    Returns:
        (img, (r, r), (dw_left, dh_top)): the padded image, the scale factor
        (repeated for x and y) and the left/top padding in pixels.
    """
    src_h, src_w = img.shape[:2]
    print(f"Original image shape: {(src_h, src_w)}")
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # One scale factor for both axes, limited by the tighter dimension.
    r = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        r = min(r, 1.0)

    new_unpad = (int(round(src_w * r)), int(round(src_h * r)))  # (width, height)
    dw = dst_w - new_unpad[0]
    dh = dst_h - new_unpad[1]
    # Padding is split between the two sides; any odd pixel goes right/bottom.
    dw_left, dh_top = dw // 2, dh // 2

    if (src_w, src_h) != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

    top = int(round(dh_top))
    bottom = int(round(dh - dh_top))
    left = int(round(dw_left))
    right = int(round(dw - dw_left))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    print(f"Final letterboxed image shape: {img.shape}")

    return img, (r, r), (dw_left, dh_top)

def preprocess(img_input):
    """Turn an image path or BGR array into a network input tensor.

    Args:
        img_input: path of an existing image file, or a BGR ``np.ndarray``.

    Returns:
        (image_data, ratio, dw, dh): float32 array of shape (1, C, 640, 640)
        scaled to [0, 1], plus the scale ratio and left/top padding reported
        by ``letterbox``.

    Raises:
        ValueError: if the file exists but OpenCV cannot decode it.
        TypeError: for any other input, including a path that is not a file.
    """
    if isinstance(img_input, np.ndarray):
        img = img_input
    elif isinstance(img_input, str) and os.path.isfile(img_input):
        img = cv2.imread(img_input)
        if img is None:
            raise ValueError(f"Can not load image：{img_input}")
    else:
        raise TypeError("img_input image path or numpy array required.")

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    boxed, ratio, (dw, dh) = letterbox(rgb, new_shape=(640, 640))
    # Scale to [0, 1], HWC -> CHW, add the batch axis, then cast to float32.
    scaled = np.array(boxed) / 255.0
    chw = np.transpose(scaled, (2, 0, 1))
    image_data = np.expand_dims(chw, axis=0).astype(np.float32)
    return image_data, ratio, dw, dh

def nms(dets, thresh):
    """Greedy non-maximum suppression over one set of boxes.

    Args:
        dets: 2-D array; columns 0..4 hold x1, y1, x2, y2, score.
        thresh: IoU above which a lower-scored box is suppressed.

    Returns:
        List of kept row indices, ordered by descending score.
    """
    xa, ya = dets[:, 0], dets[:, 1]
    xb, yb = dets[:, 2], dets[:, 3]
    # Inclusive-pixel (+1) area of every box.
    areas = (yb - ya + 1) * (xb - xa + 1)
    # Indices from highest to lowest score.
    remaining = dets[:, 4].argsort()[::-1]

    keep = []
    while remaining.size > 0:
        top_idx = remaining[0]
        others = remaining[1:]
        keep.append(top_idx)

        # Overlap of the top box with each other candidate (0 when disjoint).
        overlap_w = np.maximum(
            0, np.minimum(xb[top_idx], xb[others]) - np.maximum(xa[top_idx], xa[others]) + 1)
        overlap_h = np.maximum(
            0, np.minimum(yb[top_idx], yb[others]) - np.maximum(ya[top_idx], ya[others]) + 1)
        overlap = overlap_w * overlap_h

        ious = overlap / (areas[top_idx] + areas[others] - overlap)
        # Continue only with candidates that are not duplicates of the top box.
        remaining = others[ious <= thresh]
    return keep

def xywh2xyxy(x):
    """Return a copy of ``x`` with columns 0..3 converted from
    centre/size form [cx, cy, w, h] to corner form [x1, y1, x2, y2]."""
    converted = np.copy(x)
    cx, cy = x[:, 0], x[:, 1]
    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
    converted[:, 0] = cx - half_w
    converted[:, 1] = cy - half_h
    converted[:, 2] = cx + half_w
    converted[:, 3] = cy + half_h
    return converted

def filter_box(org_box,conf_thres,iou_thres):
    """Keep confident predictions and run NMS separately for each class.

    Args:
        org_box: raw network output; the last axis is
            [cx, cy, w, h, objectness, class scores...].
        conf_thres: minimum objectness (column 4, exclusive) for a row to be
            kept. Class scores are not multiplied into it.
        iou_thres: IoU threshold forwarded to ``nms``.

    Returns:
        (N, 6) array of [x1, y1, x2, y2, objectness, class id]. The shape is
        (0, 6) when no box survives, which keeps column indexing in callers
        valid for the empty case.
    """
    preds = np.asarray(org_box)
    if preds.size == 0:
        return np.empty((0, 6), dtype=preds.dtype)
    # Collapse the batch/anchor axes into a single row axis.
    preds = preds.reshape(-1, preds.shape[-1])

    confident = preds[preds[:, 4] > conf_thres]
    if confident.shape[0] == 0:
        return np.empty((0, 6), dtype=preds.dtype)

    # Column 5 is overwritten with the index of the best-scoring class.
    labels = np.argmax(confident[:, 5:], axis=1)
    rows = confident[:, :6].copy()
    rows[:, 5] = labels

    survivors = []
    for label in sorted(set(labels.tolist())):
        class_rows = xywh2xyxy(rows[labels == label])
        survivors.append(class_rows[nms(class_rows, iou_thres)])
    return np.concatenate(survivors, axis=0)

def scale_coords(boxes, ratio, dw, dh,orig_w,orig_h):
    """Map one box from letterboxed coordinates back to the original image.

    ``boxes`` holds [x1, y1, x2, y2] and is modified in place: the padding
    offset is subtracted, the result is divided by the scale ratio and then
    clipped to [0, orig_w] for x and [0, orig_h] for y.

    Returns:
        The same ``boxes`` object.
    """
    # (padding, scale, upper bound) for x1, y1, x2, y2 in that order.
    axis_params = (
        (dw, ratio[0], orig_w),
        (dh, ratio[1], orig_h),
        (dw, ratio[0], orig_w),
        (dh, ratio[1], orig_h),
    )
    for pos, (pad, gain, upper) in enumerate(axis_params):
        boxes[pos] = (boxes[pos] - pad) / gain
        boxes[pos] = np.clip(boxes[pos], 0, upper)
    return boxes

def draw(image,box_data,ratio, dw, dh,orig_w,orig_h):
    """Draw detections on the original image, undoing the letterbox transform.

    Args:
        image: original BGR image; modified in place.
        box_data: array with rows [x1, y1, x2, y2, score, class id] in
            letterboxed coordinates. An empty array draws nothing.
        ratio, dw, dh: scale ratio and left/top padding from ``preprocess``.
        orig_w, orig_h: width and height of ``image``, used for clipping.
    """
    if box_data.size == 0:
        return

    # Integer pixel coordinates are needed for drawing.
    boxes=box_data[...,:4].astype(np.int32)
    scores=box_data[...,4]
    classes=box_data[...,5].astype(np.int32)

    for box, score, cl in zip(boxes, scores, classes):
        box=scale_coords(box,ratio, dw, dh,orig_w,orig_h)
        # Plain Python ints avoid OpenCV rejecting NumPy scalar types.
        x1, y1, x2, y2 = (int(v) for v in box)
        print('class: {}, score: {}'.format(CLASS_NAMES[cl], score))
        print('box coordinate left,top,right,down: [{}, {}, {}, {}]'.format(x1, y1, x2, y2))

        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(CLASS_NAMES[cl], score),
                    (x1, y1),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 2)

def main():
    """Run TensorRT inference on one test image and save the annotated result.

    Loads the engine, preprocesses ./testimages/bus.jpg, runs one inference,
    filters the predictions and writes the drawn image to out.jpg.

    Raises:
        ValueError: if the test image cannot be read.
    """
    engine_path = "./ENGINE/yolov5n.engine"
    image_path = './testimages/bus.jpg'
    engine = load_engine(engine_path)
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    # preprocess: read the image once and reuse the array
    image=cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Can not load image：{image_path}")
    # shape is (height, width, channels)
    orig_h, orig_w = image.shape[:2]
    img_data, ratio, dw, dh = preprocess(img_input=image)

    np.copyto(inputs[0]['host'], img_data.ravel())
    start_time = time.time()
    out = do_inference(context, inputs, outputs, bindings, stream)
    end_time = time.time()
    print(f'inference time: {(end_time - start_time) * 1000:.2f} ms')

    # postprocess: flat output buffer -> (1, num_predictions, 5 + num_classes),
    # which is (1, 25200, 85) for a 640x640 input with the 80 COCO classes
    out=out[0].reshape(1, -1, 5 + len(CLASS_NAMES))
    outbox=filter_box(out,0.5,0.5)
    # Width first, then height: x is clipped to the width, y to the height.
    if outbox.size:
        draw(image,outbox,ratio, dw, dh,orig_w,orig_h)
    cv2.imwrite('out.jpg',image)
if __name__ == "__main__":
    main()

输出

warren@warren:/home/Deploy$ /usr/bin/python /home/Deploy/yolov5/infer_frame.py
[10/17/2025-21:28:05] [TRT] [I] Loaded engine size: 5 MiB
[10/17/2025-21:28:05] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +3, now: CPU 0, GPU 3 (MiB)
[10/17/2025-21:28:05] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +11, now: CPU 0, GPU 14 (MiB)
/home/Deploy/yolov5/infer_frame.py:38: DeprecationWarning: Use get_tensor_shape instead.
  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
/home/Deploy/yolov5/infer_frame.py:38: DeprecationWarning: Use network created with NetworkDefinitionCreationFlag::EXPLICIT_BATCH flag instead.
  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
[10/17/2025-21:28:05] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
/home/Deploy/yolov5/infer_frame.py:39: DeprecationWarning: Use get_tensor_dtype instead.
  dtype = trt.nptype(engine.get_binding_dtype(binding))
/home/Deploy/yolov5/infer_frame.py:44: DeprecationWarning: Use get_tensor_mode instead.
  if engine.binding_is_input(binding):
[10/17/2025-21:28:05] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
Original image shape: (1080, 810)
Final letterboxed image shape: (640, 640, 3)
inference time: 13.35 ms
class: person, score: 0.8203257322311401
box coordinate left,top,right,down: [219, 406, 347, 810]
class: person, score: 0.818359375
box coordinate left,top,right,down: [48, 398, 205, 810]
class: person, score: 0.6343173384666443
box coordinate left,top,right,down: [680, 381, 813, 810]
class: bus, score: 0.6187803745269775
box coordinate left,top,right,down: [48, 234, 818, 779]

结果
在这里插入图片描述

camera 实时推理：

import os
import sys
# Set working directory to the script's location
os.chdir(sys.path[0])
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # Corrected trailing space
import time

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

CLASS_NAMES = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train',
               7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter',
               13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant',
               21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie',
               28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite',
               34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
               39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana',
               47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
               54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table',
               61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
               68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock',
               75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}

color_palette = np.random.uniform(0, 255, size=(len(CLASS_NAMES), 3))

"""load engine"""
def load_engine(engine_path):
    """Read a serialized TensorRT engine file and deserialize it.

    Args:
        engine_path: location of the ``.engine`` file.

    Returns:
        The deserialized engine.

    Raises:
        OSError: if the file cannot be opened.
        RuntimeError: if deserialization fails.
    """
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # A failed deserialization yields None; fail here with a clear message
    # instead of an AttributeError in the caller.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")
    return engine

# 2. 分配输入输出内存
def allocate_buffers(engine):
    """Create host/device buffer pairs for all inputs and outputs of ``engine``.

    The name-based tensor API is used when the engine exposes it (no
    DeprecationWarnings); otherwise the binding API is used.

    NOTE(review): the tensor-API branch derives the buffer size from the
    tensor shape only, i.e. it assumes an explicit-batch engine such as the
    ones trtexec builds from ONNX -- confirm for other engines.

    Args:
        engine: deserialized TensorRT engine.

    Returns:
        (inputs, outputs, bindings, stream) where ``inputs`` and ``outputs``
        are lists of ``{'host': ..., 'device': ...}`` dicts, ``bindings`` is
        the list of device addresses in engine order and ``stream`` is a new
        CUDA stream.

    Raises:
        ValueError: if a tensor shape contains a dynamic (negative) dimension.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    has_tensor_api = hasattr(engine, "num_io_tensors")
    tensor_names = (
        [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
        if has_tensor_api else list(engine)
    )

    for tensor in tensor_names:
        if has_tensor_api:
            size = trt.volume(engine.get_tensor_shape(tensor))
            dtype = trt.nptype(engine.get_tensor_dtype(tensor))
            is_input = engine.get_tensor_mode(tensor) == trt.TensorIOMode.INPUT
        else:
            size = trt.volume(engine.get_binding_shape(tensor)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(tensor))
            is_input = engine.binding_is_input(tensor)
        if size < 0:
            raise ValueError(f"Tensor '{tensor}' has a dynamic shape; cannot size its buffer")

        # Page-locked host buffer and a device buffer of equal byte size.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        target = inputs if is_input else outputs
        target.append({'host': host_mem, 'device': device_mem})
    return inputs, outputs, bindings, stream

# 3. inference
def do_inference(context, inputs, outputs, bindings, stream):
    """Copy inputs to the device, execute the engine and fetch the outputs.

    Args:
        context: TensorRT execution context.
        inputs: ``{'host', 'device'}`` dicts whose host buffers hold the input.
        outputs: ``{'host', 'device'}`` dicts that receive the results.
        bindings: device addresses of every I/O tensor, in engine order.
        stream: CUDA stream on which all work is queued.

    Returns:
        The host buffers of all outputs. They are reused between calls, so
        copy them if a result must outlive the next inference.

    Raises:
        RuntimeError: if the execution could not be enqueued.
    """
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)

    if hasattr(context, "execute_async_v2"):
        enqueued = context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    else:
        # Without execute_async_v2 the addresses are set per tensor name
        # before calling execute_async_v3.
        engine = context.engine
        for index, address in enumerate(bindings):
            context.set_tensor_address(engine.get_tensor_name(index), address)
        enqueued = context.execute_async_v3(stream_handle=stream.handle)
    if not enqueued:
        raise RuntimeError("TensorRT failed to enqueue the inference")

    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
    # Block until the copies and the execution queued on the stream finish.
    stream.synchronize()
    return [out['host'] for out in outputs]

""""preprocess and post process"""
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True):
    """Aspect-preserving resize followed by constant-colour padding.

    Args:
        img: image array in (H, W, C) layout.
        new_shape: target (height, width); an int means a square target.
        color: padding colour.
        auto, scaleFill: accepted but ignored by this implementation.
        scaleup: set False to forbid enlarging the image.

    Returns:
        (img, (r, r), (dw_left, dh_top)): padded image, scale factor (given
        twice, for x and y) and the left/top padding in pixels.
    """
    in_h, in_w = img.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    out_h, out_w = new_shape[0], new_shape[1]

    # The smaller of the two ratios makes the whole image fit.
    r = min(out_h / in_h, out_w / in_w)
    if not scaleup:
        r = min(r, 1.0)

    new_unpad = (int(round(in_w * r)), int(round(in_h * r)))  # (width, height)
    dw = out_w - new_unpad[0]
    dh = out_h - new_unpad[1]
    # Left/top get the floor half of the padding, right/bottom the rest.
    dw_left, dh_top = dw // 2, dh // 2

    if (in_w, in_h) != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

    top = int(round(dh_top))
    bottom = int(round(dh - dh_top))
    left = int(round(dw_left))
    right = int(round(dw - dw_left))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)

    return img, (r, r), (dw_left, dh_top)

def preprocess(img_input):
    """Convert a camera frame (or image path) into the network input tensor.

    Args:
        img_input: BGR ``np.ndarray`` or path of an existing image file.

    Returns:
        (image_data, ratio, dw, dh): float32 tensor of shape (1, C, 640, 640)
        in [0, 1] and the letterbox scale ratio and left/top padding.

    Raises:
        ValueError: if an existing file cannot be decoded.
        TypeError: for any other input, including a path that is not a file.
    """
    if isinstance(img_input, np.ndarray):
        img = img_input
    elif isinstance(img_input, str) and os.path.isfile(img_input):
        img = cv2.imread(img_input)
        if img is None:
            raise ValueError(f"Can not load image：{img_input}")
    else:
        raise TypeError("img_input image path or numpy array required.")

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    padded, ratio, (dw, dh) = letterbox(rgb, new_shape=(640, 640))
    # Normalise, move channels first, prepend the batch axis, cast to float32.
    normalised = np.array(padded) / 255.0
    channels_first = np.transpose(normalised, (2, 0, 1))
    image_data = np.expand_dims(channels_first, axis=0).astype(np.float32)
    return image_data, ratio, dw, dh

def nms(dets, thresh):
    """Non-maximum suppression: keep the best box, drop its near-duplicates.

    Args:
        dets: 2-D array with x1, y1, x2, y2, score in columns 0..4.
        thresh: maximum IoU a box may have with a kept box and still survive.

    Returns:
        List of indices of the kept rows, best score first.
    """
    x_min, y_min, x_max, y_max = (dets[:, col] for col in range(4))
    # Box areas, inclusive-pixel (+1) convention.
    area = (y_max - y_min + 1) * (x_max - x_min + 1)
    # Row indices sorted from highest to lowest score.
    queue = dets[:, 4].argsort()[::-1]

    kept = []
    while queue.size > 0:
        head, tail = queue[0], queue[1:]
        kept.append(head)

        # Corners of the intersection with each remaining box.
        ix1 = np.maximum(x_min[head], x_min[tail])
        iy1 = np.maximum(y_min[head], y_min[tail])
        ix2 = np.minimum(x_max[head], x_max[tail])
        iy2 = np.minimum(y_max[head], y_max[tail])

        # Zero width/height when the boxes are disjoint.
        inter = np.maximum(0, ix2 - ix1 + 1) * np.maximum(0, iy2 - iy1 + 1)
        iou = inter / (area[head] + area[tail] - inter)

        queue = tail[iou <= thresh]
    return kept

def xywh2xyxy(x):
    """Centre/size boxes [cx, cy, w, h] -> corner boxes [x1, y1, x2, y2].

    Works on the first four columns of a 2-D array and returns a new array;
    any further columns are copied unchanged.
    """
    result = np.copy(x)
    # Column 0/1 is the centre, column 2/3 the size of that axis.
    for centre_col, size_col in ((0, 2), (1, 3)):
        half = x[:, size_col] / 2
        result[:, centre_col] = x[:, centre_col] - half
        result[:, size_col] = x[:, centre_col] + half
    return result

def filter_box(org_box,conf_thres,iou_thres):
    """Discard low-confidence predictions and apply NMS class by class.

    Args:
        org_box: raw predictions; the last axis is
            [cx, cy, w, h, objectness, class scores...].
        conf_thres: objectness (column 4) must be above this value.
        iou_thres: IoU threshold passed to ``nms``.

    Returns:
        Array with rows [x1, y1, x2, y2, objectness, class id]; an empty
        (0, 6) array when the input is empty/1-D after squeezing or when no
        row passes the confidence threshold.
    """
    # Drop axes of length 1 (e.g. the batch axis).
    preds = np.squeeze(org_box)
    if preds.size == 0 or preds.ndim == 1:
        return np.empty((0, 6))

    confident = preds[preds[..., 4] > conf_thres]
    if not confident.shape[0]:
        return np.empty((0, 6))

    # Best class per remaining row, as plain Python ints.
    labels = np.argmax(confident[..., 5:], axis=1).tolist()

    results = []
    for label in list(set(labels)):
        rows = [pos for pos, value in enumerate(labels) if value == label]
        # First six columns of this class's rows; column 5 becomes the class id.
        class_boxes = confident[rows, :6]
        class_boxes[:, 5] = label
        class_boxes = xywh2xyxy(class_boxes)
        for kept_idx in nms(class_boxes, iou_thres):
            results.append(class_boxes[kept_idx])
    return np.array(results)

def scale_coords(boxes, ratio, dw, dh, orig_w, orig_h):
    """Convert one [x1, y1, x2, y2] box from the letterboxed input back to
    original-image coordinates.

    The box is updated in place (padding removed, divided by the scale ratio,
    clipped to the image bounds) and the same object is returned.
    """
    for pos in range(4):
        # Even positions are x values, odd positions are y values.
        if pos % 2 == 0:
            pad, gain, upper = dw, ratio[0], orig_w
        else:
            pad, gain, upper = dh, ratio[1], orig_h
        boxes[pos] = (boxes[pos] - pad) / gain
        boxes[pos] = np.clip(boxes[pos], 0, upper)
    return boxes

def draw(image,box_data,ratio, dw, dh,orig_w,orig_h):
    """Render detections on ``image`` (in place), one colour per class.

    Args:
        image: original BGR frame to draw on.
        box_data: rows of [x1, y1, x2, y2, score, class id] in letterboxed
            coordinates; nothing is drawn when it has no rows.
        ratio, dw, dh: letterbox scale ratio and left/top padding.
        orig_w, orig_h: frame width and height used for clipping.
    """
    if not box_data.shape[0]:
        return

    # Integer pixel coordinates for drawing.
    coords = box_data[..., :4].astype(np.int32)
    confidences = box_data[..., 4]
    labels = box_data[..., 5].astype(np.int32)

    for xyxy, confidence, label in zip(coords, confidences, labels):
        x1, y1, x2, y2 = scale_coords(xyxy, ratio, dw, dh, orig_w, orig_h)
        colour = color_palette[label]

        cv2.rectangle(image, (x1, y1), (x2, y2), colour, 2)

        caption = '{0} {1:.2f}'.format(CLASS_NAMES[label], confidence)
        # Put the caption above the box, or inside it when too close to the top.
        caption_y = y1 - 5 if y1 > 20 else y1 + 20
        cv2.putText(image, caption, (x1, caption_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, colour, 2)

# --- MODIFIED: main() function updated for real-time camera ---
def main():
    """Run real-time detection on the default camera until 'q' is pressed.

    Loads the TensorRT engine once, then for every captured frame:
    preprocess -> inference -> filter/NMS -> draw -> display. The camera and
    the OpenCV windows are released even if the loop ends with an exception
    or Ctrl+C.
    """
    engine_path = "./ENGINE/yolov5n.engine"

    print("Loading TensorRT engine...")
    engine = load_engine(engine_path)
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print("Engine loaded.")

    cap = cv2.VideoCapture(0)  # 0 for default webcam, or provide path to video file
    if not cap.isOpened():
        print("Error: Could not open camera or video file.")
        cap.release()
        return

    print("Starting real-time detection... Press 'q' to quit.")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Failed to capture frame.")
                break

            # shape is (height, width, channels)
            orig_h, orig_w = frame.shape[:2]

            img_data, ratio, dw, dh = preprocess(img_input=frame)
            np.copyto(inputs[0]['host'], img_data.ravel())

            start_time = time.perf_counter()
            out = do_inference(context, inputs, outputs, bindings, stream)
            elapsed = time.perf_counter() - start_time

            # Guard against a zero interval on coarse timers.
            fps = 1.0 / elapsed if elapsed > 0 else float("inf")
            print(f'Inference time: {elapsed * 1000:.2f} ms ({fps:.2f} FPS)')

            # Flat output buffer -> (1, 25200, 85): 25200 predictions of
            # 4 box values + objectness + 80 class scores.
            out = out[0].reshape(1, 25200, 85)
            outbox = filter_box(out, 0.5, 0.5)

            # Draw on the original frame, not on the preprocessed tensor.
            draw(frame, outbox, ratio, dw, dh, orig_w, orig_h)
            cv2.putText(frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            cv2.imshow('Real-time Detection (Press q to quit)', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        print("Releasing resources...")
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()