PP-DocLayoutV3版面区域检测模型部署

最新推荐文章于 2026-04-11 04:08:25 发布

原创最新推荐文章于 2026-04-11 04:08:25 发布 · 80 阅读

1 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#OCR #python

机器学习专栏收录该内容

20 篇文章

订阅专栏

sensevoice-small-轻量级多任务语音模型的 ONNX 量化版WebUI V1.0

端侧应用：手机 / 平板 / 嵌入式设备的离线语音助手、实时字幕。边缘计算：无 GPU 服务器的语音转写、客服质检、会议纪要。隐私敏感场景：医疗 / 金融等需本地处理语音数据的业务。低资源环境：带宽有限或算力不足的设备与场景。

部署方法

使用模型的onnx版本进行推理，模型在此处下载。
依赖安装：

pip install numpy opencv-python onnxruntime

python推理

此类封装了推理和可视化：

# DocLayoutV3模型推理和可视化

import numpy as np
import cv2
import onnxruntime as ort
import json


class DocLayoutV3Detector:
    """Run a PP-DocLayoutV3 ONNX model and render or serialize its detections.

    Every detection row handled by this class is indexed as
    ``[class_id, score, xmin, ymin, xmax, ymax, reading_order]``.
    """

    # Class names; the position in this list is the class id emitted by the model.
    LABEL_MAP = [
        "abstract", "algorithm", "aside_text", "chart", "content",
        "display_formula", "doc_title", "figure_title", "footer",
        "footer_image", "footnote", "formula_number", "header",
        "header_image", "image", "inline_formula", "number",
        "paragraph_title", "reference", "reference_content",
        "seal", "table", "text", "vertical_text", "vision_footnote"
    ]

    def __init__(self, model_path, input_size=(800, 800), conf_threshold=0.5):
        """Load the ONNX model and prepare the per-class colour palette.

        Args:
            model_path: Path to the ONNX model file.
            input_size: Model input size as ``(height, width)``.
            conf_threshold: Detections whose score is not strictly greater
                than this value are discarded by ``postprocess``.
        """
        self.model_path = model_path
        self.input_size = input_size
        self.conf_threshold = conf_threshold
        self.session = ort.InferenceSession(model_path)
        self.input_names = [i.name for i in self.session.get_inputs()]
        self.output_names = [o.name for o in self.session.get_outputs()]
        self.colors = self._generate_colors(len(self.LABEL_MAP))

    def _generate_colors(self, num_classes):
        """Return one BGR colour tuple per class, spread evenly over the hue wheel.

        The palette is fully deterministic, so no random seed is touched
        (the previous version reseeded NumPy's global RNG without using it).
        """
        colors = []
        for i in range(num_classes):
            # OpenCV stores 8-bit HSV hue as degrees / 2, i.e. in [0, 180);
            # scaling by 180 keeps every class on a distinct, valid hue.
            hue = int(180 * i / num_classes)
            color = cv2.cvtColor(
                np.uint8([[[hue, 255, 255]]]),
                cv2.COLOR_HSV2BGR
            )[0][0]
            colors.append(tuple(int(c) for c in color))
        return colors

    def _label_name(self, label_idx):
        """Map a class id to its name, or ``"unknown"`` when out of range.

        Negative ids are rejected explicitly; otherwise Python's negative
        indexing would silently pick a label from the end of ``LABEL_MAP``.
        """
        if 0 <= label_idx < len(self.LABEL_MAP):
            return self.LABEL_MAP[label_idx]
        return "unknown"

    @staticmethod
    def _load_image(image):
        """Return *image* as an array, reading it from disk when it is a path.

        Raises:
            ValueError: If *image* is ``None`` or the path cannot be read
                or decoded (``cv2.imread`` returns ``None`` in that case).
        """
        if isinstance(image, str):
            path = image
            image = cv2.imread(path)
            if image is None:
                raise ValueError(f"Could not read or decode image: {path}")
        elif image is None:
            raise ValueError("image must be a file path or an image array, got None")
        return image

    def preprocess(self, image):
        """Resize and normalize a BGR image into the model's input tensor.

        Args:
            image: BGR image array of shape ``(H, W, 3)``.

        Returns:
            Tuple ``(blob, scale_h, scale_w)`` where ``blob`` is a float32
            array of shape ``(1, 3, target_h, target_w)`` and the scales are
            the target/original size ratios per axis.
        """
        orig_h, orig_w = image.shape[:2]
        target_h, target_w = self.input_size
        scale_h = target_h / orig_h
        scale_w = target_w / orig_w
        # The image is stretched to the target size; aspect ratio is not kept.
        resized = cv2.resize(
            image,
            (target_w, target_h),
            interpolation=cv2.INTER_LINEAR
        )
        img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        # Per-channel mean/std normalization (RGB order).
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        img = (img - mean) / std
        # HWC -> CHW, then add the batch axis.
        img = img.transpose(2, 0, 1)[np.newaxis, ...]
        return img, scale_h, scale_w

    def infer(self, image):
        """Run the model on one image and return its first output.

        The feed is built positionally: input 0 receives the input size,
        input 1 the image tensor and input 2 the ``(scale_h, scale_w)`` pair.
        NOTE(review): this assumes the exported graph declares its inputs in
        exactly that order — confirm against the model.

        Args:
            image: BGR image array.

        Returns:
            The first output of the session run.
        """
        input_blob, scale_h, scale_w = self.preprocess(image)
        preprocess_shape = [np.array(self.input_size, dtype=np.float32)]
        input_feed = {
            self.input_names[0]: preprocess_shape,
            self.input_names[1]: input_blob,
            self.input_names[2]: [[scale_h, scale_w]]
        }
        outputs = self.session.run(self.output_names, input_feed)[0]
        return outputs

    def postprocess(self, outputs):
        """Drop low-confidence rows and sort the rest by reading order.

        Args:
            outputs: 2-D array of detection rows.

        Returns:
            Rows whose score (column 1) exceeds ``conf_threshold``, sorted
            ascending by reading order (column 6).
        """
        boxes = outputs[outputs[:, 1] > self.conf_threshold]
        # A stable sort keeps the model's own order for equal reading-order values.
        boxes = boxes[np.argsort(boxes[:, 6], kind="stable")]
        return boxes

    def to_json(self, boxes):
        """Convert detection rows into a JSON-serializable dict.

        Args:
            boxes: Iterable of detection rows.

        Returns:
            ``{"layout_results": [...]}`` where each entry has ``type``
            (class name or ``"unknown"``), ``points`` (the four box corners,
            clockwise from top-left) and ``confidence`` (rounded to 2 places).
        """
        layout_results = []
        for box in boxes:
            label_idx = int(box[0])
            score = float(box[1])
            xmin, ymin, xmax, ymax = (float(v) for v in box[2:6])
            layout_results.append({
                "type": self._label_name(label_idx),
                "points": [
                    [xmin, ymin],
                    [xmax, ymin],
                    [xmax, ymax],
                    [xmin, ymax]
                ],
                "confidence": round(score, 2)
            })
        return {"layout_results": layout_results}

    def visualize(self, image, boxes, output_path=None, alpha=0.35):
        """Draw translucent boxes and text labels on a copy of *image*.

        Args:
            image: BGR image array; it is not modified.
            boxes: Detection rows to draw.
            output_path: If given, the rendered image is also written there.
            alpha: Weight of the filled boxes when blended over the image.

        Returns:
            The rendered image.
        """
        vis_image = image.copy()
        overlay = vis_image.copy()
        h, w = vis_image.shape[:2]
        font_scale = max(0.5, min(h, w) / 1000)

        # Pass 1: filled rectangles go on the overlay so they can be blended.
        for box in boxes:
            label_idx = int(box[0])
            xmin, ymin, xmax, ymax = map(int, box[2:6])
            color = self.colors[label_idx % len(self.colors)]
            cv2.rectangle(
                overlay,
                (xmin, ymin),
                (xmax, ymax),
                color,
                -1
            )

        # Blend the overlay with the untouched copy to get translucent boxes.
        vis_image = cv2.addWeighted(overlay, alpha, vis_image, 1 - alpha, 0)

        # Pass 2: labels are drawn after blending so the text stays opaque.
        for box in boxes:
            label_idx = int(box[0])
            score = float(box[1])
            xmin, ymin, xmax, ymax = map(int, box[2:6])
            read_order = int(box[6])
            # Same out-of-range handling as to_json, instead of an IndexError.
            label_name = self._label_name(label_idx)
            text = f"{label_name}|{score:.2f}#{read_order}"
            cv2.putText(
                vis_image,
                text,
                # Keep the label at least 20 px from the top edge.
                (xmin, max(ymin - 5, 20)),
                cv2.FONT_HERSHEY_SIMPLEX,
                font_scale,
                (255, 0, 0),
                1
            )

        if output_path:
            cv2.imwrite(output_path, vis_image)

        return vis_image

    # ---------------------------------------------------------
    # Public interface

    def predict(self, image):
        """Detect layout regions in an image.

        Args:
            image: File path or BGR image array.

        Returns:
            Filtered detection rows sorted by reading order.

        Raises:
            ValueError: If the image is ``None`` or cannot be read.
        """
        image = self._load_image(image)
        outputs = self.infer(image)
        return self.postprocess(outputs)

    def predict_json(self, image):
        """Detect layout regions and return them in the ``to_json`` format."""
        boxes = self.predict(image)
        return self.to_json(boxes)

    def predict_and_visualize(self, image, save_path=None):
        """Detect layout regions and render them on the image.

        Args:
            image: File path or BGR image array.
            save_path: Optional path where the rendered image is written.

        Returns:
            Tuple ``(boxes, vis)`` of detection rows and the rendered image.

        Raises:
            ValueError: If the image is ``None`` or cannot be read.
        """
        # Load once so detection and rendering share the same array.
        image = self._load_image(image)
        boxes = self.predict(image)
        vis = self.visualize(image, boxes, save_path)
        return boxes, vis


if __name__ == '__main__':
    # Demo: detect the layout of one sample page, save the rendering and
    # print the detections as JSON.
    image_path = "./test_cases/complex_latex.png"
    detector = DocLayoutV3Detector(
        "./PP-DocLayoutV3.onnx")
    # Run inference once and reuse the boxes for both outputs, instead of
    # decoding the image and running the model a second time.
    boxes, vis = detector.predict_and_visualize(
        image=image_path,
        save_path="./vis.jpg"
    )
    result = detector.to_json(boxes)
    print(json.dumps(result, indent=2, ensure_ascii=False))

win7 兼容

Win7 上可用的 onnxruntime 最高版本为 1.5，它最高只支持 ONNX IR version 8，因此无法直接加载上述模型。
所以需要对上面的原始 ONNX 模型进行转换，把模型的 IR 版本改为旧版 onnxruntime 能够识别的版本：

# pip install onnx
import onnx

# Both paths resolve to the same file in the working directory, so the
# converted model replaces the original one in place.
SOURCE_MODEL = "./PP-DocLayoutV3.onnx"
TARGET_MODEL = "PP-DocLayoutV3.onnx"
TARGET_IR_VERSION = 8

# Load the model and report the IR version it was exported with.
model = onnx.load(SOURCE_MODEL)
print("Original IR version:", model.ir_version)

# Only the IR version field of the model is rewritten here; nothing else
# in the loaded model is modified before saving.
model.ir_version = TARGET_IR_VERSION

# Write the patched model back to disk.
onnx.save(model, TARGET_MODEL)