Object detection output layers#

import set_env
from d2py.utils.file import mkdir
root_dir = ".temp"
mkdir(f"{root_dir}/logs")
mkdir(f"{root_dir}/libs")

Configuration:

from dataclasses import dataclass

@dataclass
class Config:
    shape: tuple # input data shape
    name: str = "data" # input name
    dtype: str = "float32" # input dtype
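For instance (the values below are purely illustrative), the defaults can be overridden when constructing a configuration:

cfg = Config(shape=(1, 3, 224, 224), name="input", dtype="float16")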

Build a PyTorch face detection model#

import torch
from torch import nn

class M(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, 1, 1, 0, bias=False, groups=1)
        self.conv0 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
        self.conv00 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
        self.resize_2 = nn.Conv2d(64, 64, 1, 2, 0, bias=False, groups=1)
        self.conv1 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
        self.conv11 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
        self.resize_4 = nn.Conv2d(64, 64, 1, 4, 0, bias=False, groups=1)
        self.conv2 = nn.Conv2d(64, 48, 1, 1, 0, bias=False, groups=1)
        self.conv22 = nn.Conv2d(64, 48, 1, 1, 0, bias=False, groups=1)

    def forward(self, x):
        _x = self.conv(x)
        __x = self.resize_2(_x)
        ___x = self.resize_4(_x)
        x0 = self.conv0(_x).permute(0, 2, 3, 1) # NCHW => NHWC
        x1 = self.conv1(__x).permute(0, 2, 3, 1)
        x2 = self.conv2(___x).permute(0, 2, 3, 1)
        x0 = x0.reshape(1, -1, 3)
        x1 = x1.reshape(1, -1, 3)
        x2 = x2.reshape(1, -1, 3)
        x = torch.concat((x0, x1, x2), dim=1)

        x00 = self.conv00(_x).permute(0, 2, 3, 1) # NCHW => NHWC
        x11 = self.conv11(__x).permute(0, 2, 3, 1)
        x22 = self.conv22(___x).permute(0, 2, 3, 1)
        x00 = x00.reshape(1, -1, 4)
        x11 = x11.reshape(1, -1, 4)
        x22 = x22.reshape(1, -1, 4)
        xx = torch.concat((x00, x11, x22), dim=1)
        return torch.softmax(x, dim=2), xx
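As a quick sanity check (a small sketch, separate from the export flow below), running the model on a random 1×3×48×80 tensor shows the two heads: class scores of shape (1, 61440, 3) and box regressions of shape (1, 46080, 4):

# Smoke test: verify the two output shapes for a 1x3x48x80 input.
_m = M().eval()
with torch.no_grad():
    _cls, _bbox = _m(torch.rand(1, 3, 48, 80))
print(_cls.shape, _bbox.shape)  # torch.Size([1, 61440, 3]) torch.Size([1, 46080, 4])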

Export the ONNX model#

from torch.onnx import OperatorExportTypes, utils
import numpy as np
import onnx

model = M()
model.eval()

config = Config((1, 3, 48, 80))
data_np = np.random.rand(*config.shape).astype(config.dtype)
output_name = "test-det-class"
xx = torch.rand(*config.shape, dtype=torch.float32, requires_grad=False)
utils.export(
    model,                     # the torch model
    xx,                        # model input (or a tuple for multiple inputs)
    f"{root_dir}/{output_name}.onnx",  # where to save the model (file or file-like object)
    export_params=True,        # store the trained parameter weights inside the model file
    # opset_version=17,        # ONNX opset version to export
    do_constant_folding=True,  # whether to run constant folding for optimization
    input_names = [config.name],    # the model's input names
    output_names = ['output'], # the model's output names
    # keep_initializers_as_inputs=True,
    # export_modules_as_functions=True,
    verbose=True,
    # operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH,
    # dynamic_axes={'data' : {0 : 'batch_size'},    # variable-length axes
    #               'output' : {0 : 'batch_size'}}
)
Exported graph: graph(%data : Float(1, 3, 48, 80, strides=[11520, 3840, 80, 1], requires_grad=0, device=cpu),
      %conv.weight : Float(64, 3, 1, 1, strides=[3, 1, 1, 1], requires_grad=1, device=cpu),
      %conv0.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %conv00.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %resize_2.weight : Float(64, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %conv1.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %conv11.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %resize_4.weight : Float(64, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %conv2.weight : Float(48, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
      %conv22.weight : Float(48, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu)):
  %/conv/Conv_output_0 : Float(1, 64, 48, 80, strides=[245760, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv/Conv"](%data, %conv.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/resize_2/Conv_output_0 : Float(1, 64, 24, 40, strides=[61440, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[2, 2], onnx_name="/resize_2/Conv"](%/conv/Conv_output_0, %resize_2.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::resize_2 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/resize_4/Conv_output_0 : Float(1, 64, 12, 20, strides=[15360, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[4, 4], onnx_name="/resize_4/Conv"](%/conv/Conv_output_0, %resize_4.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::resize_4 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/conv0/Conv_output_0 : Float(1, 36, 48, 80, strides=[138240, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv0/Conv"](%/conv/Conv_output_0, %conv0.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv0 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_output_0 : Float(1, 48, 80, 36, strides=[138240, 80, 1, 3840], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose"](%/conv0/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:21:0
  %/conv1/Conv_output_0 : Float(1, 36, 24, 40, strides=[34560, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv1/Conv"](%/resize_2/Conv_output_0, %conv1.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv1 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_1_output_0 : Float(1, 24, 40, 36, strides=[34560, 40, 1, 960], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_1"](%/conv1/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:22:0
  %/conv2/Conv_output_0 : Float(1, 48, 12, 20, strides=[11520, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv2/Conv"](%/resize_4/Conv_output_0, %conv2.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv2 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_2_output_0 : Float(1, 12, 20, 48, strides=[11520, 20, 1, 240], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_2"](%/conv2/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:23:0
  %/Constant_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  3 [ CPULongType{3} ], onnx_name="/Constant"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:24:0
  %/Reshape_output_0 : Float(1, 46080, 3, strides=[138240, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape"](%/Transpose_output_0, %/Constant_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:24:0
  %/Constant_1_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  3 [ CPULongType{3} ], onnx_name="/Constant_1"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:25:0
  %/Reshape_1_output_0 : Float(1, 11520, 3, strides=[34560, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_1"](%/Transpose_1_output_0, %/Constant_1_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:25:0
  %/Constant_2_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  3 [ CPULongType{3} ], onnx_name="/Constant_2"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:26:0
  %/Reshape_2_output_0 : Float(1, 3840, 3, strides=[11520, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_2"](%/Transpose_2_output_0, %/Constant_2_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:26:0
  %/Concat_output_0 : Float(1, 61440, 3, strides=[184320, 3, 1], requires_grad=1, device=cpu) = onnx::Concat[axis=1, onnx_name="/Concat"](%/Reshape_output_0, %/Reshape_1_output_0, %/Reshape_2_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:27:0
  %/conv00/Conv_output_0 : Float(1, 36, 48, 80, strides=[138240, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv00/Conv"](%/conv/Conv_output_0, %conv00.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv00 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_3_output_0 : Float(1, 48, 80, 36, strides=[138240, 80, 1, 3840], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_3"](%/conv00/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:29:0
  %/conv11/Conv_output_0 : Float(1, 36, 24, 40, strides=[34560, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv11/Conv"](%/resize_2/Conv_output_0, %conv11.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv11 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_4_output_0 : Float(1, 24, 40, 36, strides=[34560, 40, 1, 960], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_4"](%/conv11/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:30:0
  %/conv22/Conv_output_0 : Float(1, 48, 12, 20, strides=[11520, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv22/Conv"](%/resize_4/Conv_output_0, %conv22.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv22 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
  %/Transpose_5_output_0 : Float(1, 12, 20, 48, strides=[11520, 20, 1, 240], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_5"](%/conv22/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:31:0
  %/Constant_3_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  4 [ CPULongType{3} ], onnx_name="/Constant_3"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:32:0
  %/Reshape_3_output_0 : Float(1, 34560, 4, strides=[138240, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_3"](%/Transpose_3_output_0, %/Constant_3_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:32:0
  %/Constant_4_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  4 [ CPULongType{3} ], onnx_name="/Constant_4"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:33:0
  %/Reshape_4_output_0 : Float(1, 8640, 4, strides=[34560, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_4"](%/Transpose_4_output_0, %/Constant_4_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:33:0
  %/Constant_5_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1  4 [ CPULongType{3} ], onnx_name="/Constant_5"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:34:0
  %/Reshape_5_output_0 : Float(1, 2880, 4, strides=[11520, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_5"](%/Transpose_5_output_0, %/Constant_5_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:34:0
  %38 : Float(1, 46080, 4, strides=[184320, 4, 1], requires_grad=1, device=cpu) = onnx::Concat[axis=1, onnx_name="/Concat_1"](%/Reshape_3_output_0, %/Reshape_4_output_0, %/Reshape_5_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:35:0
  %output : Float(1, 61440, 3, strides=[184320, 3, 1], requires_grad=1, device=cpu) = onnx::Softmax[axis=2, onnx_name="/Softmax"](%/Concat_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:36:0
  return (%output, %38)
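Optionally, the exported file can be validated with the ONNX checker (a quick structural sanity check; it is not required for the rest of the flow):

onnx.checker.check_model(onnx.load(f"{root_dir}/{output_name}.onnx"))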

Frontend import#

ONNX frontend:

from tvm import relay
onnx_model = onnx.load(f"{root_dir}/{output_name}.onnx")
mod, params = relay.frontend.from_onnx(onnx_model, {config.name: config.shape}, freeze_params=True)
onnx_mod = relay.transform.InferType()(mod)
onnx_mod.show()
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */;
  %1 = nn.conv2d(%0, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */;
  %2 = transpose(%1, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 48, 80, 36), float32] span=/Transpose:0:0 */;
  %3 = nn.conv2d(%0, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */;
  %4 = nn.conv2d(%3, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */;
  %5 = transpose(%4, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 24, 40, 36), float32] span=/Transpose_1:0:0 */;
  %6 = nn.conv2d(%0, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */;
  %7 = nn.conv2d(%6, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */;
  %8 = transpose(%7, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 12, 20, 48), float32] span=/Transpose_2:0:0 */;
  %9 = reshape(%2, newshape=[1, -1, 3]) /* ty=Tensor[(1, 46080, 3), float32] span=/Reshape:0:0 */;
  %10 = reshape(%5, newshape=[1, -1, 3]) /* ty=Tensor[(1, 11520, 3), float32] span=/Reshape_1:0:0 */;
  %11 = reshape(%8, newshape=[1, -1, 3]) /* ty=Tensor[(1, 3840, 3), float32] span=/Reshape_2:0:0 */;
  %12 = (%9, %10, %11) /* ty=(Tensor[(1, 46080, 3), float32], Tensor[(1, 11520, 3), float32], Tensor[(1, 3840, 3), float32]) span=/Concat:0:0 */;
  %13 = concatenate(%12, axis=1) /* ty=Tensor[(1, 61440, 3), float32] span=/Concat:0:0 */;
  %14 = nn.conv2d(%0, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */;
  %15 = transpose(%14, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 48, 80, 36), float32] span=/Transpose_3:0:0 */;
  %16 = nn.conv2d(%3, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */;
  %17 = transpose(%16, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 24, 40, 36), float32] span=/Transpose_4:0:0 */;
  %18 = nn.conv2d(%6, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */;
  %19 = transpose(%18, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 12, 20, 48), float32] span=/Transpose_5:0:0 */;
  %20 = reshape(%15, newshape=[1, -1, 4]) /* ty=Tensor[(1, 34560, 4), float32] span=/Reshape_3:0:0 */;
  %21 = reshape(%17, newshape=[1, -1, 4]) /* ty=Tensor[(1, 8640, 4), float32] span=/Reshape_4:0:0 */;
  %22 = reshape(%19, newshape=[1, -1, 4]) /* ty=Tensor[(1, 2880, 4), float32] span=/Reshape_5:0:0 */;
  %23 = (%20, %21, %22) /* ty=(Tensor[(1, 34560, 4), float32], Tensor[(1, 8640, 4), float32], Tensor[(1, 2880, 4), float32]) span=/Concat_1:0:0 */;
  %24 = nn.softmax(%13, axis=2) /* ty=Tensor[(1, 61440, 3), float32] span=/Softmax:0:0 */;
  %25 = concatenate(%23, axis=1) /* ty=Tensor[(1, 46080, 4), float32] span=/Concat_1:0:0 */;
  (%24, %25) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}

Verify numerical consistency:

import onnxruntime

ort_session = onnxruntime.InferenceSession(f"{root_dir}/{output_name}.onnx", providers=["CPUExecutionProvider"])
# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: data_np}
ort_outputs = ort_session.run(None, ort_inputs)
with torch.no_grad():
    torch_outputs = model(torch.from_numpy(data_np))
    torch_outputs = [o.numpy() for o in torch_outputs]
[
    np.testing.assert_allclose(ort_output, torch_output, rtol=1e-07, atol=1e-5) 
    for ort_output, torch_output in zip(ort_outputs, torch_outputs)
];
import logging
import textwrap
import numpy as np
from tvm.relay.testing import run_infer_type
from tvm.relay.dataflow_pattern import (
    wildcard, is_op, is_tuple,
    is_constant, is_tuple_get_item,
    DFPatternCallback,
    rewrite
)
import tvm
from tvm.relay import transform as _transform
from special_op import det_class_predict, det_bbox_predict
class TransposeReshapeConcatRewrite(DFPatternCallback):
    """Fuse transpose + reshape + concatenate (+ softmax) into det_class_predict,
    or into det_bbox_predict when no softmax follows the concatenation.
    """
    def __init__(self):
        super().__init__()
        axes = (0, 2, 3, 1)
        # newshape = (1, -1, group)
        self.x0 = wildcard()
        self.x1 = wildcard()
        self.x2 = wildcard()
        self.transpose0 = is_op("transpose")(self.x0).has_attr({"axes": axes})
        self.reshape0 = is_op("reshape")(self.transpose0) #.has_attr({"newshape": newshape})
        self.transpose1 = is_op("transpose")(self.x1).has_attr({"axes": axes})
        self.reshape1 = is_op("reshape")(self.transpose1) #.has_attr({"newshape": newshape})
        self.transpose2 = is_op("transpose")(self.x2).has_attr({"axes": axes})
        self.reshape2 = is_op("reshape")(self.transpose2) #.has_attr({"newshape": newshape})
        self.tuple_op = is_tuple((self.reshape0, self.reshape1, self.reshape2))
        self.cat = is_op("concatenate")(self.tuple_op).has_attr({"axis": 1})
        self.softmax = is_op("nn.softmax")(self.cat)
        self.output = self.softmax | self.cat
        self.pattern = self.output

    def callback(self, pre, post, node_map):
        x0 = node_map[self.x0][0]
        x1 = node_map[self.x1][0]
        x2 = node_map[self.x2][0]
        reshape2 = node_map[self.reshape2][0]
        shape = _transform.InferTypeLocal(reshape2).shape
        softmax = node_map.get(self.softmax, [])
        if softmax:
            return det_class_predict(x0, x1, x2, class_num=int(shape[-1]))
        else:
            return det_bbox_predict(x0, x1, x2)
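Here det_class_predict and det_bbox_predict come from the local special_op helper module, whose implementation is not shown in this notebook. Purely as a hypothetical sketch (every name below, such as demo_special.det_class_predict and _det_class_predict_rel, is an assumption and not the real special_op code), a fused op like this could be declared with TVM's Python custom-op API roughly as follows:

# Hypothetical sketch only -- not the real special_op implementation.
from tvm import ir
from tvm.relay.op import op as _reg

_DEMO_OP = "demo_special.det_class_predict"  # demo name, to avoid clashing with special_op

def _det_class_predict_rel(arg_types, attrs):
    # Output: (1, sum(C*H*W) / class_num, class_num) with the inputs' dtype.
    class_num = int(attrs["class_num"])
    total = sum(int(t.shape[1]) * int(t.shape[2]) * int(t.shape[3]) for t in arg_types)
    return relay.TensorType((1, total // class_num, class_num), arg_types[0].dtype)

_reg.register(_DEMO_OP)              # create an (initially empty) operator
_demo_op = _reg.get(_DEMO_OP)
_demo_op.set_num_inputs(3)
_demo_op.add_argument("x0", "Tensor", "stride-1 feature map")
_demo_op.add_argument("x1", "Tensor", "stride-2 feature map")
_demo_op.add_argument("x2", "Tensor", "stride-4 feature map")
_demo_op.set_attrs_type_key("DictAttrs")
_demo_op.add_type_rel("DemoDetClassPredictRel", _det_class_predict_rel)
_demo_op.set_support_level(1)

def demo_det_class_predict(x0, x1, x2, class_num=3):
    """Build a call to the demo op; attributes travel as DictAttrs."""
    attrs = ir.make_node("DictAttrs", class_num=class_num)
    return relay.Call(_demo_op, [x0, x1, x2], attrs)

A compute and schedule (an FTVMStrategy, as in the commented-out block further below) would still be needed before such an op can actually be built.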
from copy import deepcopy

origin_mod = deepcopy(mod)
mod["main"] = rewrite(TransposeReshapeConcatRewrite(), mod["main"])
mod.show()
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */;
  %1 = nn.conv2d(%0, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */;
  %2 = nn.conv2d(%0, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */;
  %3 = nn.conv2d(%0, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */;
  %4 = nn.conv2d(%1, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */;
  %5 = nn.conv2d(%2, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */;
  %6 = nn.conv2d(%0, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */;
  %7 = nn.conv2d(%1, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */;
  %8 = nn.conv2d(%2, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */;
  %9 = llvm_special.det_class_predict(%3, %4, %5, __dict__={"class_num"=3});
  %10 = llvm_special.det_bbox_predict(%6, %7, %8);
  (%9, %10) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}
target = "llvm"
dev = tvm.device(target, 0)
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target)
func = lib[lib.libmod_name]
module = tvm.contrib.graph_executor.GraphModule(func(dev))
module.run(**{config.name: data_np})
outputs = [module.get_output(k).numpy() for k in range(2)]
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(origin_mod, target)
func = lib[lib.libmod_name]
module = tvm.contrib.graph_executor.GraphModule(func(dev))
module.run(**{config.name: data_np})
origin_outputs = [module.get_output(k).numpy() for k in range(2)]
with torch.no_grad():
    torch_outputs = [o.numpy() for o in model(torch.from_numpy(data_np))]
[
    np.testing.assert_allclose(torch_output, origin_output, rtol=1e-07, atol=1e-05)
    for torch_output, origin_output in zip(torch_outputs, origin_outputs)
]
[
    np.testing.assert_allclose(torch_output, output, rtol=1e-07, atol=1e-05)
    for torch_output, output in zip(torch_outputs, outputs)
];
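For a rough speed comparison of the rewritten module against the original one, GraphModule.benchmark can be used (a sketch only; absolute numbers depend on the machine and on the untuned llvm schedules):

# Rough, machine-dependent timing of the fused vs. original builds.
for tag, m in {"fused": mod, "origin": origin_mod}.items():
    with tvm.transform.PassContext(opt_level=3):
        _lib = relay.build(m, target)
    _rt = graph_executor.GraphModule(_lib[_lib.libmod_name](dev))
    _rt.set_input(**{config.name: data_np})
    print(tag, _rt.benchmark(dev, number=10, repeat=3))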
from tvm.relay.op import op as _op
from special_op.utils import schedule_special_op
# from tvm import te, topi
# from tvm.relay.op import strategy as _strategy
# from tvm.relay.op.op import OpPattern, OpStrategy

# def det_bbox_predict_compute(attrs, inputs, out_type):
#     """det_bbox_predict Relay 计算"""
#     assert len(inputs) == 3, "输入参数数量不为 3"
#     x0, x1, x2 = inputs
#     bbox_size = 4
#     x0 = topi.transpose(x0, [0, 2, 3, 1])
#     x0 = topi.reshape(x0, [1, x0.shape[1]*x0.shape[2]*x0.shape[3]//bbox_size, bbox_size])
#     x1 = topi.transpose(x1, [0, 2, 3, 1])
#     x1 = topi.reshape(x1, [1, x1.shape[1]*x1.shape[2]*x1.shape[3]//bbox_size, bbox_size])
#     x2 = topi.transpose(x2, [0, 2, 3, 1])
#     x2 = topi.reshape(x2, [1, x2.shape[1]*x2.shape[2]*x2.shape[3]//bbox_size, bbox_size])
#     x = topi.concatenate([x0, x1, x2], axis=1)
#     print(x)
#     return [x]


# op_name = "vta_special.det_bbox_predict"
# def det_bbox_predict_strategy_vta(attrs, inputs, out_type, target):
#     strategy = OpStrategy()
#     strategy.add_implementation(
#         det_bbox_predict_compute,
#         schedule_special_op,
#         name=f"{op_name}.llvm",
#     )
#     return strategy


# _op.get(op_name).get_attr("FTVMStrategy").register(det_bbox_predict_strategy_vta, "llvm", allow_override=True)
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, "llvm")
# func = lib[lib.libmod_name]
# module = tvm.contrib.graph_executor.GraphModule(func(tvm.ext_dev(0)))
# module.run(**{config.name: data_np})
# outputs = [module.get_output(k).numpy() for k in range(2)]
from tvm.relay.dataflow_pattern import wildcard, is_constant, is_op
# import logging

def is_QPartitionExpr(op):
    r = is_op("annotation.cast_hint")(op)
    r = is_op("annotation.stop_fusion")(r)
    return r

def debug_partition(func):
    """Wrap a pattern maker so the returned pattern also matches an optional
    trailing ``annotation.cast_hint`` node (as produced by the quantization flow)."""
    def wrapper():
        # logging.debug(f"enter {func.__name__}()")
        r = func()
        r = is_op("annotation.cast_hint")(r) | r
        # r = is_op("annotation.stop_fusion")(r) | r
        return r
    return wrapper

@debug_partition
def make_conv_add_squeeze_pattern():
    x = wildcard()
    w = wildcard()
    bias = wildcard()
    x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant())
    x_ = x_ | x
    w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant())
    w_ = w_ | w
    bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
    conv_node = is_op("nn.conv2d")(x_, w_)
    r = is_op("add")(conv_node, bias_)
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    r = is_op("squeeze")(r)
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_conv_add_relu_max_pool2d_pattern():
    x = wildcard()
    w = wildcard()
    bias = wildcard()
    # x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
    # w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
    bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
    conv_node = is_op("nn.conv2d")(x, w)
    r = is_op("add")(conv_node, bias_) | conv_node
    r = is_op("nn.relu")(r) | is_op("nn.prelu")(r, wildcard())
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    xx = is_op("annotation.cast_hint")(r)
    xx = is_op("annotation.stop_fusion")(xx)
    # xx = is_op("nn.max_pool2d")(xx)
    r = is_op("nn.max_pool2d")(xx|r).has_attr({
        "padding": [0, 0, 0, 0],
        # "ceil_mode": False,
    })
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_conv_add_activate_pattern():
    r"""Create a pattern to match the following graph.

    conv2d
        |
    add
        |
        (relu|relu6|prelu|sigmoid|relux)
    """
    x = wildcard()
    w = wildcard()
    bias = wildcard()
    alpha = wildcard()
    # x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
    # w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
    bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
    # alpha_ = is_op("relay.op.annotation.simulated_quantize")(alpha, is_constant(), is_constant(), is_constant()) | alpha
    conv_node = is_op("nn.conv2d")(x, w)
    r = is_op("add")(conv_node, bias_) | is_op("nn.bias_add")(conv_node, bias_) | conv_node
    
    # activation functions
    r1 = r.optional(lambda x: is_op("nn.relu")(x))
    r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
    r3 = r.optional(lambda x: is_op("nn.prelu")(x, alpha)) # prelu
    r4 = r.optional(lambda x: is_op("sigmoid")(x)) # sigmoid
    # r5 = r.optional(lambda x: is_op("nn.relux")(x, alpha)) # relux
    # r6 = r.optional(lambda x: is_op("silu")(x)) # silu
    # r7 = r.optional(lambda x: is_op("hard_sigmoid")(x)) # hard_sigmoid
    # r8 = r.optional(lambda x: is_op("hard_swish")(x)) # hard_swish
    r = r1 | r2 | r3 | r4

    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r
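As a quick illustration (a hypothetical check, not part of the partitioning flow), this pattern should also match a plain float conv2d → bias_add → relu chain, i.e. with no simulated_quantize nodes present:

# Hypothetical sanity check for the conv+add+activation pattern.
_x = relay.var("x", shape=(1, 3, 8, 8))
_w = relay.var("w", shape=(8, 3, 3, 3))
_b = relay.var("b", shape=(8,))
_expr = relay.nn.relu(relay.nn.bias_add(relay.nn.conv2d(_x, _w), _b))
print(make_conv_add_activate_pattern().match(_expr))  # expected: True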

@debug_partition
def make_conv2d_transpose_add_activate_pattern():
    r"""Create a pattern to match the following graph.

    conv2d_transpose
        |
    add
    """
    x = wildcard()
    w = wildcard()
    bias = wildcard()
    alpha = wildcard()
    alpha_ = is_op("relay.op.annotation.simulated_quantize")(alpha, is_constant(), is_constant(), is_constant()) | alpha
    # x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
    # w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
    bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
    r = is_op("nn.conv2d_transpose")(x, w)
    r = is_op("add")(r, bias_) | r
    
    # activation functions
    r1 = r.optional(lambda x: is_op("nn.relu")(x))
    r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
    r3 = r.optional(lambda x: is_op("nn.prelu")(x, alpha)) # prelu
    r4 = r.optional(lambda x: is_op("sigmoid")(x)) # sigmoid
    r = r1 | r2 | r3 | r4

    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    # r = is_QPartitionExpr(r) | r
    # r = is_op("strided_slice")(r)
    return r

@debug_partition
def make_max_pool2d_pattern():
    x = wildcard()
    r = is_op("nn.max_pool2d")(x)
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_strided_slice_pattern():
    x = wildcard()
    r = is_op("strided_slice")(x)
    return r

@debug_partition
def make_concat_pattern():
    x = wildcard()
    r = is_op("concatenate")(x)
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_avg_pool2d_pattern():
    x = wildcard()
    r = is_op("nn.avg_pool2d")(x) 
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_adaptive_avg_pool2d_pattern():
    x = wildcard()
    r = is_op("nn.adaptive_avg_pool2d")(x) 
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_dense_add_pattern():
    r"""Create a pattern to match the following graph.

      nn.dense
        |
       add
    """
    x = wildcard()
    y = wildcard()
    w = wildcard()
    # x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
    # y_ = is_op("relay.op.annotation.simulated_quantize")(y, is_constant(), is_constant(), is_constant()) | y
    w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
    node = is_op("nn.dense")(x, y)
    r = is_op("add")(node, w_) | node
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_add_add_pattern():
    """
    z = x + y
    z2 = z + c1
    where c1 is a constant
    """
    c1 = is_constant()
    c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1

    r = wildcard() + wildcard()
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    r = r + c1_
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_multiply_add_pattern():
    """
    z = x * y
    z2 = z + c1
    """
    c1 = is_constant()
    c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1

    r = wildcard() * wildcard()
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    r = r + c1_
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_add_multiply_add_pattern():
    """
    z = x + y
    z2 = z * c1 + c2
    where c1 and c2 are constants
    """
    c1 = is_constant()
    c2 = is_constant()
    c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1
    c2_ = is_op("relay.op.annotation.simulated_quantize")(c2, is_constant(), is_constant(), is_constant()) | c2

    r = wildcard() + wildcard()
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    r = r * c1_
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    r = r + c2_
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_add_pattern():
    r"""Create a pattern to match the following graph.

      add
        |
      relu|relu6
    """
    r = wildcard() + wildcard()
    r1 = r.optional(lambda x: is_op("nn.relu")(x))
    r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
    r = r1 | r2
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

@debug_partition
def make_multiply_pattern():
    r"""Create a pattern to match the following graph.

      multiply
        |
      relu|relu6
    """
    r = wildcard() * wildcard()
    r1 = r.optional(lambda x: is_op("nn.relu")(x))
    r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
    r = r1 | r2
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r

def make_reshape_squeeze_pattern():
    x = wildcard()
    r = is_op("reshape")(x)
    r = is_op("squeeze")(r)
    r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
    return r
# Configure the fusion (composite) rules
compiler_name = "vta_special"
pattern_table = [
    # (f"{compiler_name}.concat", make_concat_pattern()),
    (f"{compiler_name}.conv_add_squeeze", make_conv_add_squeeze_pattern()), # mobilenet_v2_tf 最后一层
    (f"{compiler_name}.conv_add_relu_max_pool2d", make_conv_add_relu_max_pool2d_pattern()),
    (f"{compiler_name}.conv2d_transpose_add_activate", make_conv2d_transpose_add_activate_pattern()),
    (f"{compiler_name}.conv_add_activate", make_conv_add_activate_pattern()),
    (f"{compiler_name}.max_pool2d", make_max_pool2d_pattern()),
    (f"{compiler_name}.dense_add", make_dense_add_pattern()),
    (f"{compiler_name}.adaptive_avg_pool2d", make_adaptive_avg_pool2d_pattern()),
    (f"{compiler_name}.avg_pool2dd", make_avg_pool2d_pattern()),
    (f"{compiler_name}.add_multiply_add", make_add_multiply_add_pattern()), # kr_karen
    (f"{compiler_name}.add_add", make_add_add_pattern()),
    (f"{compiler_name}.multiply_add", make_multiply_add_pattern()),
    (f"{compiler_name}.add", make_add_pattern()),
    (f"{compiler_name}.multiply", make_multiply_pattern()),
    # ("det_class_predict_temp", make_det_class_predict_temp_pattern()),
    # ("det_bbox_predict_temp",make_det_bbox_predict_temp_pattern())
    # (f"{compiler_name}.strided_slice", make_strided_slice_pattern()),
]
merge_passes = tvm.transform.Sequential([
    relay.transform.InferType(),
    relay.transform.MergeComposite(pattern_table),
])
run_mod = deepcopy(mod)
with tvm.transform.PassContext(opt_level=3):
    partition_mod_t = merge_passes(run_mod)
print(partition_mod_t)
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
  %0 = fn (%FunctionVar_8_0: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] */, %FunctionVar_8_1: Tensor[(64, 3, 1, 1), float32] /* ty=Tensor[(64, 3, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 48, 80), float32] {
    nn.conv2d(%FunctionVar_8_0, %FunctionVar_8_1, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 3, 48, 80), float32], Tensor[(64, 3, 1, 1), float32]) -> Tensor[(1, 64, 48, 80), float32] */;
  %1 = %0(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */) /* ty=Tensor[(1, 64, 48, 80), float32] */;
  %2 = fn (%FunctionVar_7_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_7_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 48, 80), float32] {
    nn.conv2d(%FunctionVar_7_0, %FunctionVar_7_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 48, 80), float32] */;
  %3 = fn (%FunctionVar_6_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_6_1: Tensor[(64, 64, 1, 1), float32] /* ty=Tensor[(64, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 24, 40), float32] {
    nn.conv2d(%FunctionVar_6_0, %FunctionVar_6_1, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(64, 64, 1, 1), float32]) -> Tensor[(1, 64, 24, 40), float32] */;
  %4 = %3(%1, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */) /* ty=Tensor[(1, 64, 24, 40), float32] */;
  %5 = fn (%FunctionVar_5_0: Tensor[(1, 64, 24, 40), float32] /* ty=Tensor[(1, 64, 24, 40), float32] */, %FunctionVar_5_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 24, 40), float32] {
    nn.conv2d(%FunctionVar_5_0, %FunctionVar_5_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 24, 40), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 24, 40), float32] */;
  %6 = fn (%FunctionVar_4_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_4_1: Tensor[(64, 64, 1, 1), float32] /* ty=Tensor[(64, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 12, 20), float32] {
    nn.conv2d(%FunctionVar_4_0, %FunctionVar_4_1, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(64, 64, 1, 1), float32]) -> Tensor[(1, 64, 12, 20), float32] */;
  %7 = %6(%1, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */) /* ty=Tensor[(1, 64, 12, 20), float32] */;
  %8 = fn (%FunctionVar_3_0: Tensor[(1, 64, 12, 20), float32] /* ty=Tensor[(1, 64, 12, 20), float32] */, %FunctionVar_3_1: Tensor[(48, 64, 1, 1), float32] /* ty=Tensor[(48, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 48, 12, 20), float32] {
    nn.conv2d(%FunctionVar_3_0, %FunctionVar_3_1, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 12, 20), float32], Tensor[(48, 64, 1, 1), float32]) -> Tensor[(1, 48, 12, 20), float32] */;
  %9 = %2(%1, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */) /* ty=Tensor[(1, 36, 48, 80), float32] */;
  %10 = %5(%4, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */) /* ty=Tensor[(1, 36, 24, 40), float32] */;
  %11 = %8(%7, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */) /* ty=Tensor[(1, 48, 12, 20), float32] */;
  %12 = fn (%FunctionVar_2_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_2_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 48, 80), float32] {
    nn.conv2d(%FunctionVar_2_0, %FunctionVar_2_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 48, 80), float32] */;
  %13 = fn (%FunctionVar_1_0: Tensor[(1, 64, 24, 40), float32] /* ty=Tensor[(1, 64, 24, 40), float32] */, %FunctionVar_1_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 24, 40), float32] {
    nn.conv2d(%FunctionVar_1_0, %FunctionVar_1_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 24, 40), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 24, 40), float32] */;
  %14 = fn (%FunctionVar_0_0: Tensor[(1, 64, 12, 20), float32] /* ty=Tensor[(1, 64, 12, 20), float32] */, %FunctionVar_0_1: Tensor[(48, 64, 1, 1), float32] /* ty=Tensor[(48, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 48, 12, 20), float32] {
    nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */
  } /* ty=fn (Tensor[(1, 64, 12, 20), float32], Tensor[(48, 64, 1, 1), float32]) -> Tensor[(1, 48, 12, 20), float32] */;
  %15 = %12(%1, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */) /* ty=Tensor[(1, 36, 48, 80), float32] */;
  %16 = %13(%4, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */) /* ty=Tensor[(1, 36, 24, 40), float32] */;
  %17 = %14(%7, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */) /* ty=Tensor[(1, 48, 12, 20), float32] */;
  %18 = llvm_special.det_class_predict(%9, %10, %11, __dict__={"class_num"=3}) /* ty=Tensor[(1, 61440, 3), float32] */;
  %19 = llvm_special.det_bbox_predict(%15, %16, %17) /* ty=Tensor[(1, 46080, 4), float32] */;
  (%18, %19) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}