Object detection output layer#
import set_env
from d2py.utils.file import mkdir
root_dir = ".temp"
mkdir(f"{root_dir}/logs")
mkdir(f"{root_dir}/libs")
Configuration:
from dataclasses import dataclass
@dataclass
class Config:
    shape: tuple # input data shape
    name: str = "data" # input data name
    dtype: str = "float32" # input data dtype
Build the PyTorch face-detection model#
import torch
from torch import nn
class M(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 64, 1, 1, 0, bias=False, groups=1)
self.conv0 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
self.conv00 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
self.resize_2 = nn.Conv2d(64, 64, 1, 2, 0, bias=False, groups=1)
self.conv1 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
self.conv11 = nn.Conv2d(64, 36, 1, 1, 0, bias=False, groups=1)
self.resize_4 = nn.Conv2d(64, 64, 1, 4, 0, bias=False, groups=1)
self.conv2 = nn.Conv2d(64, 48, 1, 1, 0, bias=False, groups=1)
self.conv22 = nn.Conv2d(64, 48, 1, 1, 0, bias=False, groups=1)
def forward(self, x):
_x = self.conv(x)
__x = self.resize_2(_x)
___x = self.resize_4(_x)
x0 = self.conv0(_x).permute(0, 2, 3, 1) # NCHW => NHWC
x1 = self.conv1(__x).permute(0, 2, 3, 1)
x2 = self.conv2(___x).permute(0, 2, 3, 1)
x0 = x0.reshape(1, -1, 3)
x1 = x1.reshape(1, -1, 3)
x2 = x2.reshape(1, -1, 3)
x = torch.concat((x0, x1, x2), dim=1)
x00 = self.conv00(_x).permute(0, 2, 3, 1) # NCHW => NHWC
x11 = self.conv11(__x).permute(0, 2, 3, 1)
x22 = self.conv22(___x).permute(0, 2, 3, 1)
x00 = x00.reshape(1, -1, 4)
x11 = x11.reshape(1, -1, 4)
x22 = x22.reshape(1, -1, 4)
xx = torch.concat((x00, x11, x22), dim=1)
return torch.softmax(x, dim=2), xx
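A quick shape check (a small sketch, independent of the export below) confirms the two heads: the class branch flattens to (1, 61440, 3) and the bbox branch to (1, 46080, 4):
_m = M().eval()
with torch.no_grad():
    _scores, _boxes = _m(torch.rand(1, 3, 48, 80))
print(_scores.shape, _boxes.shape)  # torch.Size([1, 61440, 3]) torch.Size([1, 46080, 4])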
Export the ONNX model#
from torch.onnx import OperatorExportTypes, utils
import numpy as np
import onnx
model = M()
model.eval()
config = Config((1, 3, 48, 80))
data_np = np.random.rand(*config.shape).astype(config.dtype)
output_name = "test-det-class"
xx = torch.rand(*config.shape, dtype=torch.float32, requires_grad=False)
utils.export(
    model, # the torch model
    xx, # model input (or a tuple for multiple inputs)
    f"{root_dir}/{output_name}.onnx", # where to save the model (a file or file-like object)
    export_params=True, # store the trained parameter weights inside the model file
    # opset_version=17, # the ONNX opset version to export to
    do_constant_folding=True, # whether to apply constant folding for optimization
    input_names = [config.name], # the model's input names
    output_names = ['output'], # the model's output names
    # keep_initializers_as_inputs=True,
    # export_modules_as_functions=True,
    verbose=True,
    # operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH,
    # dynamic_axes={'data' : {0 : 'batch_size'}, # variable-length axes
    #               'output' : {0 : 'batch_size'}}
)
Exported graph: graph(%data : Float(1, 3, 48, 80, strides=[11520, 3840, 80, 1], requires_grad=0, device=cpu),
%conv.weight : Float(64, 3, 1, 1, strides=[3, 1, 1, 1], requires_grad=1, device=cpu),
%conv0.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%conv00.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%resize_2.weight : Float(64, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%conv1.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%conv11.weight : Float(36, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%resize_4.weight : Float(64, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%conv2.weight : Float(48, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu),
%conv22.weight : Float(48, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=1, device=cpu)):
%/conv/Conv_output_0 : Float(1, 64, 48, 80, strides=[245760, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv/Conv"](%data, %conv.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/resize_2/Conv_output_0 : Float(1, 64, 24, 40, strides=[61440, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[2, 2], onnx_name="/resize_2/Conv"](%/conv/Conv_output_0, %resize_2.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::resize_2 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/resize_4/Conv_output_0 : Float(1, 64, 12, 20, strides=[15360, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[4, 4], onnx_name="/resize_4/Conv"](%/conv/Conv_output_0, %resize_4.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::resize_4 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/conv0/Conv_output_0 : Float(1, 36, 48, 80, strides=[138240, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv0/Conv"](%/conv/Conv_output_0, %conv0.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv0 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_output_0 : Float(1, 48, 80, 36, strides=[138240, 80, 1, 3840], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose"](%/conv0/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:21:0
%/conv1/Conv_output_0 : Float(1, 36, 24, 40, strides=[34560, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv1/Conv"](%/resize_2/Conv_output_0, %conv1.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv1 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_1_output_0 : Float(1, 24, 40, 36, strides=[34560, 40, 1, 960], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_1"](%/conv1/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:22:0
%/conv2/Conv_output_0 : Float(1, 48, 12, 20, strides=[11520, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv2/Conv"](%/resize_4/Conv_output_0, %conv2.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv2 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_2_output_0 : Float(1, 12, 20, 48, strides=[11520, 20, 1, 240], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_2"](%/conv2/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:23:0
%/Constant_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 3 [ CPULongType{3} ], onnx_name="/Constant"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:24:0
%/Reshape_output_0 : Float(1, 46080, 3, strides=[138240, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape"](%/Transpose_output_0, %/Constant_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:24:0
%/Constant_1_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 3 [ CPULongType{3} ], onnx_name="/Constant_1"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:25:0
%/Reshape_1_output_0 : Float(1, 11520, 3, strides=[34560, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_1"](%/Transpose_1_output_0, %/Constant_1_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:25:0
%/Constant_2_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 3 [ CPULongType{3} ], onnx_name="/Constant_2"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:26:0
%/Reshape_2_output_0 : Float(1, 3840, 3, strides=[11520, 3, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_2"](%/Transpose_2_output_0, %/Constant_2_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:26:0
%/Concat_output_0 : Float(1, 61440, 3, strides=[184320, 3, 1], requires_grad=1, device=cpu) = onnx::Concat[axis=1, onnx_name="/Concat"](%/Reshape_output_0, %/Reshape_1_output_0, %/Reshape_2_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:27:0
%/conv00/Conv_output_0 : Float(1, 36, 48, 80, strides=[138240, 3840, 80, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv00/Conv"](%/conv/Conv_output_0, %conv00.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv00 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_3_output_0 : Float(1, 48, 80, 36, strides=[138240, 80, 1, 3840], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_3"](%/conv00/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:29:0
%/conv11/Conv_output_0 : Float(1, 36, 24, 40, strides=[34560, 960, 40, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv11/Conv"](%/resize_2/Conv_output_0, %conv11.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv11 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_4_output_0 : Float(1, 24, 40, 36, strides=[34560, 40, 1, 960], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_4"](%/conv11/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:30:0
%/conv22/Conv_output_0 : Float(1, 48, 12, 20, strides=[11520, 240, 20, 1], requires_grad=0, device=cpu) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1], onnx_name="/conv22/Conv"](%/resize_4/Conv_output_0, %conv22.weight), scope: __main__.M::/torch.nn.modules.conv.Conv2d::conv22 # /media/pc/data/tmp/cache/conda/envs/py312x/lib/python3.12/site-packages/torch/nn/modules/conv.py:456:0
%/Transpose_5_output_0 : Float(1, 12, 20, 48, strides=[11520, 20, 1, 240], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1], onnx_name="/Transpose_5"](%/conv22/Conv_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:31:0
%/Constant_3_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 4 [ CPULongType{3} ], onnx_name="/Constant_3"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:32:0
%/Reshape_3_output_0 : Float(1, 34560, 4, strides=[138240, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_3"](%/Transpose_3_output_0, %/Constant_3_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:32:0
%/Constant_4_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 4 [ CPULongType{3} ], onnx_name="/Constant_4"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:33:0
%/Reshape_4_output_0 : Float(1, 8640, 4, strides=[34560, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_4"](%/Transpose_4_output_0, %/Constant_4_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:33:0
%/Constant_5_output_0 : Long(3, strides=[1], device=cpu) = onnx::Constant[value= 1 -1 4 [ CPULongType{3} ], onnx_name="/Constant_5"](), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:34:0
%/Reshape_5_output_0 : Float(1, 2880, 4, strides=[11520, 4, 1], requires_grad=1, device=cpu) = onnx::Reshape[allowzero=0, onnx_name="/Reshape_5"](%/Transpose_5_output_0, %/Constant_5_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:34:0
%38 : Float(1, 46080, 4, strides=[184320, 4, 1], requires_grad=1, device=cpu) = onnx::Concat[axis=1, onnx_name="/Concat_1"](%/Reshape_3_output_0, %/Reshape_4_output_0, %/Reshape_5_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:35:0
%output : Float(1, 61440, 3, strides=[184320, 3, 1], requires_grad=1, device=cpu) = onnx::Softmax[axis=2, onnx_name="/Softmax"](%/Concat_output_0), scope: __main__.M:: # /tmp/ipykernel_1148509/1329727449.py:36:0
return (%output, %38)
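As an optional sanity check, the exported file can be run through the ONNX checker before importing it (a one-line sketch using the standard onnx.checker API):
onnx.checker.check_model(onnx.load(f"{root_dir}/{output_name}.onnx"))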
Frontend import#
ONNX frontend:
from tvm import relay
onnx_model = onnx.load(f"{root_dir}/{output_name}.onnx")
mod, params = relay.frontend.from_onnx(onnx_model, {config.name: config.shape}, freeze_params=True)
onnx_mod = relay.transform.InferType()(mod)
onnx_mod.show()
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
%0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */;
%1 = nn.conv2d(%0, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */;
%2 = transpose(%1, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 48, 80, 36), float32] span=/Transpose:0:0 */;
%3 = nn.conv2d(%0, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */;
%4 = nn.conv2d(%3, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */;
%5 = transpose(%4, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 24, 40, 36), float32] span=/Transpose_1:0:0 */;
%6 = nn.conv2d(%0, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */;
%7 = nn.conv2d(%6, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */;
%8 = transpose(%7, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 12, 20, 48), float32] span=/Transpose_2:0:0 */;
%9 = reshape(%2, newshape=[1, -1, 3]) /* ty=Tensor[(1, 46080, 3), float32] span=/Reshape:0:0 */;
%10 = reshape(%5, newshape=[1, -1, 3]) /* ty=Tensor[(1, 11520, 3), float32] span=/Reshape_1:0:0 */;
%11 = reshape(%8, newshape=[1, -1, 3]) /* ty=Tensor[(1, 3840, 3), float32] span=/Reshape_2:0:0 */;
%12 = (%9, %10, %11) /* ty=(Tensor[(1, 46080, 3), float32], Tensor[(1, 11520, 3), float32], Tensor[(1, 3840, 3), float32]) span=/Concat:0:0 */;
%13 = concatenate(%12, axis=1) /* ty=Tensor[(1, 61440, 3), float32] span=/Concat:0:0 */;
%14 = nn.conv2d(%0, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */;
%15 = transpose(%14, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 48, 80, 36), float32] span=/Transpose_3:0:0 */;
%16 = nn.conv2d(%3, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */;
%17 = transpose(%16, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 24, 40, 36), float32] span=/Transpose_4:0:0 */;
%18 = nn.conv2d(%6, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */;
%19 = transpose(%18, axes=[0, 2, 3, 1]) /* ty=Tensor[(1, 12, 20, 48), float32] span=/Transpose_5:0:0 */;
%20 = reshape(%15, newshape=[1, -1, 4]) /* ty=Tensor[(1, 34560, 4), float32] span=/Reshape_3:0:0 */;
%21 = reshape(%17, newshape=[1, -1, 4]) /* ty=Tensor[(1, 8640, 4), float32] span=/Reshape_4:0:0 */;
%22 = reshape(%19, newshape=[1, -1, 4]) /* ty=Tensor[(1, 2880, 4), float32] span=/Reshape_5:0:0 */;
%23 = (%20, %21, %22) /* ty=(Tensor[(1, 34560, 4), float32], Tensor[(1, 8640, 4), float32], Tensor[(1, 2880, 4), float32]) span=/Concat_1:0:0 */;
%24 = nn.softmax(%13, axis=2) /* ty=Tensor[(1, 61440, 3), float32] span=/Softmax:0:0 */;
%25 = concatenate(%23, axis=1) /* ty=Tensor[(1, 46080, 4), float32] span=/Concat_1:0:0 */;
(%24, %25) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}
Verify numerical consistency:
import onnxruntime
ort_session = onnxruntime.InferenceSession(f"{root_dir}/{output_name}.onnx", providers=["CPUExecutionProvider"])
# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: data_np}
ort_outputs = ort_session.run(None, ort_inputs)
with torch.no_grad():
torch_outputs = model(torch.from_numpy(data_np))
torch_outputs = [o.numpy() for o in torch_outputs]
[
np.testing.assert_allclose(ort_output, torch_output, rtol=1e-07, atol=1e-5)
for ort_output, torch_output in zip(ort_outputs, torch_outputs)
];
import logging
import textwrap
import numpy as np
from tvm.relay.testing import run_infer_type
from tvm.relay.dataflow_pattern import (
wildcard, is_op, is_tuple,
is_constant, is_tuple_get_item,
DFPatternCallback,
rewrite
)
import tvm
from tvm.relay import transform as _transform
from special_op import det_class_predict, det_bbox_predict
class TransposeReshapeConcatRewrite(DFPatternCallback):
"""融合 Transpose+Reshape+Concat+Softmax => det_class_predict
"""
def __init__(self):
super().__init__()
axes = (0, 2, 3, 1)
# newshape = (1, -1, group)
self.x0 = wildcard()
self.x1 = wildcard()
self.x2 = wildcard()
self.transpose0 = is_op("transpose")(self.x0).has_attr({"axes": axes})
self.reshape0 = is_op("reshape")(self.transpose0) #.has_attr({"newshape": newshape})
self.transpose1 = is_op("transpose")(self.x1).has_attr({"axes": axes})
self.reshape1 = is_op("reshape")(self.transpose1) #.has_attr({"newshape": newshape})
self.transpose2 = is_op("transpose")(self.x2).has_attr({"axes": axes})
self.reshape2 = is_op("reshape")(self.transpose2) #.has_attr({"newshape": newshape})
self.tuple_op = is_tuple((self.reshape0, self.reshape1, self.reshape2))
self.cat = is_op("concatenate")(self.tuple_op).has_attr({"axis": 1})
self.softmax = is_op("nn.softmax")(self.cat)
self.output = self.softmax | self.cat
self.pattern = self.output
def callback(self, pre, post, node_map):
x0 = node_map[self.x0][0]
x1 = node_map[self.x1][0]
x2 = node_map[self.x2][0]
reshape2 = node_map[self.reshape2][0]
shape = _transform.InferTypeLocal(reshape2).shape
softmax = node_map.get(self.softmax, [])
if softmax:
return det_class_predict(x0, x1, x2, class_num=int(shape[-1]))
else:
return det_bbox_predict(x0, x1, x2)
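Before rewriting, the pattern can be matched directly against the two fields of the module's output tuple to confirm it anchors on both heads (a quick sketch; fields[0] is the softmax/class branch, fields[1] the bbox concat):
pat = TransposeReshapeConcatRewrite().pattern
print(pat.match(onnx_mod["main"].body.fields[0]))  # class branch (with softmax), expect True
print(pat.match(onnx_mod["main"].body.fields[1]))  # bbox branch (concat only), expect True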
from copy import deepcopy
origin_mod = deepcopy(mod)
mod["main"] = rewrite(TransposeReshapeConcatRewrite(), mod["main"])
mod.show()
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
%0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */;
%1 = nn.conv2d(%0, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */;
%2 = nn.conv2d(%0, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */;
%3 = nn.conv2d(%0, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */;
%4 = nn.conv2d(%1, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */;
%5 = nn.conv2d(%2, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */;
%6 = nn.conv2d(%0, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */;
%7 = nn.conv2d(%1, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */;
%8 = nn.conv2d(%2, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */;
%9 = llvm_special.det_class_predict(%3, %4, %5, __dict__={"class_num"=3});
%10 = llvm_special.det_bbox_predict(%6, %7, %8);
(%9, %10) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}
target = "llvm"
dev = tvm.device(target, 0)
with tvm.transform.PassContext(opt_level=3):
lib = relay.build(mod, target)
func = lib[lib.libmod_name]
module = tvm.contrib.graph_executor.GraphModule(func(dev))
module.run(**{config.name: data_np})
outputs = [module.get_output(k).numpy() for k in range(2)]
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
with tvm.transform.PassContext(opt_level=3):
lib = relay.build(origin_mod, target)
func = lib[lib.libmod_name]
module = tvm.contrib.graph_executor.GraphModule(func(dev))
module.run(**{config.name: data_np})
origin_outputs = [module.get_output(k).numpy() for k in range(2)]
with torch.no_grad():
torch_outputs = [o.numpy() for o in model(torch.from_numpy(data_np))]
[
    np.testing.assert_allclose(torch_output, origin_output, rtol=1e-07, atol=1e-05)
    for torch_output, origin_output in zip(torch_outputs, origin_outputs)
];
[
np.testing.assert_allclose(torch_output, output, rtol=1e-07, atol=1e-05)
for torch_output, output in zip(torch_outputs, outputs)
];
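For a rough latency comparison between the rewritten module (with the fused custom ops) and the original one, the graph executor's time_evaluator can be used (a sketch; nothing here is tuned and absolute numbers depend on the host):
def bench(m):
    with tvm.transform.PassContext(opt_level=3):
        built = relay.build(m, target)
    gm = tvm.contrib.graph_executor.GraphModule(built[built.libmod_name](dev))
    gm.set_input(**{config.name: data_np})
    return gm.module.time_evaluator("run", dev, number=10)().mean

print("rewritten:", bench(mod))
print("original :", bench(origin_mod))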
from tvm.relay.op import op as _op
from special_op.utils import schedule_special_op
# from tvm import te, topi
# from tvm.relay.op import strategy as _strategy
# from tvm.relay.op.op import OpPattern, OpStrategy
# def det_bbox_predict_compute(attrs, inputs, out_type):
# """det_bbox_predict Relay 计算"""
# assert len(inputs) == 3, "输入参数数量不为 3"
# x0, x1, x2 = inputs
# bbox_size = 4
# x0 = topi.transpose(x0, [0, 2, 3, 1])
# x0 = topi.reshape(x0, [1, x0.shape[1]*x0.shape[2]*x0.shape[3]//bbox_size, bbox_size])
# x1 = topi.transpose(x1, [0, 2, 3, 1])
# x1 = topi.reshape(x1, [1, x1.shape[1]*x1.shape[2]*x1.shape[3]//bbox_size, bbox_size])
# x2 = topi.transpose(x2, [0, 2, 3, 1])
# x2 = topi.reshape(x2, [1, x2.shape[1]*x2.shape[2]*x2.shape[3]//bbox_size, bbox_size])
# x = topi.concatenate([x0, x1, x2], axis=1)
# print(x)
# return [x]
# op_name = "vta_special.det_bbox_predict"
# def det_bbox_predict_strategy_vta(attrs, inputs, out_type, target):
# strategy = OpStrategy()
# strategy.add_implementation(
# det_bbox_predict_compute,
# schedule_special_op,
# name=f"{op_name}.llvm",
# )
# return strategy
# _op.get(op_name).get_attr("FTVMStrategy").register(det_bbox_predict_strategy_vta, "llvm", allow_override=True)
with tvm.transform.PassContext(opt_level=3):
lib = relay.build(mod, "llvm")
# func = lib[lib.libmod_name]
# module = tvm.contrib.graph_executor.GraphModule(func(tvm.ext_dev(0)))
# module.run(**{config.name: data_np})
# outputs = [module.get_output(k).numpy() for k in range(2)]
from tvm.relay.dataflow_pattern import wildcard, is_constant, is_op
# import logging
def is_QPartitionExpr(op):
r = is_op("annotation.cast_hint")(op)
r = is_op("annotation.stop_fusion")(r)
return r
def debug_partition(func):
    """Wrap a pattern factory so its result may optionally absorb an `annotation.cast_hint` on top."""
    def wrapper():
        # logging.debug(f"enter {func.__name__}()")
        r = func()
        r = is_op("annotation.cast_hint")(r) | r
        # r = is_op("annotation.stop_fusion")(r) | r
        return r
    return wrapper
@debug_partition
def make_conv_add_squeeze_pattern():
x = wildcard()
w = wildcard()
bias = wildcard()
x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant())
x_ = x_ | x
w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant())
w_ = w_ | w
bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
conv_node = is_op("nn.conv2d")(x_, w_)
r = is_op("add")(conv_node, bias_)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
r = is_op("squeeze")(r)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_conv_add_relu_max_pool2d_pattern():
x = wildcard()
w = wildcard()
bias = wildcard()
# x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
# w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
conv_node = is_op("nn.conv2d")(x, w)
r = is_op("add")(conv_node, bias_) | conv_node
r = is_op("nn.relu")(r) | is_op("nn.prelu")(r, wildcard())
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
xx = is_op("annotation.cast_hint")(r)
xx = is_op("annotation.stop_fusion")(xx)
# xx = is_op("nn.max_pool2d")(xx)
r = is_op("nn.max_pool2d")(xx|r).has_attr({
"padding": [0, 0, 0, 0],
# "ceil_mode": False,
})
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_conv_add_activate_pattern():
r"""Create a pattern to match the following graph.
conv2d
|
add
|
(relu|relu6|prelu|sigmoid|relux)
"""
x = wildcard()
w = wildcard()
bias = wildcard()
alpha = wildcard()
# x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
# w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
# alpha_ = is_op("relay.op.annotation.simulated_quantize")(alpha, is_constant(), is_constant(), is_constant()) | alpha
conv_node = is_op("nn.conv2d")(x, w)
r = is_op("add")(conv_node, bias_) | is_op("nn.bias_add")(conv_node, bias_) | conv_node
    # activation functions
r1 = r.optional(lambda x: is_op("nn.relu")(x))
r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
r3 = r.optional(lambda x: is_op("nn.prelu")(x, alpha)) # prelu
r4 = r.optional(lambda x: is_op("sigmoid")(x)) # sigmoid
# r5 = r.optional(lambda x: is_op("nn.relux")(x, alpha)) # relux
# r6 = r.optional(lambda x: is_op("silu")(x)) # silu
# r7 = r.optional(lambda x: is_op("hard_sigmoid")(x)) # hard_sigmoid
# r8 = r.optional(lambda x: is_op("hard_swish")(x)) # hard_swish
r = r1 | r2 | r3 | r4
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_conv2d_transpose_add_activate_pattern():
r"""Create a pattern to match the following graph.
conv2d_transpose
|
add
"""
x = wildcard()
w = wildcard()
bias = wildcard()
alpha = wildcard()
alpha_ = is_op("relay.op.annotation.simulated_quantize")(alpha, is_constant(), is_constant(), is_constant()) | alpha
# x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
# w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
bias_ = is_op("relay.op.annotation.simulated_quantize")(bias, is_constant(), is_constant(), is_constant()) | bias
r = is_op("nn.conv2d_transpose")(x, w)
r = is_op("add")(r, bias_) | r
    # activation functions
r1 = r.optional(lambda x: is_op("nn.relu")(x))
r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
r3 = r.optional(lambda x: is_op("nn.prelu")(x, alpha)) # prelu
r4 = r.optional(lambda x: is_op("sigmoid")(x)) # sigmoid
r = r1 | r2 | r3 | r4
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
# r = is_QPartitionExpr(r) | r
# r = is_op("strided_slice")(r)
return r
@debug_partition
def make_max_pool2d_pattern():
x = wildcard()
r = is_op("nn.max_pool2d")(x)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_strided_slice_pattern():
x = wildcard()
r = is_op("strided_slice")(x)
return r
@debug_partition
def make_concat_pattern():
x = wildcard()
r = is_op("concatenate")(x)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_avg_pool2d_pattern():
x = wildcard()
r = is_op("nn.avg_pool2d")(x)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_adaptive_avg_pool2d_pattern():
x = wildcard()
r = is_op("nn.adaptive_avg_pool2d")(x)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_dense_add_pattern():
r"""Create a pattern to match the following graph.
nn.dense
|
add
"""
x = wildcard()
y = wildcard()
w = wildcard()
# x_ = is_op("relay.op.annotation.simulated_quantize")(x, is_constant(), is_constant(), is_constant()) | x
# y_ = is_op("relay.op.annotation.simulated_quantize")(y, is_constant(), is_constant(), is_constant()) | y
w_ = is_op("relay.op.annotation.simulated_quantize")(w, is_constant(), is_constant(), is_constant()) | w
node = is_op("nn.dense")(x, y)
r = is_op("add")(node, w_) | node
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_add_add_pattern():
"""
z = x + y
z2 = z + c1
    where c1 is a constant
"""
c1 = is_constant()
c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1
r = wildcard() + wildcard()
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
r = r + c1_
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_multiply_add_pattern():
"""
z = x * y
z2 = z + c1
"""
c1 = is_constant()
c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1
r = wildcard() * wildcard()
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
r = r + c1_
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_add_multiply_add_pattern():
"""
z = x + y
z2 = z * c1 + c2
    where c1 and c2 are constants
"""
c1 = is_constant()
c2 = is_constant()
c1_ = is_op("relay.op.annotation.simulated_quantize")(c1, is_constant(), is_constant(), is_constant()) | c1
c2_ = is_op("relay.op.annotation.simulated_quantize")(c2, is_constant(), is_constant(), is_constant()) | c2
r = wildcard() + wildcard()
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
r = r * c1_
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
r = r + c2_
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_add_pattern():
r"""Create a pattern to match the following graph.
add
|
relu|relu6
"""
r = wildcard() + wildcard()
r1 = r.optional(lambda x: is_op("nn.relu")(x))
r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
r = r1 | r2
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
@debug_partition
def make_multiply_pattern():
r"""Create a pattern to match the following graph.
multiply
|
relu|relu6
"""
r = wildcard() * wildcard()
r1 = r.optional(lambda x: is_op("nn.relu")(x))
r2 = r.optional(lambda x: is_op("clip")(x)) # relu6
r = r1 | r2
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
def make_reshape_squeeze_pattern():
x = wildcard()
r = is_op("reshape")(x)
r = is_op("squeeze")(r)
r = is_op("relay.op.annotation.simulated_quantize")(r, is_constant(), is_constant(), is_constant()) | r
return r
# configure the fusion (composite) rules
compiler_name = "vta_special"
pattern_table = [
# (f"{compiler_name}.concat", make_concat_pattern()),
(f"{compiler_name}.conv_add_squeeze", make_conv_add_squeeze_pattern()), # mobilenet_v2_tf 最后一层
(f"{compiler_name}.conv_add_relu_max_pool2d", make_conv_add_relu_max_pool2d_pattern()),
(f"{compiler_name}.conv2d_transpose_add_activate", make_conv2d_transpose_add_activate_pattern()),
(f"{compiler_name}.conv_add_activate", make_conv_add_activate_pattern()),
(f"{compiler_name}.max_pool2d", make_max_pool2d_pattern()),
(f"{compiler_name}.dense_add", make_dense_add_pattern()),
(f"{compiler_name}.adaptive_avg_pool2d", make_adaptive_avg_pool2d_pattern()),
(f"{compiler_name}.avg_pool2dd", make_avg_pool2d_pattern()),
(f"{compiler_name}.add_multiply_add", make_add_multiply_add_pattern()), # kr_karen
(f"{compiler_name}.add_add", make_add_add_pattern()),
(f"{compiler_name}.multiply_add", make_multiply_add_pattern()),
(f"{compiler_name}.add", make_add_pattern()),
(f"{compiler_name}.multiply", make_multiply_pattern()),
# ("det_class_predict_temp", make_det_class_predict_temp_pattern()),
# ("det_bbox_predict_temp",make_det_bbox_predict_temp_pattern())
# (f"{compiler_name}.strided_slice", make_strided_slice_pattern()),
]
merge_passes = tvm.transform.Sequential([
relay.transform.InferType(),
relay.transform.MergeComposite(pattern_table),
])
run_mod = deepcopy(mod)
with tvm.transform.PassContext(opt_level=3):
partition_mod_t = merge_passes(run_mod)
print(partition_mod_t)
def @main(%data: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] span=/conv/Conv.data:0:0 */) -> (Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) {
%0 = fn (%FunctionVar_8_0: Tensor[(1, 3, 48, 80), float32] /* ty=Tensor[(1, 3, 48, 80), float32] */, %FunctionVar_8_1: Tensor[(64, 3, 1, 1), float32] /* ty=Tensor[(64, 3, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 48, 80), float32] {
nn.conv2d(%FunctionVar_8_0, %FunctionVar_8_1, padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 48, 80), float32] span=/conv/Conv:0:0 */
} /* ty=fn (Tensor[(1, 3, 48, 80), float32], Tensor[(64, 3, 1, 1), float32]) -> Tensor[(1, 64, 48, 80), float32] */;
%1 = %0(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 1, 1), float32] span=/conv/Conv.conv.weight:0:0 */) /* ty=Tensor[(1, 64, 48, 80), float32] */;
%2 = fn (%FunctionVar_7_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_7_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 48, 80), float32] {
nn.conv2d(%FunctionVar_7_0, %FunctionVar_7_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv0/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 48, 80), float32] */;
%3 = fn (%FunctionVar_6_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_6_1: Tensor[(64, 64, 1, 1), float32] /* ty=Tensor[(64, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 24, 40), float32] {
nn.conv2d(%FunctionVar_6_0, %FunctionVar_6_1, strides=[2, 2], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 24, 40), float32] span=/resize_2/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(64, 64, 1, 1), float32]) -> Tensor[(1, 64, 24, 40), float32] */;
%4 = %3(%1, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_2/Conv.resize_2.weight:0:0 */) /* ty=Tensor[(1, 64, 24, 40), float32] */;
%5 = fn (%FunctionVar_5_0: Tensor[(1, 64, 24, 40), float32] /* ty=Tensor[(1, 64, 24, 40), float32] */, %FunctionVar_5_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 24, 40), float32] {
nn.conv2d(%FunctionVar_5_0, %FunctionVar_5_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv1/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 24, 40), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 24, 40), float32] */;
%6 = fn (%FunctionVar_4_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_4_1: Tensor[(64, 64, 1, 1), float32] /* ty=Tensor[(64, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 64, 12, 20), float32] {
nn.conv2d(%FunctionVar_4_0, %FunctionVar_4_1, strides=[4, 4], padding=[0, 0, 0, 0], channels=64, kernel_size=[1, 1]) /* ty=Tensor[(1, 64, 12, 20), float32] span=/resize_4/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(64, 64, 1, 1), float32]) -> Tensor[(1, 64, 12, 20), float32] */;
%7 = %6(%1, meta[relay.Constant][4] /* ty=Tensor[(64, 64, 1, 1), float32] span=/resize_4/Conv.resize_4.weight:0:0 */) /* ty=Tensor[(1, 64, 12, 20), float32] */;
%8 = fn (%FunctionVar_3_0: Tensor[(1, 64, 12, 20), float32] /* ty=Tensor[(1, 64, 12, 20), float32] */, %FunctionVar_3_1: Tensor[(48, 64, 1, 1), float32] /* ty=Tensor[(48, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 48, 12, 20), float32] {
nn.conv2d(%FunctionVar_3_0, %FunctionVar_3_1, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv2/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 12, 20), float32], Tensor[(48, 64, 1, 1), float32]) -> Tensor[(1, 48, 12, 20), float32] */;
%9 = %2(%1, meta[relay.Constant][1] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv0/Conv.conv0.weight:0:0 */) /* ty=Tensor[(1, 36, 48, 80), float32] */;
%10 = %5(%4, meta[relay.Constant][3] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv1/Conv.conv1.weight:0:0 */) /* ty=Tensor[(1, 36, 24, 40), float32] */;
%11 = %8(%7, meta[relay.Constant][5] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv2/Conv.conv2.weight:0:0 */) /* ty=Tensor[(1, 48, 12, 20), float32] */;
%12 = fn (%FunctionVar_2_0: Tensor[(1, 64, 48, 80), float32] /* ty=Tensor[(1, 64, 48, 80), float32] */, %FunctionVar_2_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 48, 80), float32] {
nn.conv2d(%FunctionVar_2_0, %FunctionVar_2_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 48, 80), float32] span=/conv00/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 48, 80), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 48, 80), float32] */;
%13 = fn (%FunctionVar_1_0: Tensor[(1, 64, 24, 40), float32] /* ty=Tensor[(1, 64, 24, 40), float32] */, %FunctionVar_1_1: Tensor[(36, 64, 1, 1), float32] /* ty=Tensor[(36, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 36, 24, 40), float32] {
nn.conv2d(%FunctionVar_1_0, %FunctionVar_1_1, padding=[0, 0, 0, 0], channels=36, kernel_size=[1, 1]) /* ty=Tensor[(1, 36, 24, 40), float32] span=/conv11/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 24, 40), float32], Tensor[(36, 64, 1, 1), float32]) -> Tensor[(1, 36, 24, 40), float32] */;
%14 = fn (%FunctionVar_0_0: Tensor[(1, 64, 12, 20), float32] /* ty=Tensor[(1, 64, 12, 20), float32] */, %FunctionVar_0_1: Tensor[(48, 64, 1, 1), float32] /* ty=Tensor[(48, 64, 1, 1), float32] */, PartitionedFromPattern="nn.conv2d_", Composite="vta_special.conv_add_activate") -> Tensor[(1, 48, 12, 20), float32] {
nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[0, 0, 0, 0], channels=48, kernel_size=[1, 1]) /* ty=Tensor[(1, 48, 12, 20), float32] span=/conv22/Conv:0:0 */
} /* ty=fn (Tensor[(1, 64, 12, 20), float32], Tensor[(48, 64, 1, 1), float32]) -> Tensor[(1, 48, 12, 20), float32] */;
%15 = %12(%1, meta[relay.Constant][6] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv00/Conv.conv00.weight:0:0 */) /* ty=Tensor[(1, 36, 48, 80), float32] */;
%16 = %13(%4, meta[relay.Constant][7] /* ty=Tensor[(36, 64, 1, 1), float32] span=/conv11/Conv.conv11.weight:0:0 */) /* ty=Tensor[(1, 36, 24, 40), float32] */;
%17 = %14(%7, meta[relay.Constant][8] /* ty=Tensor[(48, 64, 1, 1), float32] span=/conv22/Conv.conv22.weight:0:0 */) /* ty=Tensor[(1, 48, 12, 20), float32] */;
%18 = llvm_special.det_class_predict(%9, %10, %11, __dict__={"class_num"=3}) /* ty=Tensor[(1, 61440, 3), float32] */;
%19 = llvm_special.det_bbox_predict(%15, %16, %17) /* ty=Tensor[(1, 46080, 4), float32] */;
(%18, %19) /* ty=(Tensor[(1, 61440, 3), float32], Tensor[(1, 46080, 4), float32]) */
}
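MergeComposite only wraps the matched subgraphs into composite functions. To actually hand them to an external codegen, it is normally followed by AnnotateTarget, MergeCompilerRegions and PartitionGraph (a sketch; it assumes a BYOC codegen registered under the vta_special name, which this notebook does not provide):
partition_passes = tvm.transform.Sequential([
    relay.transform.InferType(),
    relay.transform.MergeComposite(pattern_table),
    relay.transform.AnnotateTarget(compiler_name),  # mark regions the external codegen supports
    relay.transform.MergeCompilerRegions(),         # merge adjacent annotated regions
    relay.transform.PartitionGraph(),               # lift each region into an external function
])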