在 VTA 上从 MxNet 部署预训练的视觉模型#

Author: Thierry Moreau

本教程提供了端到端的 demo,介绍了如何在 VTA 加速器设计上运行 ImageNet 分类推理来执行 ImageNet 分类任务。它将 Relay 展示为前端编译器,它可以执行量化(VTA 只支持 int8/32 推理)和 graph packing(以便在 core 中支持张量化),从而为硬件目标处理计算图。

安装依赖#

要在 tvm 中使用 autovm 包,需要安装一些额外的依赖项。(如果你使用 python2,将 “3” 改为 “2”):

pip3 install --user mxnet requests "Pillow<7"

现在回到 python 代码。导入包。

import os, time
from PIL import Image

from mxnet.gluon.model_zoo import vision
import numpy as np
from matplotlib import pyplot as plt

import tvm
import logging
from tvm.ir.transform import PassContext
from tvm import rpc, autotvm, relay
from tvm.contrib import graph_executor, utils, download
# from tvm.contrib.debugger import debug_executor
# from tvm.relay import transform

import vta
from vta.testing import simulator
from vta.top import graph_pack

# Make sure that TVM was compiled with RPC=1
assert tvm.runtime.enabled("rpc")

定义 platform#

在 CPU 和 VTA 上执行,并定义模型。

3rdparty/vta-hw/config/vta_config.json 文件加载 VTA 参数:

env = vta.get_env()

设定设备:

  1. 在 CPU 上推理,使用 device=arm_cpu

  2. 在 FPGA 上推理,使用 device=vta

ctx = "vta"

用于查找何时 start/end bit packing 的字典:

pack_dict = {
    "resnet18_v1": ["nn.max_pool2d", "nn.global_avg_pool2d"],
    "resnet34_v1": ["nn.max_pool2d", "nn.global_avg_pool2d"],
    "resnet18_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"],
    "resnet34_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"],
    "resnet50_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"],
    "resnet101_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"],
}

备注

start_packstop_pack 标签指示从哪里开始和结束 graph packing relay pass:换句话说,从哪里开始和结束 VTA 卸载。

设定运行目标设备:

target = env.target if ctx == "vta" else env.target_vta_cpu

获取远程执行#

env.TARGET'pynq' 时,重新配置 FPGA 和 runtime。否则,如果 env.TARGET'sim',则在本地执行。

if env.TARGET not in ["sim", "tsim", "intelfocl"]:
    # 如果设置环境变量,从 tracker 节点获取 remote。
    # 要设置 tracker,您需要遵循“自动调优卷积网络用于 VTA ”教程。
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    # 否则,如果你有设备,你想直接从 host 编程,
    # 确保你已经设置了下面的变量为你的板的 IP。
    device_host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
    device_port = os.environ.get("VTA_RPC_PORT", "9091")
    if not tracker_host or not tracker_port:
        remote = rpc.connect(device_host, int(device_port))
    else:
        remote = autotvm.measure.request_remote(
            env.TARGET, 
            tracker_host, 
            int(tracker_port), 
            timeout=10000
        )

    # 重新配置 JIT 运行时和 FPGA。
    # 通过将路径传递给 bitstream 文件而不是 None,
    # 您可以使用自己的自定义 bitstream 编程 FPGA。
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print(f"Reconfigured FPGA and RPC runtime in {reconfig_time:.2f}s!")
# 在仿真模式中,在本地托管 RPC 服务器。
else:
    remote = rpc.LocalSession()
    if env.TARGET in ["intelfocl"]:
        # program intelfocl aocx
        vta.program_fpga(remote, bitstream="vta.bitstream")

从远程获取执行上下文:

if env.TARGET == "intelfocl":
    ctxes = [remote.ext_dev(0), remote.cpu(0)]
else:
    # Graph runtime
    ctxes = remote.ext_dev(0) if ctx == "vta" else remote.cpu(0)

构建 graph executor 推理#

从 Gluon 模型动物园抓取视觉模型,用 Relay 编译。编译步骤如下:

  1. 将 MXNet 前端模块翻译为 Relay 模块。

  2. 应用 8-bit 量化:这里跳过了第一个 conv 层和 dense 层,这两个层都将在 CPU 上的 fp32 中执行。

  3. 执行 graph packing 来改变张量化的数据布局。

  4. 进行常数折叠以减少算子的数量(例如,消除 batch norm multiply)。

  5. 执行对 object 文件的 relay 构建。

  6. 将 object 文件加载到远程(FPGA 设备)。

加载预配置的 AutoTVM 调度:

# 需要编译的模型名称
model = "resnet18_v1"
assert model in pack_dict
# 为 ImageNet 分类器输入填充 shape 和数据类型字典
dtype_dict = {"data": "float32"}
shape_dict = {"data": (env.BATCH, 3, 224, 224)}

# 取下 gluon 模型,转换成 relay
gluon_model = vision.get_model(model, pretrained=True)
with autotvm.tophub.context(target):
    # 度量构建的开始时间
    build_start = time.time()
    # 开始前端编译
    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
    # 更新 shape 和 type 字典
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
    if target.device_name == "vta":
        # 在 Relay 中执行量化
        # 注意:为了 fold batch norm,将 `opt_level` 设置为 `3`
        with PassContext(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0,
                                        skip_conv_layers=[]):
                mod = relay.quantize.quantize(mod, params=params)
            # 对 VTA target 进行 graph packing 和 constant folding
            assert env.BLOCK_IN == env.BLOCK_OUT
            # 如果目标是 intelfocl 或 sim,是否有 device annotation
            relay_prog = graph_pack(
                mod["main"],
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name="nn.conv2d", #pack_dict[model][0],
                stop_name=pack_dict[model][1],
                device_annot=(env.TARGET == "intelfocl"),
            )
    else:
        relay_prog = mod["main"]

    # 禁用 AlterOpLayout,编译 Relay 程序
    if target.device_name != "vta":
        with PassContext(opt_level=3,
                         disabled_pass={"AlterOpLayout"}):
            lib = relay.build(
                relay_prog,
                target=target,
                params=params
            )
    else:
        if env.TARGET == "intelfocl":
            # 在 CPU 和 VTA 上运行多个目标
            target = {"cpu": env.target_vta_cpu,
                      "ext_dev": target}
        with vta.build_config(
            opt_level=3,
            disabled_pass={"AlterOpLayout",
                           "tir.CommonSubexprElimTIR"}
        ):
            lib = relay.build(relay_prog,
                              target=target,
                              params=params)
    # 度量 Relay 构建时间
    build_time = time.time() - build_start
    logging.info(f"{model} inference graph built in {build_time:.2f}s!")

    # 将 inference library 发送到远程 RPC 服务器
    temp = utils.tempdir()
    lib.export_library(temp.relpath("graphlib.tar"))
    remote.upload(temp.relpath("graphlib.tar"))
    loaded_lib = remote.load_module("graphlib.tar")
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
[18:35:11] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:12] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:12] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:13] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:13] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:14] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:14] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:15] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:15] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:16] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:16] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:17] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:18] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:18] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:19] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
[18:35:19] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement  required_alignment=256, provided_alignment=64
2023-09-25 18:35:27.285 INFO load_module /tmp/tmp1nbmfxtn/graphlib.tar

进行图像分类推理#

只需要下载 category 文件,synset.txt 和输入测试图像。

# 下载 ImageNet categories
categ_url = "https://github.com/uwsampl/web-data/raw/main/vta/models"
categ_fn = "synset.txt"
download.download(f"{categ_url}/{categ_fn}", categ_fn)
synset = eval(open(categ_fn).read())
# 下载测试图片
image_url = "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg"
image_fn = "cat.png"
download.download(image_url, image_fn)
# 准备用于推理的测试图像
image = Image.open(image_fn).resize((224, 224))
plt.imshow(image)
plt.show()
image = np.array(image) - np.array([123.0, 117.0, 104.0])
image /= np.array([58.395, 57.12, 57.375])
image = image.transpose((2, 0, 1))
image = image[np.newaxis, :]
image = np.repeat(image, env.BATCH, axis=0)

# 生成图执行器(graph executor) `m`。
m = graph_executor.GraphModule(loaded_lib["default"](ctxes))
# 设置网络参数和输入
m.set_input(**params)
m.set_input("data", image)
../../../../../_images/bbf2eae8a9bdea262b6b8ad3a96e193bb88c83ee195b4457f757d5c9bd3af4a5.png

执行推理并收集执行统计信息#

小技巧

更多内容参考 。

num = 4  # 为单个度量运行模块的次数
rep = 3  # 测量的数量(由此得出 std dev)
timer = m.module.time_evaluator("run",
                                ctxes,
                                number=num,
                                repeat=rep)

if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()
    timer()
    sim_stats = simulator.stats()
    print("\nExecution statistics:")
    for k, v in sim_stats.items():
        # 由于多次执行 workload,需要 normalize 统计数据。
        # 注意,总是有一次 warm up 运行
        # 因此,将整体统计数据除以 (num * rep + 1)
        print(f"\t{k:<16}: {v // (num * rep + 1):>16}")
else:
    tcost = timer()
    std = np.std(tcost.results) * 1000
    mean = tcost.mean * 1000
    print(f"\nPerformed inference in {mean:.2f}ms (std = {std:.2f}) for {env.BATCH} samples")
    print(f"Average per sample inference time: {mean / env.BATCH:.2f}ms")

# 得到的分类结果
tvm_output = m.get_output(0, tvm.nd.empty(
    (env.BATCH, 1000), "float32", remote.cpu(0)))
for b in range(env.BATCH):
    top_categories = np.argsort(tvm_output.numpy()[b])
    # 报告 top-5 分类结果
    print(f"\n{model} prediction for sample {b}")
    print("\t#1:", synset[top_categories[-1]])
    print("\t#2:", synset[top_categories[-2]])
    print("\t#3:", synset[top_categories[-3]])
    print("\t#4:", synset[top_categories[-4]])
    print("\t#5:", synset[top_categories[-5]])
    # 这只是检查 5 个顶级类别之一是一种猫;
    # 这绝不是对量化如何影响分类 accuracy 的准确评估,
    # 而是旨在捕捉在 CI 中会影响 accuracy 的量化传递的变化。
    cat_detected = False
    for k in top_categories[-5:]:
        if "cat" in synset[k]:
            cat_detected = True
    assert cat_detected
Execution statistics:
	inp_load_nbytes :          5549568
	wgt_load_nbytes :         12763136
	acc_load_nbytes :          6051840
	uop_load_nbytes :            22872
	out_store_nbytes:          2433536
	gemm_counter    :          6623232
	alu_counter     :           702464

resnet18_v1 prediction for sample 0
	#1: tiger cat
	#2: Egyptian cat
	#3: tabby, tabby cat
	#4: lynx, catamount
	#5: weasel
[18:35:30] /media/pc/data/lxw/ai/tvm/src/runtime/profiling.cc:101: Warning: No timer implementation for ext_dev, using default timer instead. It may be inaccurate or have extra overhead.
lib.ir_mod.show()
def @main(%data: Tensor[(1, 3, 224, 224), float32] /* ty=Tensor[(1, 3, 224, 224), float32] */) -> Tensor[(1, 1000), float32] {
  %0 = multiply(%data, 16f /* ty=float32 */) /* ty=Tensor[(1, 3, 224, 224), float32] */;
  %1 = round(%0) /* ty=Tensor[(1, 3, 224, 224), float32] */;
  %2 = clip(%1, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 3, 224, 224), float32] */;
  %3 = cast(%2, dtype="int8") /* ty=Tensor[(1, 3, 224, 224), int8] */;
  %4 = nn.conv2d(%3, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 7, 7), int8] */, strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7], out_dtype="int32") /* ty=Tensor[(1, 64, 112, 112), int32] */;
  %5 = reshape(%4, newshape=[1, 1, 4, 16, 112, 112]) /* ty=Tensor[(1, 1, 4, 16, 112, 112), int32] */;
  %6 = reshape(meta[relay.Constant][1] /* ty=Tensor[(64, 1, 1), int32] */, newshape=[4, 16, 1, 1, 1]) /* ty=Tensor[(4, 16, 1, 1, 1), int32] */;
  %7 = transpose(%6, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %8 = transpose(%5, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %9 = broadcast_to(%7, shape=[4, 1, 1, 1, 16]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %10 = add(%8, %9) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %11 = nn.relu(%10) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %12 = add(%11, 256 /* ty=int32 */) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %13 = right_shift(%12, 9 /* ty=int32 */) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %14 = clip(%13, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 112, 112, 1, 16), int32] */;
  %15 = cast(%14, dtype="int8") /* ty=Tensor[(1, 4, 112, 112, 1, 16), int8] */;
  %16 = nn.max_pool2d(%15, pool_size=[3, 3], strides=[2, 2], padding=[1, 1, 1, 1]) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %17 = clip(%16, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %18 = cast(%17, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %19 = annotation.stop_fusion(%18) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %20 = cast(%17, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %21 = reshape(meta[relay.Constant][2] /* ty=Tensor[(64, 64, 3, 3), int8] */, newshape=[4, 16, 4, 16, 3, 3]) /* ty=Tensor[(4, 16, 4, 16, 3, 3), int8] */;
  %22 = annotation.stop_fusion(%20) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %23 = transpose(%21, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(4, 4, 3, 3, 16, 16), int8] */;
  %24 = reshape(meta[relay.Constant][3] /* ty=Tensor[(64, 1, 1), int32] */, newshape=[4, 16, 1, 1, 1]) /* ty=Tensor[(4, 16, 1, 1, 1), int32] */;
  %25 = transpose(%24, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %26 = nn.conv2d(%22, %23, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %27 = broadcast_to(%25, shape=[4, 1, 1, 1, 16]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %28 = add(%26, %27) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %29 = nn.relu(%28) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %30 = add(%29, 64 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %31 = right_shift(%30, 7 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %32 = clip(%31, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %33 = cast(%32, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %34 = reshape(meta[relay.Constant][4] /* ty=Tensor[(64, 64, 3, 3), int8] */, newshape=[4, 16, 4, 16, 3, 3]) /* ty=Tensor[(4, 16, 4, 16, 3, 3), int8] */;
  %35 = annotation.stop_fusion(%33) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %36 = transpose(%34, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(4, 4, 3, 3, 16, 16), int8] */;
  %37 = reshape(meta[relay.Constant][5] /* ty=Tensor[(64, 1, 1), int32] */, newshape=[4, 16, 1, 1, 1]) /* ty=Tensor[(4, 16, 1, 1, 1), int32] */;
  %38 = transpose(%37, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %39 = nn.conv2d(%35, %36, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %40 = broadcast_to(%38, shape=[4, 1, 1, 1, 16]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %41 = add(%39, %40) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %42 = add(%41, 32 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %43 = right_shift(%42, 6 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %44 = clip(%43, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %45 = cast(%44, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %46 = annotation.stop_fusion(%45) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %47 = cast(%19, dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %48 = cast(%46, dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %49 = add(%47, %48) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %50 = nn.relu(%49) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %51 = clip(%50, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %52 = cast(%51, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %53 = annotation.stop_fusion(%52) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %54 = cast(%51, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %55 = reshape(meta[relay.Constant][6] /* ty=Tensor[(64, 64, 3, 3), int8] */, newshape=[4, 16, 4, 16, 3, 3]) /* ty=Tensor[(4, 16, 4, 16, 3, 3), int8] */;
  %56 = annotation.stop_fusion(%54) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %57 = transpose(%55, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(4, 4, 3, 3, 16, 16), int8] */;
  %58 = reshape(meta[relay.Constant][7] /* ty=Tensor[(64, 1, 1), int32] */, newshape=[4, 16, 1, 1, 1]) /* ty=Tensor[(4, 16, 1, 1, 1), int32] */;
  %59 = transpose(%58, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %60 = nn.conv2d(%56, %57, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %61 = broadcast_to(%59, shape=[4, 1, 1, 1, 16]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %62 = add(%60, %61) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %63 = nn.relu(%62) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %64 = add(%63, 256 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %65 = right_shift(%64, 9 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %66 = clip(%65, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %67 = cast(%66, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %68 = reshape(meta[relay.Constant][8] /* ty=Tensor[(64, 64, 3, 3), int8] */, newshape=[4, 16, 4, 16, 3, 3]) /* ty=Tensor[(4, 16, 4, 16, 3, 3), int8] */;
  %69 = annotation.stop_fusion(%67) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %70 = transpose(%68, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(4, 4, 3, 3, 16, 16), int8] */;
  %71 = reshape(meta[relay.Constant][9] /* ty=Tensor[(64, 1, 1), int32] */, newshape=[4, 16, 1, 1, 1]) /* ty=Tensor[(4, 16, 1, 1, 1), int32] */;
  %72 = transpose(%71, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %73 = nn.conv2d(%69, %70, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %74 = broadcast_to(%72, shape=[4, 1, 1, 1, 16]) /* ty=Tensor[(4, 1, 1, 1, 16), int32] */;
  %75 = add(%73, %74) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %76 = add(%75, 32 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %77 = right_shift(%76, 6 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %78 = clip(%77, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %79 = cast(%78, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %80 = annotation.stop_fusion(%79) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %81 = cast(%53, dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %82 = cast(%80, dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %83 = add(%81, %82) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %84 = nn.relu(%83) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %85 = clip(%84, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int32] */;
  %86 = cast(%85, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %87 = reshape(meta[relay.Constant][10] /* ty=Tensor[(128, 64, 1, 1), int8] */, newshape=[8, 16, 4, 16, 1, 1]) /* ty=Tensor[(8, 16, 4, 16, 1, 1), int8] */;
  %88 = annotation.stop_fusion(%86) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %89 = transpose(%87, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(8, 4, 1, 1, 16, 16), int8] */;
  %90 = reshape(meta[relay.Constant][11] /* ty=Tensor[(128, 1, 1), int32] */, newshape=[8, 16, 1, 1, 1]) /* ty=Tensor[(8, 16, 1, 1, 1), int32] */;
  %91 = transpose(%90, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %92 = nn.conv2d(%88, %89, strides=[2, 2], padding=[0, 0, 0, 0], channels=128, kernel_size=[1, 1], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %93 = broadcast_to(%91, shape=[8, 1, 1, 1, 16]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %94 = add(%92, %93) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %95 = add(%94, 64 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %96 = right_shift(%95, 7 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %97 = clip(%96, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %98 = cast(%97, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %99 = annotation.stop_fusion(%98) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %100 = cast(%85, dtype="int8") /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %101 = reshape(meta[relay.Constant][12] /* ty=Tensor[(128, 64, 3, 3), int8] */, newshape=[8, 16, 4, 16, 3, 3]) /* ty=Tensor[(8, 16, 4, 16, 3, 3), int8] */;
  %102 = annotation.stop_fusion(%100) /* ty=Tensor[(1, 4, 56, 56, 1, 16), int8] */;
  %103 = transpose(%101, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(8, 4, 3, 3, 16, 16), int8] */;
  %104 = reshape(meta[relay.Constant][13] /* ty=Tensor[(128, 1, 1), int32] */, newshape=[8, 16, 1, 1, 1]) /* ty=Tensor[(8, 16, 1, 1, 1), int32] */;
  %105 = transpose(%104, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %106 = nn.conv2d(%102, %103, strides=[2, 2], padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %107 = broadcast_to(%105, shape=[8, 1, 1, 1, 16]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %108 = add(%106, %107) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %109 = nn.relu(%108) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %110 = add(%109, 256 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %111 = right_shift(%110, 9 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %112 = clip(%111, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %113 = cast(%112, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %114 = reshape(meta[relay.Constant][14] /* ty=Tensor[(128, 128, 3, 3), int8] */, newshape=[8, 16, 8, 16, 3, 3]) /* ty=Tensor[(8, 16, 8, 16, 3, 3), int8] */;
  %115 = annotation.stop_fusion(%113) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %116 = transpose(%114, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(8, 8, 3, 3, 16, 16), int8] */;
  %117 = reshape(meta[relay.Constant][15] /* ty=Tensor[(128, 1, 1), int32] */, newshape=[8, 16, 1, 1, 1]) /* ty=Tensor[(8, 16, 1, 1, 1), int32] */;
  %118 = transpose(%117, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %119 = nn.conv2d(%115, %116, padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %120 = broadcast_to(%118, shape=[8, 1, 1, 1, 16]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %121 = add(%119, %120) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %122 = add(%121, 64 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %123 = right_shift(%122, 7 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %124 = clip(%123, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %125 = cast(%124, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %126 = annotation.stop_fusion(%125) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %127 = cast(%99, dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %128 = cast(%126, dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %129 = add(%127, %128) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %130 = nn.relu(%129) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %131 = clip(%130, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %132 = cast(%131, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %133 = annotation.stop_fusion(%132) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %134 = cast(%131, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %135 = reshape(meta[relay.Constant][16] /* ty=Tensor[(128, 128, 3, 3), int8] */, newshape=[8, 16, 8, 16, 3, 3]) /* ty=Tensor[(8, 16, 8, 16, 3, 3), int8] */;
  %136 = annotation.stop_fusion(%134) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %137 = transpose(%135, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(8, 8, 3, 3, 16, 16), int8] */;
  %138 = reshape(meta[relay.Constant][17] /* ty=Tensor[(128, 1, 1), int32] */, newshape=[8, 16, 1, 1, 1]) /* ty=Tensor[(8, 16, 1, 1, 1), int32] */;
  %139 = transpose(%138, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %140 = nn.conv2d(%136, %137, padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %141 = broadcast_to(%139, shape=[8, 1, 1, 1, 16]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %142 = add(%140, %141) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %143 = nn.relu(%142) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %144 = add(%143, 128 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %145 = right_shift(%144, 8 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %146 = clip(%145, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %147 = cast(%146, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %148 = reshape(meta[relay.Constant][18] /* ty=Tensor[(128, 128, 3, 3), int8] */, newshape=[8, 16, 8, 16, 3, 3]) /* ty=Tensor[(8, 16, 8, 16, 3, 3), int8] */;
  %149 = annotation.stop_fusion(%147) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %150 = transpose(%148, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(8, 8, 3, 3, 16, 16), int8] */;
  %151 = reshape(meta[relay.Constant][19] /* ty=Tensor[(128, 1, 1), int32] */, newshape=[8, 16, 1, 1, 1]) /* ty=Tensor[(8, 16, 1, 1, 1), int32] */;
  %152 = transpose(%151, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %153 = nn.conv2d(%149, %150, padding=[1, 1, 1, 1], channels=128, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %154 = broadcast_to(%152, shape=[8, 1, 1, 1, 16]) /* ty=Tensor[(8, 1, 1, 1, 16), int32] */;
  %155 = add(%153, %154) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %156 = add(%155, 64 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %157 = right_shift(%156, 7 /* ty=int32 */) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %158 = clip(%157, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %159 = cast(%158, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %160 = annotation.stop_fusion(%159) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %161 = cast(%133, dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %162 = cast(%160, dtype="int32") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %163 = add(%161, %162) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %164 = nn.relu(%163) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %165 = clip(%164, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int32] */;
  %166 = cast(%165, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %167 = reshape(meta[relay.Constant][20] /* ty=Tensor[(256, 128, 1, 1), int8] */, newshape=[16, 16, 8, 16, 1, 1]) /* ty=Tensor[(16, 16, 8, 16, 1, 1), int8] */;
  %168 = annotation.stop_fusion(%166) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %169 = transpose(%167, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(16, 8, 1, 1, 16, 16), int8] */;
  %170 = reshape(meta[relay.Constant][21] /* ty=Tensor[(256, 1, 1), int32] */, newshape=[16, 16, 1, 1, 1]) /* ty=Tensor[(16, 16, 1, 1, 1), int32] */;
  %171 = transpose(%170, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %172 = nn.conv2d(%168, %169, strides=[2, 2], padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %173 = broadcast_to(%171, shape=[16, 1, 1, 1, 16]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %174 = add(%172, %173) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %175 = add(%174, 128 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %176 = right_shift(%175, 8 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %177 = clip(%176, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %178 = cast(%177, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %179 = annotation.stop_fusion(%178) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %180 = cast(%165, dtype="int8") /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %181 = reshape(meta[relay.Constant][22] /* ty=Tensor[(256, 128, 3, 3), int8] */, newshape=[16, 16, 8, 16, 3, 3]) /* ty=Tensor[(16, 16, 8, 16, 3, 3), int8] */;
  %182 = annotation.stop_fusion(%180) /* ty=Tensor[(1, 8, 28, 28, 1, 16), int8] */;
  %183 = transpose(%181, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(16, 8, 3, 3, 16, 16), int8] */;
  %184 = reshape(meta[relay.Constant][23] /* ty=Tensor[(256, 1, 1), int32] */, newshape=[16, 16, 1, 1, 1]) /* ty=Tensor[(16, 16, 1, 1, 1), int32] */;
  %185 = transpose(%184, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %186 = nn.conv2d(%182, %183, strides=[2, 2], padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %187 = broadcast_to(%185, shape=[16, 1, 1, 1, 16]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %188 = add(%186, %187) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %189 = nn.relu(%188) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %190 = add(%189, 256 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %191 = right_shift(%190, 9 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %192 = clip(%191, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %193 = cast(%192, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %194 = reshape(meta[relay.Constant][24] /* ty=Tensor[(256, 256, 3, 3), int8] */, newshape=[16, 16, 16, 16, 3, 3]) /* ty=Tensor[(16, 16, 16, 16, 3, 3), int8] */;
  %195 = annotation.stop_fusion(%193) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %196 = transpose(%194, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(16, 16, 3, 3, 16, 16), int8] */;
  %197 = reshape(meta[relay.Constant][25] /* ty=Tensor[(256, 1, 1), int32] */, newshape=[16, 16, 1, 1, 1]) /* ty=Tensor[(16, 16, 1, 1, 1), int32] */;
  %198 = transpose(%197, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %199 = nn.conv2d(%195, %196, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %200 = broadcast_to(%198, shape=[16, 1, 1, 1, 16]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %201 = add(%199, %200) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %202 = add(%201, 64 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %203 = right_shift(%202, 7 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %204 = clip(%203, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %205 = cast(%204, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %206 = annotation.stop_fusion(%205) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %207 = cast(%179, dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %208 = cast(%206, dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %209 = add(%207, %208) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %210 = nn.relu(%209) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %211 = clip(%210, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %212 = cast(%211, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %213 = annotation.stop_fusion(%212) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %214 = cast(%211, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %215 = reshape(meta[relay.Constant][26] /* ty=Tensor[(256, 256, 3, 3), int8] */, newshape=[16, 16, 16, 16, 3, 3]) /* ty=Tensor[(16, 16, 16, 16, 3, 3), int8] */;
  %216 = annotation.stop_fusion(%214) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %217 = transpose(%215, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(16, 16, 3, 3, 16, 16), int8] */;
  %218 = reshape(meta[relay.Constant][27] /* ty=Tensor[(256, 1, 1), int32] */, newshape=[16, 16, 1, 1, 1]) /* ty=Tensor[(16, 16, 1, 1, 1), int32] */;
  %219 = transpose(%218, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %220 = nn.conv2d(%216, %217, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %221 = broadcast_to(%219, shape=[16, 1, 1, 1, 16]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %222 = add(%220, %221) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %223 = nn.relu(%222) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %224 = add(%223, 128 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %225 = right_shift(%224, 8 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %226 = clip(%225, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %227 = cast(%226, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %228 = reshape(meta[relay.Constant][28] /* ty=Tensor[(256, 256, 3, 3), int8] */, newshape=[16, 16, 16, 16, 3, 3]) /* ty=Tensor[(16, 16, 16, 16, 3, 3), int8] */;
  %229 = annotation.stop_fusion(%227) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %230 = transpose(%228, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(16, 16, 3, 3, 16, 16), int8] */;
  %231 = reshape(meta[relay.Constant][29] /* ty=Tensor[(256, 1, 1), int32] */, newshape=[16, 16, 1, 1, 1]) /* ty=Tensor[(16, 16, 1, 1, 1), int32] */;
  %232 = transpose(%231, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %233 = nn.conv2d(%229, %230, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %234 = broadcast_to(%232, shape=[16, 1, 1, 1, 16]) /* ty=Tensor[(16, 1, 1, 1, 16), int32] */;
  %235 = add(%233, %234) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %236 = add(%235, 64 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %237 = right_shift(%236, 7 /* ty=int32 */) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %238 = clip(%237, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %239 = cast(%238, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %240 = annotation.stop_fusion(%239) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %241 = cast(%213, dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %242 = cast(%240, dtype="int32") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %243 = add(%241, %242) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %244 = nn.relu(%243) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %245 = clip(%244, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int32] */;
  %246 = cast(%245, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %247 = reshape(meta[relay.Constant][30] /* ty=Tensor[(512, 256, 1, 1), int8] */, newshape=[32, 16, 16, 16, 1, 1]) /* ty=Tensor[(32, 16, 16, 16, 1, 1), int8] */;
  %248 = annotation.stop_fusion(%246) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %249 = transpose(%247, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(32, 16, 1, 1, 16, 16), int8] */;
  %250 = reshape(meta[relay.Constant][31] /* ty=Tensor[(512, 1, 1), int32] */, newshape=[32, 16, 1, 1, 1]) /* ty=Tensor[(32, 16, 1, 1, 1), int32] */;
  %251 = transpose(%250, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %252 = nn.conv2d(%248, %249, strides=[2, 2], padding=[0, 0, 0, 0], channels=512, kernel_size=[1, 1], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %253 = broadcast_to(%251, shape=[32, 1, 1, 1, 16]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %254 = add(%252, %253) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %255 = add(%254, 32 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %256 = right_shift(%255, 6 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %257 = clip(%256, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %258 = cast(%257, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %259 = annotation.stop_fusion(%258) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %260 = cast(%245, dtype="int8") /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %261 = reshape(meta[relay.Constant][32] /* ty=Tensor[(512, 256, 3, 3), int8] */, newshape=[32, 16, 16, 16, 3, 3]) /* ty=Tensor[(32, 16, 16, 16, 3, 3), int8] */;
  %262 = annotation.stop_fusion(%260) /* ty=Tensor[(1, 16, 14, 14, 1, 16), int8] */;
  %263 = transpose(%261, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(32, 16, 3, 3, 16, 16), int8] */;
  %264 = reshape(meta[relay.Constant][33] /* ty=Tensor[(512, 1, 1), int32] */, newshape=[32, 16, 1, 1, 1]) /* ty=Tensor[(32, 16, 1, 1, 1), int32] */;
  %265 = transpose(%264, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %266 = nn.conv2d(%262, %263, strides=[2, 2], padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %267 = broadcast_to(%265, shape=[32, 1, 1, 1, 16]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %268 = add(%266, %267) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %269 = nn.relu(%268) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %270 = add(%269, 128 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %271 = right_shift(%270, 8 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %272 = clip(%271, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %273 = cast(%272, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %274 = reshape(meta[relay.Constant][34] /* ty=Tensor[(512, 512, 3, 3), int8] */, newshape=[32, 16, 32, 16, 3, 3]) /* ty=Tensor[(32, 16, 32, 16, 3, 3), int8] */;
  %275 = annotation.stop_fusion(%273) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %276 = transpose(%274, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(32, 32, 3, 3, 16, 16), int8] */;
  %277 = reshape(meta[relay.Constant][35] /* ty=Tensor[(512, 1, 1), int32] */, newshape=[32, 16, 1, 1, 1]) /* ty=Tensor[(32, 16, 1, 1, 1), int32] */;
  %278 = transpose(%277, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %279 = nn.conv2d(%275, %276, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %280 = broadcast_to(%278, shape=[32, 1, 1, 1, 16]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %281 = add(%279, %280) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %282 = add(%281, 32 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %283 = right_shift(%282, 6 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %284 = clip(%283, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %285 = cast(%284, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %286 = annotation.stop_fusion(%285) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %287 = cast(%259, dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %288 = cast(%286, dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %289 = add(%287, %288) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %290 = nn.relu(%289) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %291 = clip(%290, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %292 = cast(%291, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %293 = annotation.stop_fusion(%292) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %294 = cast(%291, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %295 = reshape(meta[relay.Constant][36] /* ty=Tensor[(512, 512, 3, 3), int8] */, newshape=[32, 16, 32, 16, 3, 3]) /* ty=Tensor[(32, 16, 32, 16, 3, 3), int8] */;
  %296 = annotation.stop_fusion(%294) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %297 = transpose(%295, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(32, 32, 3, 3, 16, 16), int8] */;
  %298 = reshape(meta[relay.Constant][37] /* ty=Tensor[(512, 1, 1), int32] */, newshape=[32, 16, 1, 1, 1]) /* ty=Tensor[(32, 16, 1, 1, 1), int32] */;
  %299 = transpose(%298, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %300 = nn.conv2d(%296, %297, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %301 = broadcast_to(%299, shape=[32, 1, 1, 1, 16]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %302 = add(%300, %301) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %303 = nn.relu(%302) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %304 = add(%303, 128 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %305 = right_shift(%304, 8 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %306 = clip(%305, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %307 = cast(%306, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %308 = reshape(meta[relay.Constant][38] /* ty=Tensor[(512, 512, 3, 3), int8] */, newshape=[32, 16, 32, 16, 3, 3]) /* ty=Tensor[(32, 16, 32, 16, 3, 3), int8] */;
  %309 = annotation.stop_fusion(%307) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %310 = transpose(%308, axes=[0, 2, 4, 5, 1, 3]) /* ty=Tensor[(32, 32, 3, 3, 16, 16), int8] */;
  %311 = reshape(meta[relay.Constant][39] /* ty=Tensor[(512, 1, 1), int32] */, newshape=[32, 16, 1, 1, 1]) /* ty=Tensor[(32, 16, 1, 1, 1), int32] */;
  %312 = transpose(%311, axes=[0, 2, 3, 4, 1]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %313 = nn.conv2d(%309, %310, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3], data_layout="NCHW1n16c", kernel_layout="OIHW16o16i", out_dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %314 = broadcast_to(%312, shape=[32, 1, 1, 1, 16]) /* ty=Tensor[(32, 1, 1, 1, 16), int32] */;
  %315 = add(%313, %314) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %316 = add(%315, 8 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %317 = right_shift(%316, 4 /* ty=int32 */) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %318 = clip(%317, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %319 = cast(%318, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %320 = annotation.stop_fusion(%319) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %321 = cast(%293, dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %322 = cast(%320, dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %323 = add(%321, %322) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %324 = nn.relu(%323) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %325 = clip(%324, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %326 = cast(%325, dtype="int8") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %327 = annotation.stop_fusion(%326) /* ty=Tensor[(1, 32, 7, 7, 1, 16), int8] */;
  %328 = cast(%327, dtype="int32") /* ty=Tensor[(1, 32, 7, 7, 1, 16), int32] */;
  %329 = transpose(%328, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 32, 16, 7, 7), int32] */;
  %330 = reshape(%329, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */;
  %331 = nn.global_avg_pool2d(%330) /* ty=Tensor[(1, 512, 1, 1), int32] */;
  %332 = nn.batch_flatten(%331) /* ty=Tensor[(1, 512), int32] */;
  %333 = cast(%332, dtype="float32") /* ty=Tensor[(1, 512), float32] */;
  %334 = multiply(%333, 0.0625f /* ty=float32 */) /* ty=Tensor[(1, 512), float32] */;
  %335 = nn.dense(%334, meta[relay.Constant][40] /* ty=Tensor[(1000, 512), float32] */, units=1000) /* ty=Tensor[(1, 1000), float32] */;
  add(%335, meta[relay.Constant][41] /* ty=Tensor[(1000), float32] */) /* ty=Tensor[(1, 1000), float32] */
}