VTA topi.conv2d
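This notebook benchmarks VTA's packed conv2d TOPI implementation (vta.top.conv2d_packed) on ResNet-18 convolution workloads, then checks the result against a NumPy reference.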
import numpy as np
from collections import namedtuple
import tvm
from tvm import te
from tvm import relay
from tvm import autotvm
from tvm.contrib.utils import tempdir
# from tvm.contrib.pickle_memoize import memoize
from tvm import topi
import tvm.topi.testing
import vta
import vta.testing
from vta.testing import simulator
# FIXME: a custom clip operator is needed to work around a limitation in pattern detection.
@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, splits the min and max into two stages."""
    const_min = tvm.tir.const(a_min, x.dtype)
    const_max = tvm.tir.const(a_max, x.dtype)
    x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
    x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
    return x
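
# Note: splitting clip into separate min and max stages presumably lets each
# stage map onto a single VTA ALU instruction with an immediate operand;
# topi.clip fuses both bounds into one compute, which the pattern matcher
# mentioned in the FIXME above does not recognize.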
Workload = namedtuple(
    "Conv2DWorkload",
    [
        "batch",
        "height",
        "width",
        "in_filter",
        "out_filter",
        "hkernel",
        "wkernel",
        "hpad",
        "wpad",
        "hstride",
        "wstride",
    ],
)
env = vta.get_env()
remote = tvm.rpc.LocalSession()
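# Note: a LocalSession suffices here because env.TARGET is presumably the "sim"
# functional simulator; on real hardware (e.g. a Pynq board) you would connect
# with tvm.rpc.connect(host, port) instead.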
# ResNet-18 workloads on ImageNet
resnet_wkls = [
    # ('resnet-18.C1', Workload(env.BATCH, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)),
    ("resnet-18.C2", Workload(env.BATCH, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)),
    # ("resnet-18.C3", Workload(env.BATCH, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)),
    # ("resnet-18.C4", Workload(env.BATCH, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)),
    # ("resnet-18.C5", Workload(env.BATCH, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)),
    # ("resnet-18.C6", Workload(env.BATCH, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)),
    # ("resnet-18.C7", Workload(env.BATCH, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)),
    # ("resnet-18.C8", Workload(env.BATCH, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)),
    # ("resnet-18.C9", Workload(env.BATCH, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)),
    # ("resnet-18.C10", Workload(env.BATCH, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)),
    # ("resnet-18.C11", Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)),
]
with autotvm.tophub.context(env.target):
    for _, wl in resnet_wkls:
        print(wl)
        assert wl.hpad == wl.wpad
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
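        # With the default simulator configuration (BATCH=1, BLOCK_IN=16) this
        # is the packed layout string "NCHW1n16c".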
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed
        # Derive shapes depending upon packing
        a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
        w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
        b_shape = (wl.batch, wl.out_filter, 1, 1)
        # data pack
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
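        # For the C2 workload under the default config (BATCH=1,
        # BLOCK_IN=BLOCK_OUT=16) these packed shapes work out to
        # data (1, 4, 56, 56, 1, 16), kernel (4, 4, 3, 3, 16, 16)
        # and bias (1, 4, 1, 1, 1, 16).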
        data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
        bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
        padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))
        # Define base computation schedule
        with env.target:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype
            )
            res = topi.right_shift(res, 8)
            res = topi.add(res, bias)
            res = my_clip(res, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
            res = topi.cast(res, env.out_dtype)
            # Derive base schedule
            s = conv2d_fschedule([res])
            # print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
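        # The epilogue requantizes the 32-bit accumulator: shift right by 8
        # (presumably matching env.WGT_WIDTH, as the reference check below uses),
        # add the bias, clip into [0, 127] (the non-negative half of the 8-bit
        # output range, which also acts as a ReLU), and cast to env.out_dtype.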
        # Derive number of ops
        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
        num_ops = (
            2
            * wl.batch
            * fout_height
            * fout_width
            * wl.hkernel
            * wl.wkernel
            * wl.out_filter
            * wl.in_filter
        )
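        # The factor of 2 counts each multiply-accumulate as two ops. For C2 this
        # gives 2 * 1 * 56 * 56 * 3 * 3 * 64 * 64 = 231,211,008 ops (~0.23 GOP),
        # which at the 0.0928819 s measured below yields the reported ~2.49 GOPS.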
        # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
        def get_ref_data():
            # derive min/max for act, wgt, and bias types (max non-inclusive)
            a_min, a_max = -(1 << (env.INP_WIDTH - 1)), 1 << (env.INP_WIDTH - 1)
            w_min, w_max = -(1 << (env.WGT_WIDTH - 1)), 1 << (env.WGT_WIDTH - 1)
            b_min, b_max = (
                -(1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)),
                1 << (env.INP_WIDTH + env.WGT_WIDTH - 2),
            )
            a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
            w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
            b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
            r_np = tvm.topi.testing.conv2d_nchw_python(
                a_np.astype(env.acc_dtype),
                w_np.astype(env.acc_dtype),
                (wl.hstride, wl.wstride),
                wl.hpad,
            ).astype(env.acc_dtype)
            return a_np, w_np, b_np, r_np
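        # Under the default config (INP_WIDTH=WGT_WIDTH=8) the random activations
        # and weights span the full int8 range [-128, 128) and biases span
        # [-2**14, 2**14); the reference convolution runs on the host in the
        # 32-bit accumulator type.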
        # Data in original format
        data_np, kernel_np, bias_np, res_ref = get_ref_data()
        # data pack
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT
        )
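        # The reshape splits the N and C axes into (outer, inner) pairs and the
        # transpose moves the inner factors last, i.e. NCHW -> (N/n, C/c, H, W, n, c),
        # matching the packed placeholder shapes above.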
        # build
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, bias, res],
                target=tvm.target.Target(env.target, host=env.target_host),
                name="conv2d",
            )
        temp = tempdir()
        mod.save(temp.relpath("conv2d.o"))
        remote.upload(temp.relpath("conv2d.o"))
        f = remote.load_module("conv2d.o")
        dev = remote.device(str(env.target))
        res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
        data_arr = tvm.nd.array(data_np, dev)
        kernel_arr = tvm.nd.array(kernel_np, dev)
        bias_arr = tvm.nd.array(bias_np, dev)
        res_arr = tvm.nd.array(res_np, dev)
        time_f = f.time_evaluator("conv2d", dev, number=4)
        simulator.clear_stats()
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
        stats = simulator.stats()
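        # stats is a dict of simulator counters (e.g. bytes loaded per buffer and
        # GEMM/ALU instruction counts); it is only meaningful on the "sim" target,
        # and the simulator's wall-clock time is not real hardware latency.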
        # Correctness check
        res_orig = res_arr.numpy()
        # data unpack
        res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
            wl.batch, wl.out_filter, fout_height, fout_width
        )
        bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)
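        # Fail loudly if the VTA result diverges from the reference; the
        # pipeline above mirrors the on-device epilogue (shift, bias, clip,
        # cast), so the comparison should be exact.
        assert correct, "conv2d output mismatch vs. NumPy reference"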
        # Report performance
        gops = (num_ops / cost.mean) / float(10**9)
        print(f"CONV2D TEST: Time cost = {cost.mean:g} sec/op, {gops:g} GOPS")
Conv2DWorkload(batch=1, height=56, width=56, in_filter=64, out_filter=64, hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
CONV2D TEST: Time cost = 0.0928819 sec/op, 2.4893 GOPS
[17:25:56] /media/pc/data/lxw/ai/tvm/src/tir/transforms/arg_binder.cc:95: Warning: Trying to bind buffer to another one with lower alignment requirement required_alignment=256, provided_alignment=64
2023-09-25 17:25:56.273 INFO load_module /tmp/tmper0fe63q/conv2d.o