Source code for vta.top.op

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=unused-argument, ungrouped-imports
"""Namespace for supporting Relay operators on VTA."""
from __future__ import absolute_import as _abs

import tvm
from tvm import te
from tvm import autotvm
from tvm import topi

from tvm.relay.op import op as reg
from tvm.relay.op import strategy as _strategy
from tvm.relay.op.op import OpPattern, OpStrategy

from .utils import is_packed_layout
from .vta_conv2d import conv2d_packed, schedule_conv2d_packed
from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed
from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed
from .vta_dense import dense_packed, schedule_dense_packed
from ..environment import get_env

ENV = get_env()

# override to force partition at copy
reg.register_pattern("copy", OpPattern.INJECTIVE, level=15)


# add clip vta strategy
def compute_clip_vta(attrs, inputs, output_type):
    """Clip operator."""
    x = inputs[0]
    a_min = attrs.a_min
    a_max = attrs.a_max
    const_min = tvm.tir.const(a_min, x.dtype)
    const_max = tvm.tir.const(a_max, x.dtype)
    with tvm.te.tag_scope(topi.tag.ELEMWISE):
        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
        x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
    return [x]

def clip_strategy_vta(attrs, inputs, out_type, target):
    strategy = OpStrategy()
    strategy.add_implementation(
        compute_clip_vta,
        _strategy.wrap_topi_schedule(topi.generic.schedule_injective),
        name="clip.vta",
    )
    return strategy

reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") @autotvm.register_topi_compute("add.vta")

@autotvm.register_topi_compute("add.vta")
def add_packed(cfg, lhs, rhs):
    return topi.add(lhs, rhs)

@autotvm.register_topi_compute("multiply.vta")
def multiply_packed(cfg, lhs, rhs):
    return topi.multiply(lhs, rhs)

def schedule_alu_packed(cfg, outs):
    """alu packed schedule"""
    assert len(outs) == 1

    def is_cast_op(op):
        return op.name == "T_cast"

    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    output = outs[0]
    s = te.create_schedule([x.op for x in outs])
    te.schedule.AutoInlineInjective(s)

    # other target does not support alu-only ops
    if not (ENV.TARGET in ["sim", "tsim", "intelfocl"]):
        return s

    # only put the int-related ops to vta
    if "int" in output.dtype and len(output.shape) == 6:
        ewise_inputs = []
        ewise_ops = []
        const_ops = []

        def _traverse(op):
            if topi.tag.is_broadcast(op.tag):
                if not op.same_as(output.op):
                    if not op.axis:
                        const_ops.append(op)
                    elif not is_cast_op(op):
                        ewise_ops.append(op)

                for tensor in op.input_tensors:
                    if isinstance(tensor.op, tvm.te.PlaceholderOp):
                        ewise_inputs.append((op, tensor))
                    elif is_cast_op(tensor.op) and not op.same_as(output.op):
                        ewise_inputs.append((op, tensor))
                    else:
                        _traverse(tensor.op)
            else:
                for tensor in op.input_tensors:
                    if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) and (
                        not is_cast_op(tensor.op)
                    ):
                        _traverse(tensor.op)

        op = output.op
        _traverse(op)
        for _, t in ewise_inputs:
            if t.dtype == "float32":
                return s

        x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis

        cfg.define_split("tile_co", x_co, num_outputs=2)
        cfg.define_split("tile_h", x_i, num_outputs=2)
        cfg.define_split("tile_w", x_j, num_outputs=2)

        x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co)
        x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_i)
        x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j)
        s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
        store_pt = x_j0

        for e_o in ewise_ops:
            s[e_o].set_scope(ENV.acc_scope)
            s[e_o].pragma(s[e_o].op.axis[0], ENV.alu)
            s[e_o].compute_at(s[output], store_pt)

        # cache read input
        cache_read_ewise = []
        for consumer, tensor in ewise_inputs:
            cache_read_ewise.append(s.cache_read(tensor, ENV.acc_scope, [consumer]))

        for tensor in cache_read_ewise:
            if s[tensor].op.axis:
                s[tensor].pragma(s[tensor].op.axis[0], ENV.dma_copy)
            s[tensor].compute_at(s[output], store_pt)

        for op in const_ops:
            s[op].compute_inline()

        s[output].pragma(x_co1, ENV.dma_copy)

    return s

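
# Example (illustrative sketch, not part of the upstream module): the blocking that
# schedule_alu_packed applies to a 6-D packed tensor, reproduced with explicit split
# factors instead of AutoTVM knobs so it can be inspected without a tuning config.
# The packed shape (batch_outer, channel_outer, h, w, batch_inner, channel_inner)
# and the split factors are assumptions.
def _example_alu_blocking():
    a = te.placeholder((1, 4, 14, 14, 1, 16), dtype="int8", name="a")
    b = te.placeholder((1, 4, 14, 14, 1, 16), dtype="int8", name="b")
    c = topi.add(a, b)
    s = te.create_schedule(c.op)
    x_bo, x_co, x_i, x_j, x_bi, x_ci = s[c].op.axis
    x_co0, x_co1 = s[c].split(x_co, factor=2)
    x_i0, x_i1 = s[c].split(x_i, factor=7)
    x_j0, x_j1 = s[c].split(x_j, factor=7)
    # Same loop order as schedule_alu_packed: the packed axes stay innermost.
    s[c].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
    print(tvm.lower(s, [a, b, c], simple_mode=True))
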
@autotvm.register_topi_schedule("add.vta")
def schedule_add_packed(cfg, outs):
    return schedule_alu_packed(cfg, outs)

@autotvm.register_topi_schedule("multiply.vta")
def schedule_multiply_packed(cfg, outs):
    return schedule_alu_packed(cfg, outs)

def add_strategy_vta(attrs, inputs, out_type, target):
    strategy = OpStrategy()
    strategy.add_implementation(
        _strategy.wrap_topi_compute(add_packed),
        _strategy.wrap_topi_schedule(schedule_add_packed),
        name="add.vta",
    )
    return strategy

def multiply_strategy_vta(attrs, inputs, out_type, target):
    strategy = OpStrategy()
    strategy.add_implementation(
        _strategy.wrap_topi_compute(multiply_packed),
        _strategy.wrap_topi_schedule(schedule_multiply_packed),
        name="multiply.vta",
    )
    return strategy

# other target does not support alu-only ops
if ENV.TARGET in ["sim", "intelfocl"]:
    reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
    reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")


@_strategy.conv2d_strategy.register("vta")
def conv2d_strategy_vta(attrs, inputs, out_type, target):
    """conv2d vta strategy"""
    strategy = OpStrategy()
    kernel = inputs[1]
    dilation = topi.utils.get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout

    assert dilation == (1, 1), "support for dilation limited to (1, 1)"
    if is_packed_layout(layout):
        if groups == 1:
            assert ENV.LOG_INP_WIDTH == 3, "only support 8bit inp for now"
            assert ENV.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now"
            assert kernel.dtype == "int8"

            strategy.add_implementation(
                _strategy.wrap_compute_conv2d(conv2d_packed, need_data_layout=True),
                _strategy.wrap_topi_schedule(schedule_conv2d_packed),
                name="conv2d_packed.vta",
            )
        else:  # group_conv2d
            strategy.add_implementation(
                _strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True),
                _strategy.wrap_topi_schedule(schedule_group_conv2d_packed),
                name="group_conv2d_packed.vta",
            )
        return strategy

    # If it's not packed, run on ARM CPU
    arm_tgt = tvm.target.arm_cpu(target.model)
    return _strategy.arm_cpu.conv2d_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt)
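
# Example (illustrative, not part of the upstream module): conv2d_strategy_vta
# dispatches on the data layout string. is_packed_layout (imported above) accepts
# VTA's packed NCHWxnxc layouts and rejects plain NCHW, which takes the ARM CPU
# fallback path instead. The layout strings below are assumptions.
def _example_layout_dispatch():
    assert is_packed_layout("NCHW1n16c")
    assert not is_packed_layout("NCHW")
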
@_strategy.conv2d_transpose_strategy.register("vta")
def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target):
    """conv2d_transpose vta strategy"""
    dilation = topi.utils.get_const_tuple(attrs.dilation)
    layout = attrs.data_layout
    assert dilation == (1, 1), "support for dilation limited to (1, 1)"

    if is_packed_layout(layout):
        strategy = OpStrategy()
        strategy.add_implementation(
            _strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed),
            _strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed),
            name="conv2d_transpose_packed.vta",
        )
        return strategy

    # If it's not packed, run on ARM CPU
    arm_tgt = tvm.target.arm_cpu(target.model)
    return _strategy.arm_cpu.conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt)
@_strategy.dense_strategy.register("vta")
def dense_strategy_vta(attrs, inputs, out_type, target):
    """dense vta strategy"""
    if len(inputs[0].shape) == 4:  # this implies the layout is packed
        strategy = OpStrategy()
        strategy.add_implementation(
            _strategy.wrap_compute_dense(dense_packed),
            _strategy.wrap_topi_schedule(schedule_dense_packed),
            name="dense_packed.vta",
        )
        return strategy

    # If it's not packed, run on ARM CPU
    arm_tgt = tvm.target.arm_cpu(target.model)
    return _strategy.x86.dense_strategy_cpu(attrs, inputs, out_type, arm_tgt)
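
# Example (illustrative sketch, not part of the upstream module): dense_strategy_vta
# treats a 4-D input as packed. Assuming the graph packing pass reshapes a
# (batch, in_dim) dense input into (batch // BATCH, in_dim // BLOCK_IN, BATCH, BLOCK_IN),
# the check below mirrors the condition the strategy keys on. The BATCH / BLOCK_IN
# values and the transpose order are assumptions.
def _example_dense_packing():
    import numpy as np

    batch_factor, block_in = 1, 16           # assumed ENV.BATCH / ENV.BLOCK_IN
    flat = np.zeros((4, 64), dtype="int8")   # (batch, in_dim) seen by a plain dense
    packed = flat.reshape(4 // batch_factor, batch_factor, 64 // block_in, block_in)
    packed = packed.transpose(0, 2, 1, 3)
    assert len(packed.shape) == 4             # dense_strategy_vta's packed-layout test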