# 外部代码生成

import tvm
from tvm import relay, runtime, testing
from tvm.contrib import utils
def set_external_func_attr(func, compiler, ext_symbol):
    """Attach the attributes that mark ``func`` for an external code generator.

    Three attributes are set, in this order, through successive
    ``with_attr`` calls:

    * ``"Primitive"``     -> the int32 constant ``1``
    * ``"Compiler"``      -> ``compiler``
    * ``"global_symbol"`` -> ``ext_symbol``

    Parameters
    ----------
    func
        Object exposing ``with_attr(key, value)`` (a ``relay.Function`` in
        this file).
    compiler
        Value stored under ``"Compiler"`` (e.g. ``"ccompiler"``).
    ext_symbol
        Value stored under ``"global_symbol"`` (e.g. ``"ccompiler_0"``).

    Returns
    -------
    The result of the last ``with_attr`` call.
    """
    # Order matters for reproducing the original call sequence exactly.
    attr_table = (
        ("Primitive", tvm.tir.IntImm("int32", 1)),
        ("Compiler", compiler),
        ("global_symbol", ext_symbol),
    )
    annotated = func
    for attr_key, attr_value in attr_table:
        annotated = annotated.with_attr(attr_key, attr_value)
    return annotated
from pathlib import Path
from collections import OrderedDict
import numpy as np

import tvm
import tvm.testing
from tvm import relay, runtime
from tvm.relay.build_module import bind_params_by_name
from tvm.relay.op.annotation import compiler_begin, compiler_end
def update_lib(lib, source_dir):
    """Export ``lib`` to a shared object in a temp directory and load it back.

    Parameters
    ----------
    lib
        Object exposing ``export_library`` (the library returned by
        ``Executable.save()`` in this file).
    source_dir
        Root of the TVM source tree; ``<source_dir>/src/runtime/contrib`` is
        passed to the compiler as an include directory.

    Returns
    -------
    The module returned by ``tvm.runtime.load_module`` for the exported
    ``lib.so``.
    """
    include_dir = Path(source_dir) / "src/runtime/contrib"
    compile_flags = ["-O2", "-std=c++17", f"-I{include_dir}"]

    # Keep the temp-dir handle bound until after load_module, as the original
    # did (presumably the directory is cleaned up once the handle is dropped).
    scratch = utils.tempdir()
    shared_object = scratch.relpath("lib.so")

    lib.export_library(shared_object, fcompile=False, options=compile_flags)
    return tvm.runtime.load_module(shared_object)
def check_result(
    mod,
    map_inputs,
    out_shape,
    result,
    tol=1e-5,
    target="llvm",
    device=tvm.cpu(),
    source_dir="/media/pc/data/board/arria10/lxw/tasks/tvm-ai",
):
    """Compile ``mod`` with the Relay VM, run it, and compare with ``result``.

    Steps: compile under a ``PassContext`` (opt_level 3, ``AlterOpLayout``
    disabled), split the executable into bytecode and library via ``save()``,
    rebuild the library with ``update_lib``, reload the executable, run it on
    ``device`` and check the output with ``tvm.testing.assert_allclose``.

    Parameters
    ----------
    mod
        Module handed to ``relay.vm.compile``.
    map_inputs
        Mapping of input name -> array; expanded as keyword arguments to
        ``VirtualMachine.run``.
    out_shape
        Accepted for call-site compatibility; not read by this function.
    result
        Expected output array.
    tol
        Used as both ``rtol`` and ``atol`` in the comparison.
    target
        Compilation target passed to ``relay.vm.compile``.
    device
        Device on which the virtual machine is created.
    source_dir
        TVM source root forwarded to ``update_lib``.
    """
    pass_ctx = tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"])
    with pass_ctx:
        compiled = relay.vm.compile(mod, target=target)

    # Round-trip through save()/load_exec() so the library can be re-exported
    # by update_lib before the executable is reassembled.
    bytecode, kernel_lib = compiled.save()
    kernel_lib = update_lib(kernel_lib, source_dir=source_dir)
    reloaded = runtime.vm.Executable.load_exec(bytecode, kernel_lib)

    machine = runtime.vm.VirtualMachine(reloaded, device)
    actual = machine.run(**map_inputs)
    tvm.testing.assert_allclose(actual.numpy(), result, rtol=tol, atol=tol)

## 多节点子图

# Inputs of the main function: one data tensor and eight weights, all 10x10.
x = relay.var("x", shape=(10, 10))
w0, w1, w2, w3, w4, w5, w6, w7 = (
    relay.var(f"w{idx}", shape=(10, 10)) for idx in range(8)
)

# subgraph0: ((x0 + w00) - w01) * w02, tagged for "ccompiler" as ccompiler_0
x0 = relay.var("x0", shape=(10, 10))
w00, w01, w02 = (
    relay.var(name, shape=(10, 10)) for name in ("w00", "w01", "w02")
)
z00 = relay.add(x0, w00)
p00 = relay.subtract(z00, w01)
q00 = relay.multiply(p00, w02)
subgraph0 = set_external_func_attr(
    relay.Function([x0, w00, w01, w02], q00), "ccompiler", "ccompiler_0"
)
call0 = relay.Call(subgraph0, [x, w0, w1, w2])

# subgraph1: same computation on its own parameters, tagged as ccompiler_1
x1 = relay.var("x1", shape=(10, 10))
w10, w11, w12 = (
    relay.var(name, shape=(10, 10)) for name in ("w10", "w11", "w12")
)
z10 = relay.add(x1, w10)
p10 = relay.subtract(z10, w11)
q10 = relay.multiply(p10, w12)
subgraph1 = set_external_func_attr(
    relay.Function([x1, w10, w11, w12], q10), "ccompiler", "ccompiler_1"
)
call1 = relay.Call(subgraph1, [x, w3, w4, w5])

# Remaining computation carries no external-compiler attributes: (x + w6) - w7
z2 = relay.add(x, w6)
q2 = relay.subtract(z2, w7)

# Stack the three 10x10 results along axis 0 and wrap everything in "main".
r = relay.concatenate((call0, call1, q2), axis=0)
f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r)
mod = tvm.IRModule()
mod["main"] = f
mod = relay.transform.InferType()(mod)
mod.show()
def @main(%x: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w2: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w3: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w4: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w5: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w6: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w7: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */) -> Tensor[(30, 10), float32] {
  %2 = fn (%x0: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w00: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w01: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w02: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="ccompiler", global_symbol="ccompiler_0") -> Tensor[(10, 10), float32] {
    %0 = add(%x0, %w00) /* ty=Tensor[(10, 10), float32] */;
    %1 = subtract(%0, %w01) /* ty=Tensor[(10, 10), float32] */;
    multiply(%1, %w02) /* ty=Tensor[(10, 10), float32] */
  } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */;
  %5 = fn (%x1: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w10: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w11: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, %w12: Tensor[(10, 10), float32] /* ty=Tensor[(10, 10), float32] */, Primitive=1, Compiler="ccompiler", global_symbol="ccompiler_1") -> Tensor[(10, 10), float32] {
    %3 = add(%x1, %w10) /* ty=Tensor[(10, 10), float32] */;
    %4 = subtract(%3, %w11) /* ty=Tensor[(10, 10), float32] */;
    multiply(%4, %w12) /* ty=Tensor[(10, 10), float32] */
  } /* ty=fn (Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] */;
  %6 = add(%x, %w6) /* ty=Tensor[(10, 10), float32] */;
  %7 = %2(%x, %w0, %w1, %w2) /* ty=Tensor[(10, 10), float32] */;
  %8 = %5(%x, %w3, %w4, %w5) /* ty=Tensor[(10, 10), float32] */;
  %9 = subtract(%6, %w7) /* ty=Tensor[(10, 10), float32] */;
  %10 = (%7, %8, %9) /* ty=(Tensor[(10, 10), float32], Tensor[(10, 10), float32], Tensor[(10, 10), float32]) */;
  concatenate(%10) /* ty=Tensor[(30, 10), float32] */
}
# Random float32 inputs: the data tensor first, then the eight weights.
x_data = np.random.rand(10, 10).astype("float32")
w_data = [np.random.rand(10, 10).astype("float32") for _ in range(8)]

# Input names must match the relay.var names of the main function.
map_inputs = OrderedDict([("x", x_data)])
map_inputs.update((f"w{i}", weight) for i, weight in enumerate(w_data))
out_shape = (30, 10)

# NumPy reference: the two subgraph results and the plain branch, stacked on axis 0.
result = np.concatenate(
    [
        (x_data + w_data[0] - w_data[1]) * w_data[2],
        (x_data + w_data[3] - w_data[4]) * w_data[5],
        x_data + w_data[6] - w_data[7],
    ],
    axis=0,
)
check_result(
    mod,
    map_inputs,
    out_shape,
    result,
    tol=1e-5,
    target="llvm",
    device=tvm.cpu(),
)

## 外部 gcc 单个算子

# Outer inputs of the module.
x, y = (relay.var(name, shape=(8, 8)) for name in ("x", "y"))

# A single add, wrapped in a function tagged for "ccompiler" as ccompiler_0.
x0, y0 = (relay.var(name, shape=(8, 8)) for name in ("x0", "y0"))
z = x0 + y0
f = set_external_func_attr(relay.Function([x0, y0], z), "ccompiler", "ccompiler_0")
call = relay.Call(f, [x, y])
mod = tvm.IRModule.from_expr(call)

# Random float32 inputs; the expected output is their elementwise sum.
x_data, y_data = (np.random.rand(8, 8).astype("float32") for _ in range(2))

check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data)
[15:44:05] /media/pc/data/board/arria10/lxw/tasks/tvm-ai/src/relay/backend/vm/compiler.cc:1199: All lowered functions have been build by BYOC -- generating an empty TVM module

## 外部 gcc

# Outer inputs of the module.
x, y = (relay.var(name, shape=(2, 2)) for name in ("x", "y"))

# Multiply subgraph (ccompiler_2), called as y * y.
x0, y0 = (relay.var(name, shape=(2, 2)) for name in ("x0", "y0"))
mul = set_external_func_attr(
    relay.Function([x0, y0], x0 * y0), "ccompiler", "ccompiler_2"
)
call_mul = relay.Call(mul, [y, y])

# Add subgraph (ccompiler_1), called as x + x.
x1, y1 = (relay.var(name, shape=(2, 2)) for name in ("x1", "y1"))
add = set_external_func_attr(
    relay.Function([x1, y1], x1 + y1), "ccompiler", "ccompiler_1"
)
call_add = relay.Call(add, [x, x])

# Subtract subgraph (ccompiler_0), fed by the two calls above.
x2, y2 = (relay.var(name, shape=(2, 2)) for name in ("x2", "y2"))
sub = set_external_func_attr(
    relay.Function([x2, y2], x2 - y2), "ccompiler", "ccompiler_0"
)
call_sub = relay.Call(sub, [call_mul, call_add])
mod = tvm.IRModule.from_expr(call_sub)

x_data, y_data = (np.random.rand(2, 2).astype("float32") for _ in range(2))

# "y" is inserted before "x", as in the original input mapping.
inputs = OrderedDict((("y", y_data), ("x", x_data)))

check_result(mod, inputs, (2, 2), (y_data * y_data) - (x_data + x_data))
[15:44:06] /media/pc/data/board/arria10/lxw/tasks/tvm-ai/src/relay/backend/vm/compiler.cc:1199: All lowered functions have been build by BYOC -- generating an empty TVM module
# Print the Relay IR of the three-subgraph module built above.
mod.show()
def @main(%y: Tensor[(2, 2), float32], %x: Tensor[(2, 2), float32]) {
  %0 = fn (%x0: Tensor[(2, 2), float32], %y0: Tensor[(2, 2), float32], Primitive=1, Compiler="ccompiler", global_symbol="ccompiler_2") {
    multiply(%x0, %y0)
  };
  %1 = fn (%x1: Tensor[(2, 2), float32], %y1: Tensor[(2, 2), float32], Primitive=1, Compiler="ccompiler", global_symbol="ccompiler_1") {
    add(%x1, %y1)
  };
  %2 = %0(%y, %y);
  %3 = %1(%x, %x);
  %4 = fn (%x2: Tensor[(2, 2), float32], %y2: Tensor[(2, 2), float32], Primitive=1, Compiler="ccompiler", global_symbol="ccompiler_0") {
    subtract(%x2, %y2)
  };
  %4(%2, %3)
}