PackedFunc#

参考:运行时系统

Function 在 TVM 中起着沟通 frontend 和 backend 的关键作用。Function 提供了类型擦除接口(type-erased interface):调用时无需声明参数的具体类型,只需按位置传入参数即可完成调用。

  • 编译后的模块返回 Function

  • TVM 后端还将其 API 注册并暴露为 Function

PackedFunc 常见使用场景:

  • 自动暴露 C++ API 到 Python。

  • 从 Python 端调用 PackedFunc。

  • 在生成代码(generated code)中回调 Python 函数来检查中间结果。

  • 将 Python 钩子(hook)引入 C++ 后端。

全局函数#

  • tvm._ffi.registry.register_func() 用于注册全局函数。

下面的代码将 my_packed_func 注册为全局函数。

import tvm
# Fix: the snippet needs ``tvm.testing`` (used later for object_use_count,
# device_test, echo); a bare ``import testing`` has no such module to find.
import tvm.testing

# The registered function checks it receives exactly these positional args.
targs = (10, 10.0, "hello")


@tvm.register_func
def my_packed_func(*args):
    """Global PackedFunc: assert the expected positional args, return 10."""
    assert tuple(args) == targs
    return 10
  • tvm.get_global_func():获取全局函数。

注意,这里只是从全局函数表中返回它,然后从 Python 端回调它。

from tvm.runtime.packed_func import PackedFunc

# Look up the previously registered function in the global function table.
f = tvm.get_global_func("my_packed_func")
# The handle returned from the table is a PackedFunc wrapper.
assert isinstance(f, PackedFunc)
# Calling it from Python round-trips through the TVM runtime.
y = f(*targs)
assert y == 10

但是,也可以从 C++ 后端或在编译后的 TVM 代码中回调相同的函数。

Python 调用 C++ 接口#

使用 C++ 定义加法运算,并提供 Makefile:

#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
using namespace tvm::runtime;

// Example PackedFunc body: add two integer arguments.
void MyAdd(TVMArgs args, TVMRetValue* rv) {
  // Automatically convert the arguments to the desired type.
  int a = args[0];
  int b = args[1];
  // Automatically assign the return value rv.
  *rv = a + b;
}

// Register the global packed function.
TVM_REGISTER_GLOBAL("myadd").set_body(MyAdd);
# Minimum Makefile for the extension package
# NOTE(review): "TVM路径" below is a placeholder — replace it with the real
# path to the TVM source tree before running make.
TVM_ROOT=$(shell cd TVM路径; pwd)
# C++17, optimized, position-independent; include TVM, dmlc-core and dlpack headers.
PKG_CFLAGS = -std=c++17 -O2 -fPIC\
	-I${TVM_ROOT}/include\
	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
	-I${TVM_ROOT}/3rdparty/dlpack/include\
	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>


# Link against the prebuilt TVM runtime in ${TVM_ROOT}/build.
PKG_LDFLAGS =-L${TVM_ROOT}/build
UNAME_S := $(shell uname -s)

# On macOS, allow undefined symbols to be resolved when the dylib is loaded.
ifeq ($(UNAME_S), Darwin)
	PKG_LDFLAGS += -undefined dynamic_lookup
endif

# Build the extension shared library from the single C++ source file.
lib/libtvm_ext.so: src/tvm_ext.cc
	@mkdir -p $(@D)
	$(CXX) $(PKG_CFLAGS) -shared -o $@ $^ $(PKG_LDFLAGS)

执行 make,输出动态库到 lib/libtvm_ext.so,接着,Python 代码添加如下内容,便可直接调用 C++ 接口:

from pathlib import Path
import ctypes

def load_lib():
    """Load the extension library; its functions get registered into TVM."""
    lib_path = Path("tests").resolve() / "lib/libtvm_ext.so"
    # Load with RTLD_GLOBAL so the extern symbols are visible to other DLLs.
    return ctypes.CDLL(str(lib_path), ctypes.RTLD_GLOBAL)


# Load once at import time so the C++-registered "myadd" becomes available.
_LIB = load_lib()

# The C++ function is now reachable through TVM's global function table.
myadd = tvm.get_global_func("myadd")
myadd(4, 5)

PackedFunc 也可以作为参数传递。

在 C++ 端定义:

// "callhello" takes a PackedFunc as its first argument and invokes it,
// showing that callables can cross the FFI boundary in both directions.
TVM_REGISTER_GLOBAL("callhello")
.set_body([](TVMArgs args, TVMRetValue* rv) {
  PackedFunc f = args[0];
  f("hello world");
});

Python 端可以这样调用:

def f(msg):
  """Print the message passed back from the C++ side."""
  print(msg)


# Pass the Python function into C++; it is invoked with "hello world".
callhello = tvm.get_global_func("callhello")
callhello(f)

convert 函数#

convert() 将给定的 value 转换为 TVM 对象。

比如,列表:

# Convert a Python list into a TVM runtime Array container.
a = tvm.runtime.convert([1, 2, 3])
assert len(a) == 3
# Negative indexing works like a Python list; elements are wrapped values.
assert a[-1].value == 3
a_slice = a[-3:-1]
assert (a_slice[0].value, a_slice[1].value) == (1, 2)
type(a)
# REPL output:
tvm.ir.container.Array

可以序列化为 JSON:

json_str = tvm.ir.save_json(a)
# Load it back
a_loaded = tvm.ir.load_json(json_str)
type(json_str)
# The round-tripped container must be structurally equal to the original.
tvm.ir.assert_structural_equal(a_loaded, a, map_free_vars=True)

字典:

# A Python dict converts into a TVM Map container.
amap = tvm.runtime.convert({"a": 2, "b": 3})
type(amap)
# REPL output:
tvm.ir.container.Map

其他:

x = tvm.nd.array([1, 2, 3])
# NDArrays in a list are kept by reference (same handle twice below).
arr = tvm.runtime.convert([x, x])
arr
# REPL output (addresses differ per run):
[runtime.NDArray(0x9600a80), runtime.NDArray(0x9600a80)]
tvm.runtime.convert(f)
# REPL output:
<tvm.runtime.packed_func.PackedFunc at 0x7fab465b96d0>
s = "hello"
a = bytearray(s, encoding="ascii")

# A Python function converts into a PackedFunc; bytearray args round-trip.
def myfunc(ss):
    assert ss == a

f = tvm.runtime.convert(myfunc)
f(a)
# NOTE: redefines myfunc — this one checks an empty-tuple argument.
def myfunc(ss):
    assert tuple(ss) == ()

x = tvm.runtime.convert(())
tvm.runtime.convert(myfunc)(x)

返回节点的回调#

x = tvm.runtime.convert(10)

def test(y):
    """Callback that receives a node, checks it is a distinct handle, returns it."""
    assert y.handle != x.handle
    return y

f2 = tvm.runtime.convert(test)
# register into global function table
@tvm.register_func
def my_callback_with_node(y, f):
    """Invoke the PackedFunc ``f`` on node ``y`` and return its result."""
    assert y == x
    return f(y)

# get it out from global function table
f = tvm.get_global_func("my_callback_with_node")
assert isinstance(f, tvm.runtime.PackedFunc)
y = f(x, f2)
# The node returned through the callback still wraps the original value.
assert y.value == 10
f2
# REPL output:
<tvm.runtime.packed_func.PackedFunc at 0x7fab464b5cd0>

返回函数#

def addy(y):
    """Return a closure that adds the captured ``y`` via TVM convert."""
    def _add_y(x):
        return tvm.runtime.convert(x + y)

    return _add_y

# Converting addy yields a PackedFunc that itself returns another PackedFunc.
myf = tvm.runtime.convert(addy)
f = myf(10)
assert f(11).value == 21

设备函数#

def test_device_func(dev):
    """Check a Device argument round-trips; return a Device as the result."""
    assert tvm.cuda(7) == dev
    return tvm.cpu(0)

x = test_device_func(tvm.cuda(7))
assert x == tvm.cpu(0)
# Devices also survive a round trip through the C++-side test helper.
x = tvm.opencl(10)
x = tvm.testing.device_test(x, x.device_type, x.device_id)
assert x == tvm.opencl(10)

右值引用#

rvalue_ref:

def callback(x, expected_count):
    """Assert the runtime use count of ``x`` matches, then return it."""
    assert expected_count == tvm.testing.object_use_count(x)
    return x

f = tvm.runtime.convert(callback)

def check0():
    """Moving ``x`` into the call transfers ownership of the handle."""
    x = tvm.tir.Var("x", "int32")
    assert tvm.testing.object_use_count(x) == 1
    # Normal call: callee observes count 2 (x plus the argument slot).
    f(x, 2)
    # _move() passes x as an rvalue reference, so the count stays 1.
    y = f(x._move(), 1)
    # After the move the Python-side handle is cleared.
    # Fix: compare to None with ``is`` (PEP 8), not ``==``.
    assert x.handle.value is None

def check1():
    """A move clears the source handle but earlier results stay valid."""
    x = tvm.tir.Var("x", "int32")
    assert tvm.testing.object_use_count(x) == 1
    y = f(x, 2)
    # y holds an extra reference, so the moved call still sees count 2.
    # z is bound deliberately to keep the moved-in result alive.
    z = f(x._move(), 2)
    # Fix: identity comparison with None per PEP 8 (was ``== None``).
    assert x.handle.value is None
    assert y.handle.value is not None

# Exercise both rvalue-reference scenarios.
check0()
check1()

numpy scalar 和 args#

import numpy as np

def test_numpy_scalar():
    """A NumPy int64 scalar at the int64 maximum echoes through unchanged."""
    int64_max = 2**63 - 1
    assert tvm.testing.echo(np.int64(int64_max)) == int64_max


def test_ndarray_args():
    """NDArray arguments are passed by reference (no copy, no view)."""
    def check(arr):
        assert not arr.is_view
        # Inside the callback the argument holds one extra reference.
        assert tvm.testing.object_use_count(arr) == 2

    fcheck = tvm.runtime.convert(check)
    x = tvm.nd.array([1, 2, 3])
    fcheck(x)
    # After the call returns, only the local reference remains.
    assert tvm.testing.object_use_count(x) == 1

dict 函数值类型#

def test_dict_function_value_type():
    """Functions stored as dict values survive conversion into a TVM Map."""
    from tvm import tir  # pylint: disable=import-outside-toplevel

    te_func_dict = {"add": lambda a, b: a + b}

    converted_dict = tvm.runtime.convert(te_func_dict)
    f = converted_dict["add"]
    a = tir.Var("a", "float32")
    b = tir.Var("b", "float32")
    # Calling the converted lambda on TIR vars builds a tir.Add node.
    tvm.ir.assert_structural_equal(f(a, b), tir.Add(a, b))

test_dict_function_value_type()

Hook Python 函数作为 Extern#

下面的例子注册了 python 函数到 TVM 运行时系统,并使用它来完成计算的一个阶段。这使得 TVM 更加灵活。例如,可以插入前端回调来检查中间结果,或者将定制代码与 TVM 混合使用。

import numpy as np
from tvm import te

te.extern?
Signature:
te.extern(
    shape,
    inputs,
    fcompute,
    name='extern',
    dtype=None,
    in_buffers=None,
    out_buffers=None,
    tag='',
    attrs=None,
)
Docstring:
Compute several tensors via an extern function.

Parameters
----------
shape: tuple or list of tuples.
    The shape of the outputs.

inputs: list of Tensor
    The inputs

fcompute: lambda function of inputs, outputs-> stmt
    Specifies the IR statement to do the computation.
    See the following note for function signature of fcompute

    .. note::
         **Parameters**

         - **ins** (list of :any:`tvm.tir.Buffer`) - Placeholder for each inputs
         - **outs** (list of :any:`tvm.tir.Buffer`) - Placeholder for each outputs

         **Returns**

         - **stmt** (:any:`tvm.tir.Stmt`) - The statement that carries out array computation.

name: str, optional
    The name hint of the tensor

dtype: str or list of str, optional
    The data types of outputs,
    by default dtype will be same as inputs.

in_buffers: tvm.tir.Buffer or list of tvm.tir.Buffer, optional
    Input buffers.

out_buffers: tvm.tir.Buffer or list of tvm.tir.Buffer, optional
    Output buffers.


tag: str, optional
    Additional tag information about the compute.

attrs: dict, optional
    The additional auxiliary attributes about the compute.

Returns
-------
tensor: Tensor or list of Tensors
    The created tensor or tuple of tensors contains multiple outputs.

Example
-------
In the code below, C is generated by calling external PackedFunc
`tvm.contrib.cblas.matmul`

.. code-block:: python

    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = te.extern((n, m), [A, B],
                   lambda ins, outs: tvm.tir.call_packed(
                      "tvm.contrib.cblas.matmul",
                        ins[0], ins[1], outs[0], 0, 0), name="C")
File:      /media/pc/data/lxw/ai/tvm/python/tvm/te/operation.py
Type:      function
@tvm.register_func("tvm.contrib.my_tvm_addone")
def my_tvm_addone(x, y):
    """Extern stage: read x, add one, and copy the result into y."""
    print(f"my_tvm_addone signatures: {type(x)}, {type(y)}")
    tvm.nd.array(x.numpy() + 1).copyto(y)


n = 10
dev = tvm.cpu(0)
A = te.placeholder((n,), name="A")
# B is computed by calling the registered Python hook via call_packed.
B = te.extern(
    A.shape,
    [A],
    lambda ins, outs: tvm.tir.call_packed("tvm.contrib.my_tvm_addone",
                                          ins[0], outs[0]),
    name="C",
)
te_func = te.create_prim_func([A, B])
te_func.show()
f = tvm.build(te_func, "llvm")
a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev)
f(a, b)
# The compiled module calls back into Python; verify b == a + 1.
np.testing.assert_allclose(b.numpy(), a.numpy() + 1, rtol=1e-5)
# from tvm.script import tir as T

@T.prim_func
def main(var_A: T.handle, var_C: T.handle):
    T.func_attr({"tir.noalias": T.bool(True)})
    A = T.match_buffer(var_A, (10,), offset_factor=1)
    C = T.match_buffer(var_C, (10,), offset_factor=1)
    with T.block("C"):
        T.reads()
        T.writes()
        T.call_packed("tvm.contrib.my_tvm_addone", T.tvm_stack_make_array(A.data, T.tvm_stack_make_shape(10), 0, 1, T.float32(0), A.elem_offset), T.tvm_stack_make_array(C.data, T.tvm_stack_make_shape(10), 0, 1, T.float32(0), C.elem_offset))
my_tvm_addone signatures: <class 'tvm.runtime.ndarray.NDArray'>, <class 'tvm.runtime.ndarray.NDArray'>

PyTorch 调用 TVM 接口#

参考:在环境中集成现有运行库

DLPack 数据:

import torch
import torch.utils.dlpack
from tvm.contrib.dlpack import to_pytorch_func

# NumPy -> TVM NDArray -> DLPack -> TVM round trip preserves the data.
a = np.random.randn(1337)
tvm_a = tvm.nd.array(a)
np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).numpy(), a)
# PyTorch tensor -> DLPack capsule -> TVM NDArray.
x = torch.rand(56, 56)
tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x))
y = tvm.nd.from_dlpack(tvm_x)
np.testing.assert_equal(x.numpy(), tvm_x.numpy())
np.testing.assert_equal(y.numpy(), tvm_x.numpy())
# And back: TVM NDArray -> DLPack -> PyTorch tensor.
np.testing.assert_equal(
    torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.numpy()
)
def tvm_func(n):
    """Build a TE prim func computing an (n, n) matrix multiply."""
    x_ph = te.placeholder((n, n), name="X")
    y_ph = te.placeholder((n, n), name="Y")
    red = te.reduce_axis((0, n), name="k")
    z = te.compute((n, n), lambda i, j: te.sum(x_ph[i, red] * y_ph[red, j], axis=red))
    return te.create_prim_func([x_ph, y_ph, z])

# Build the 137x137 matmul and show the generated TIR.
te_func = tvm_func(tvm.runtime.convert(137))
te_func.show()
f = tvm.build(te_func, name="f")
# from tvm.script import tir as T

@T.prim_func
def main(X: T.Buffer((137, 137), "float32"), Y: T.Buffer((137, 137), "float32"), compute: T.Buffer((137, 137), "float32")):
    T.func_attr({"tir.noalias": T.bool(True)})
    # with T.block("root"):
    for i, j, k in T.grid(137, 137, 137):
        with T.block("compute"):
            v_i, v_j, v_k = T.axis.remap("SSR", [i, j, k])
            T.reads(X[v_i, v_k], Y[v_k, v_j])
            T.writes(compute[v_i, v_j])
            with T.init():
                compute[v_i, v_j] = T.float32(0)
            compute[v_i, v_j] = compute[v_i, v_j] + X[v_i, v_k] * Y[v_k, v_j]
# Compare TVM's result against PyTorch's own matmul.
xx = torch.rand(137, 137)
yy = torch.rand(137, 137)
zz = xx.mm(yy)
zz2 = torch.empty(137, 137)
# Wrap the compiled TVM function so it accepts torch tensors via DLPack.
f_pytorch = to_pytorch_func(f)
f_pytorch(xx, yy, zz2)
np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-4, atol=1e-4)