# %%shell
# # Installs the latest dev build of TVM from PyPI. If you wish to build
# # from source, see https://tvm.apache.org/docs/install/from_source.html
# pip install apache-tvm --pre
在 Relay 中使用管道执行器
原作者: Hua Jiang
这是关于如何在 Relay 中使用“管道执行器”(Pipeline Executor)的简短教程。
import tvm
from tvm import te
import numpy as np
from tvm.contrib import graph_executor as runtime
from tvm.relay.op.contrib.cutlass import partition_for_cutlass
from tvm import relay
from tvm.relay import testing
from tvm.contrib.cutlass import finalize_modules
# Spatial height and width of the square input tensor; get_network() uses it for
# the "data" shape and the dense weight width.
img_size = 8
创建简单网络,可以是预训练的模型
创建非常简单的网络进行演示。它由卷积、batch normalization、dense 和 ReLU 激活组成。
def get_network():
    """Build a small float16 Relay network used for the demonstration.

    The network is conv2d (3x3 kernel, padding 1) -> batch_norm -> relu ->
    batch_flatten -> dense. All spatial sizes come from the module-level
    ``img_size``.

    Returns
    -------
    net
        The module returned by ``testing.create_workload``.
    params
        The parameters returned by ``testing.create_workload``.
    data_shape : tuple
        Shape of the "data" input: ``(batch_size, 3, img_size, img_size)``.
    """
    out_channels = 16
    batch_size = 1
    # Single source of truth for the input shape (used for the var and returned).
    data_shape = (batch_size, 3, img_size, img_size)
    data = relay.var("data", relay.TensorType(data_shape, "float16"))
    # The flattened conv output has out_channels * H * W features; derive the
    # dense weight width from out_channels instead of repeating the literal 16.
    # NOTE(review): the first dimension reuses batch_size (1), which yields the
    # (1, 1) dense output -- confirm this is intended.
    dense_weight = relay.var(
        "dweight",
        relay.TensorType((batch_size, out_channels * img_size * img_size), "float16"),
    )
    # The remaining parameters are left untyped here.
    weight = relay.var("weight")
    bn_gamma = relay.var("bn_gamma")
    bn_beta = relay.var("bn_beta")
    bn_mean = relay.var("bn_mean")
    bn_var = relay.var("bn_var")

    simple_net = relay.nn.conv2d(
        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
    )
    # batch_norm returns a tuple; only its first element is used downstream.
    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mean, bn_var)[0]
    simple_net = relay.nn.relu(simple_net)
    simple_net = relay.nn.batch_flatten(simple_net)
    simple_net = relay.nn.dense(simple_net, dense_weight)
    # Wrap the expression into a function whose parameters are its free variables.
    simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)
    net, params = testing.create_workload(simple_net)
    return net, params, data_shape
# Build the demo network once; net, params and data_shape are reused by the later steps.
net, params, data_shape = get_network()
将网络分成两个子图
单元测试中的 ‘graph_split’ 函数只是一个例子。用户可以创建自定义的逻辑来分割计算图。
import inspect
import os
import sys

# Locate this tutorial's directory via the file that defines a throwaway lambda
# (presumably because __file__ may not be available in this execution context).
tutorial_dir = os.path.dirname(inspect.getfile(lambda: None))
# Make the Relay unit-test directory importable so graph_split can be reused.
# Use the public ``sys`` module directly instead of the undocumented ``os.sys`` alias.
sys.path.append(os.path.join(tutorial_dir, "../../../tests/python/relay"))
from test_pipeline_executor import graph_split
将网络分成两个子图。
# One split point: the "nn.relu" operator with op_index 0
# (presumably the first nn.relu in the network -- confirm against graph_split).
split_config = [{"op_name": "nn.relu", "op_index": 0}]
# Split the "main" function of the network; the result is indexed below as
# subgraphs[0] and subgraphs[1].
subgraphs = graph_split(net["main"], split_config, params)
生成的子图应该如下所示。
# Bare expression: in a notebook cell this displays the list of subgraphs.
subgraphs
[def @main(%data: Tensor[(1, 3, 8, 8), float16] /* ty=Tensor[(1, 3, 8, 8), float16] */) {
%0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 8, 8), float16] */;
%1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16] */, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1, 16, 8, 8), float16], Tensor[(16), float16], Tensor[(16), float16]) */;
%2 = %1.0 /* ty=Tensor[(1, 16, 8, 8), float16] */;
nn.relu(%2) /* ty=Tensor[(1, 16, 8, 8), float16] */
}
,
def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
%0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
}
]
使用 cutlass target 构建子图
# The "sm" value is the integer after the underscore in the default CUDA
# target's arch string.
_cuda_arch = tvm.target.Target("cuda").arch
_cutlass_options = dict(
    kind="cutlass",
    sm=int(_cuda_arch.split("_")[1]),
    use_3xtf32=True,
    split_k_slices=[1],
    profile_all_alignments=False,
    find_first_valid=True,
    use_multiprocessing=True,
    use_fast_math=False,
    tmp_dir="./tmp",
)
# CUTLASS target with an llvm host; cutlass_build() and the verification step use it.
cutlass = tvm.target.Target(_cutlass_options, host=tvm.target.Target("llvm"))
def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
    """Build ``mod`` for ``target`` combined with the module-level ``cutlass`` target.

    Parameters
    ----------
    mod
        The module to build.
    target
        The primary target; it is paired with ``cutlass`` as ``[target, cutlass]``.
    params
        Forwarded unchanged to ``relay.build_module.build``.
    target_host
        Forwarded unchanged to ``relay.build_module.build``.
    mod_name : str
        Forwarded unchanged to ``relay.build_module.build``.

    Returns
    -------
    The library returned by ``relay.build_module.build``.
    """
    return relay.build_module.build(
        mod,
        target=[target, cutlass],
        params=params,
        target_host=target_host,
        mod_name=mod_name,
    )
使用管道执行器在管道中运行两个子图
在 cmake 中将 USE_PIPELINE_EXECUTOR 设置为 ON,并将 USE_CUTLASS 设置为 ON。
from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
创建子图管道配置。
将子图模块与目标关联。
使用 CUTLASS BYOC 构建第二个子图模块。
# The first subgraph is used as produced by the split.
mod0 = subgraphs[0]
# The second subgraph is partitioned so that cutlass is used as its codegen.
mod1 = partition_for_cutlass(subgraphs[1])
获取管道执行器配置对象。
# Pipeline configuration object; per-module settings and connections are attached to it below.
pipe_config = pipeline_executor_build.PipelineConfig()
设置第一个子图模块的编译目标为 llvm,并在 CPU 上运行。
# mod0 is compiled with the "llvm" target and assigned to CPU device 0.
pipe_config[mod0].target = "llvm"
pipe_config[mod0].dev = tvm.cpu(0)
设置第二个子图模块的编译目标为cuda。
# mod1 is compiled with the "cuda" target and assigned to CUDA device 0.
pipe_config[mod1].target = "cuda"
pipe_config[mod1].dev = tvm.device("cuda", 0)
# Build mod1 with the custom cutlass_build function instead of the default build.
pipe_config[mod1].build_func = cutlass_build
# NOTE(review): presumably selects nvcc as the compiler when exporting this module -- confirm.
pipe_config[mod1].export_cc = "nvcc"
# Create the pipeline by connecting the subgraph modules.
# The global input "data" is forwarded to the "data" input interface of the first module, mod0.
pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
# The first output of mod0 is forwarded to the "data_n_0" input interface of mod1.
pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
# The first output of mod1 becomes the first global output.
pipe_config[mod1]["output"][0].connect(pipe_config["output"][0])
管道配置如下。
# No-op string expression kept for reference: it shows a print(pipe_config) call
# followed by what appears to be its printed output.
"""
print(pipe_config)
Inputs
|data: mod0:data
output
|output(0) : mod1.output(0)
connections
|mod0.output(0)-> mod1.data_n_0
"""
构建管道执行器
# Build the configured pipeline inside an opt_level=3 pass context.
_build_ctx = tvm.transform.PassContext(opt_level=3)
with _build_ctx:
    pipeline_mod_factory = pipeline_executor_build.build(pipe_config)
将参数配置导出为文件。
# Keep a reference to the temporary-directory object. The original discarded it
# immediately, so the directory could be removed as soon as the object was
# garbage-collected and had to be recreated by hand.
export_dir = tvm.contrib.utils.tempdir()
directory_path = export_dir.temp_dir
# Defensive no-op when the directory already exists.
os.makedirs(directory_path, exist_ok=True)
# Export the built pipeline into the directory; the returned value is the
# configuration file name that load_library() takes.
config_file_name = pipeline_mod_factory.export_library(directory_path)
使用 load 函数创建并初始化 PipelineModule
# Create the PipelineModule from the exported configuration file.
pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name)
运行 pipeline executor
分配输入数据。
# Random float16 input of shape data_shape, drawn uniformly from [-1, 1).
data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
# Bind it to the global pipeline input named "data".
pipeline_module.set_input("data", tvm.nd.array(data))
以管道模式运行两个子图,可以异步或同步地获取输出。下面的示例采用同步方式。
# Run the pipeline, then fetch its outputs; outputs[0] is compared against the
# graph_executor result at the end.
pipeline_module.run()
outputs = pipeline_module.get_output()
使用 graph_executor 进行验证
使用 graph_executor 按顺序运行这两个子图以获得输出。
# Reference run: execute the two subgraphs one after the other with graph_executor
# (imported as ``runtime``).

# Build and instantiate the first subgraph for llvm on device 0.
target = "llvm"
dev0 = tvm.device(target, 0)
lib0 = relay.build_module.build(mod0, target, params=params)
module0 = runtime.GraphModule(lib0["default"](dev0))
# Build the second subgraph for CUDA (llvm host) together with the cutlass target.
cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
# Finalize the CUTLASS build; "./tmp" matches the tmp_dir given to the cutlass target.
lib1 = finalize_modules(lib1, "compile.so", "./tmp")
dev1 = tvm.device("cuda", 0)
module1 = runtime.GraphModule(lib1["default"](dev1))
# Run the first subgraph on the same input that was given to the pipeline.
module0.set_input("data", data)
module0.run()
# Output shape of the first subgraph: (1, 16, img_size, img_size).
out_shape = (1, 16, img_size, img_size)
out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
# Feed the first subgraph's output into the second subgraph's "data_n_0" input.
module1.set_input("data_n_0", out)
module1.run()
# Output shape of the second subgraph: (1, 1). ``out`` is rebound to the final result.
out_shape = (1, 1)
out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
验证结果:
# The first pipeline output must match the sequential graph_executor result
# (assert_allclose with its default tolerances).
np.testing.assert_allclose(outputs[0].numpy(), out.numpy())