# 复合路径
import numpy as np
import tvm
from tvm import relax
from tvm.relax.backend.cuda.cublas import partition_for_cublas
from tvm.relax.backend.cuda.cutlass import partition_for_cutlass
from tvm.relax.dpl.pattern import (
is_op,
is_tuple_get_item,
make_fused_bias_activation_pattern,
wildcard,
)
from tvm.relax.transform import PatternCheckContext
from tvm.script import ir as I
from tvm.script import relax as R
from tvm.script import tir as T
创建简单的计算图,只包含一个乘法算子。
# Build a minimal Relax module whose main(x, y) computes x * y
# on 10x10 float32 tensors, then print it.
tensor_sinfo = relax.TensorStructInfo([10, 10], "float32")
x = relax.Var("x", tensor_sinfo)
y = relax.Var("y", tensor_sinfo)

builder = relax.BlockBuilder()
with builder.function("main", [x, y]):
    with builder.dataflow():
        product = builder.emit(relax.op.multiply(x, y))
        out = builder.emit_output(product)
    builder.emit_func_output(out)

# Canonicalize trivial re-bindings (e.g. gv = lv0) before inspection.
mod = relax.transform.CanonicalizeBindings()(builder.get())
mod.show()
# from tvm.script import ir as I
# from tvm.script import relax as R
# Printed output of `mod.show()`: after CanonicalizeBindings the module
# contains a single dataflow block binding `gv` directly to x * y.
@I.ir_module
class Module:
    @R.function
    def main(x: R.Tensor((10, 10), dtype="float32"), y: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
        with R.dataflow():
            gv: R.Tensor((10, 10), dtype="float32") = R.multiply(x, y)
            R.output(gv)
        return gv
当前有两种 BYOC (自带代码生成)的实现路径:
路径1:
[FuseOpsByPattern(patterns, annotate_codegen=True), RunCodegen()]
路径2:
[FuseOpsByPattern(patterns, annotate_codegen=False), MergeCompositeFunctions(), RunCodegen()]
为了保持一致性,两条路径都应以 RunCodegen() 结尾,对外提供相同的接口。
关键点说明:
两种路径最终功能等效但实现方式不同
路径1直接生成带代码生成属性的融合函数
路径2先进行基础融合,再通过
MergeCompositeFunctions
添加代码生成属性
# Path 1: a single FuseOpsByPattern pass both fuses the matched ops and
# attaches the "Codegen" attribute (annotate_codegen=True).
multiply_pattern = is_op("relax.multiply")(wildcard(), wildcard())
patterns = [("cutlass.multiply", multiply_pattern)]

fuse_and_annotate = relax.transform.FuseOpsByPattern(
    patterns, bind_constants=True, annotate_codegen=True
)
mod1 = fuse_and_annotate(mod)
assert tvm.relax.analysis.well_formed(mod1)
mod1.show()
# from tvm.script import ir as I
# from tvm.script import relax as R
# Printed output of `mod1.show()` (path 1): the fused function carries
# {"Codegen": "cutlass"} and wraps an inner local function carrying
# {"Composite": "cutlass.multiply"}; main calls the fused function.
@I.ir_module
class Module:
    @R.function
    def fused_relax_multiply_cutlass(x: R.Tensor((10, 10), dtype="float32"), y: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
        R.func_attr({"Codegen": "cutlass"})
        # from tvm.script import relax as R

        @R.function
        def local_func(x_1: R.Tensor((10, 10), dtype="float32"), y_1: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
            R.func_attr({"Composite": "cutlass.multiply"})
            with R.dataflow():
                gv: R.Tensor((10, 10), dtype="float32") = R.multiply(x_1, y_1)
                R.output(gv)
            return gv

        output: R.Tensor((10, 10), dtype="float32") = local_func(x, y)
        return output

    @R.function
    def main(x: R.Tensor((10, 10), dtype="float32"), y: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
        cls = Module
        with R.dataflow():
            gv: R.Tensor((10, 10), dtype="float32") = cls.fused_relax_multiply_cutlass(x, y)
            R.output(gv)
        return gv
路径2:
# Path 2: fuse first without annotation (annotate_codegen=False), then let
# MergeCompositeFunctions attach the "Codegen" attribute in a second pass.
fuse_only = relax.transform.FuseOpsByPattern(
    patterns, bind_constants=True, annotate_codegen=False
)
mod2 = relax.transform.MergeCompositeFunctions()(fuse_only(mod))
assert tvm.relax.analysis.well_formed(mod2)
mod2.show()
# from tvm.script import ir as I
# from tvm.script import relax as R
# Printed output of `mod2.show()` (path 2): structurally equivalent to the
# path-1 result — only the generated names differ (fused_relax_multiply1_cutlass,
# inner function named `gv`), confirming the two paths are functionally the same.
@I.ir_module
class Module:
    @R.function
    def fused_relax_multiply1_cutlass(x: R.Tensor((10, 10), dtype="float32"), y: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
        R.func_attr({"Codegen": "cutlass"})
        # from tvm.script import relax as R

        @R.function
        def gv(x_1: R.Tensor((10, 10), dtype="float32"), y_1: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
            R.func_attr({"Composite": "cutlass.multiply"})
            with R.dataflow():
                gv_1: R.Tensor((10, 10), dtype="float32") = R.multiply(x_1, y_1)
                R.output(gv_1)
            return gv_1

        gv_1: R.Tensor((10, 10), dtype="float32") = gv(x, y)
        return gv_1

    @R.function
    def main(x: R.Tensor((10, 10), dtype="float32"), y: R.Tensor((10, 10), dtype="float32")) -> R.Tensor((10, 10), dtype="float32"):
        cls = Module
        with R.dataflow():
            gv: R.Tensor((10, 10), dtype="float32") = cls.fused_relax_multiply1_cutlass(x, y)
            R.output(gv)
        return gv