Integrating PyTorch with TVM Relax#
These tests verify the following:

- Seamless tensor input/output between PyTorch and the TVM backend
- Cross-function calls among Python, TIR, and Relax functions
- Dynamic addition and execution of Python functions
- End-to-end pipeline execution
- Error handling and edge cases
# Import the test framework and the required libraries
import pytest
import torch
import torch.nn.functional as F
import tvm
from tvm import relax, tir
from tvm.script import ir as I, relax as R, tir as T
from tvm.relax import BasePyModule
import numpy as np
@I.ir_module
class PyTorchIntegrationModule(BasePyModule):
    """Test module exercising the PyTorch/TVM integration."""

    @I.pyfunc
    def main(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        """Main function demonstrating the different call paths and the data flow."""
        n = x.shape[0]  # batch size of the input tensor
        # Call a TIR function to perform the matrix multiplication
        lv = self.call_tir(self.matmul, [x, w], out_sinfo=R.Tensor((n, 20), "float32"))
        # Apply the ReLU activation
        lv1 = F.relu(lv)
        # Call a dynamically added packed function via the call_dps_packed interface
        lv2 = self.call_dps_packed("my_softmax", [lv1, 1], out_sinfo=R.Tensor((n, 20), "float32"))
        # Call a Python function
        lv3 = self.my_identity_func(lv2)
        return lv3

    @T.prim_func
    def matmul(
        var_A: T.handle,
        var_B: T.handle,
        var_C: T.handle,
    ):
        """TIR primitive function implementing matrix multiplication."""
        n = T.int32()  # symbolic variable for the batch size
        # Bind the handles to typed buffers
        A = T.match_buffer(var_A, (n, 16), "float32")
        B = T.match_buffer(var_B, (16, 20), "float32")
        C = T.match_buffer(var_C, (n, 20), "float32")
        # Triple loop nest implementing the matrix multiplication
        for i, j, k in T.grid(n, 20, 16):
            with T.block("block"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

    @I.pyfunc
    def my_identity_func(self, x: torch.Tensor) -> torch.Tensor:
        """Identity function that simply returns its input tensor."""
        return x
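For reference, the computation that `main` expresses is equivalent to the following pure-PyTorch pipeline; a minimal sketch (`reference_main` is a name introduced here for illustration, and it matches the cross-check used in the data-flow test further below):

def reference_main(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # matmul -> ReLU -> softmax(dim=1) -> identity
    return F.softmax(F.relu(torch.matmul(x, w)), dim=1)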
Testing module creation and instantiation#
module = PyTorchIntegrationModule
# Verify the module is callable (supports instantiation)
assert hasattr(module, "__call__"), "module should be callable"
# Instantiate the module on the CPU
device = tvm.cpu(0)
instance = module(device)
# Verify the instance type and the required methods
assert isinstance(instance, BasePyModule), "instance should be a BasePyModule"
required_methods = ["main", "call_tir", "call_dps_packed"]
for method in required_methods:
    assert hasattr(instance, method), f"instance should have method: {method}"
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing module creation and instantiation on the GPU#
module = PyTorchIntegrationModule
if tvm.cuda().exist:
    # Verify the module is callable
    assert hasattr(module, "__call__"), "module should be callable"
    # Instantiate the module on the GPU
    device = tvm.cuda(0)
    instance = module(device)
    # Verify the instance type, required methods, and target device
    assert isinstance(instance, BasePyModule), "instance should be a BasePyModule"
    required_methods = ["main", "call_tir", "call_dps_packed"]
    for method in required_methods:
        assert hasattr(instance, method), f"instance should have method: {method}"
    assert "cuda" in str(instance.target)
else:
    # Skip this test if CUDA is unavailable
    pytest.skip("CUDA not available")
Warning: Failed to compile one or more TIR functions: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Warning: Failed to compile Relax VM: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
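The warnings above arise because the `matmul` loop nest is never bound to CUDA thread axes, so the generated kernel would access `A`, `B`, and `C` from host code; that is exactly what the "Did you forget to bind?" hint points at. Below is a minimal sketch of one way to add the missing binding with the `tir.Schedule` API. `bind_matmul_for_cuda` is a hypothetical helper introduced here, and it assumes the `matmul` PrimFunc is available inside a plain `tvm.IRModule`:

def bind_matmul_for_cuda(mod: tvm.IRModule) -> tvm.IRModule:
    # Sketch only: bind the spatial loops of `matmul` to CUDA thread axes.
    sch = tir.Schedule(mod)
    block = sch.get_block("block", func_name="matmul")
    i, j, k = sch.get_loops(block)
    sch.bind(i, "blockIdx.x")   # one CUDA block per output row
    sch.bind(j, "threadIdx.x")  # one thread per output column
    return sch.mod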
Testing that Python functions execute correctly#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Test the identity function
input_tensor = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32)
result = instance.my_identity_func(input_tensor)
# Verify the result
assert isinstance(result, torch.Tensor)
assert torch.allclose(result, input_tensor, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing that TIR functions execute correctly#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Test the matrix multiplication function
n = 3
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
result = instance.call_tir(instance.matmul, [x, w], R.Tensor((n, 20), "float32"))
# Verify the result type and shape
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
# Check correctness against PyTorch's matrix multiplication
expected = torch.matmul(x, w)
assert torch.allclose(result, expected, atol=1e-3)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
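The same comparison can also be phrased through NumPy, which is what the otherwise unused `np` import above comes in handy for; a minimal sketch, assuming `result`, `x`, and `w` still hold the values from the cell above:

# Equivalent cross-check via NumPy; assert_allclose raises with a diff report on mismatch.
np.testing.assert_allclose(result.numpy(), torch.matmul(x, w).numpy(), atol=1e-3)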
Testing dynamic addition of Python functions#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Define a custom function
def custom_activation(x):
    return torch.sigmoid(x)
# Add the function to the module instance
instance.add_python_function("custom_activation", custom_activation)
# Verify the function has been added
assert hasattr(instance, "custom_activation")
assert "custom_activation" in instance.pyfuncs
# Test executing the function
input_tensor = torch.tensor([1.0, -1.0, 0.0], dtype=torch.float32)
result = instance.custom_activation(input_tensor)
# Verify the result
assert isinstance(result, torch.Tensor)
expected = torch.sigmoid(input_tensor)
assert torch.allclose(result, expected, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
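The same registration mechanism works for any callable. For instance, a hypothetical `leaky_activation` helper could be attached the same way; a minimal sketch using only the `add_python_function` / `pyfuncs` surface exercised above:

def leaky_activation(x):
    # Standard leaky ReLU from torch.nn.functional
    return F.leaky_relu(x, negative_slope=0.1)
instance.add_python_function("leaky_activation", leaky_activation)
assert "leaky_activation" in instance.pyfuncs
out = instance.leaky_activation(torch.tensor([-1.0, 2.0]))
assert torch.allclose(out, F.leaky_relu(torch.tensor([-1.0, 2.0]), negative_slope=0.1))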
Testing calls to dynamically added functions via call_dps_packed#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Define a softmax function
def my_softmax(tensor, dim):
    """Custom softmax used to exercise call_dps_packed."""
    # Convert a TVM NDArray to a PyTorch tensor when necessary
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
# Attach the function to the instance
instance.my_softmax = my_softmax
# Test the call_dps_packed invocation
input_tensor = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=torch.float32)
result = instance.call_dps_packed(
    "my_softmax", [input_tensor, 1], R.Tensor((2, 2), "float32")
)
# Verify the result
assert isinstance(result, torch.Tensor)
expected = F.softmax(input_tensor, dim=1)
assert torch.allclose(result, expected, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
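The numpy round-trip in `my_softmax` copies the data. A zero-copy alternative is the DLPack protocol, which PyTorch supports via `torch.from_dlpack`; a minimal sketch, assuming the TVM build at hand exposes `__dlpack__` on its NDArray type:

def my_softmax_dlpack(tensor, dim):
    # Convert via DLPack instead of a numpy copy when the input is not already a torch.Tensor.
    if not isinstance(tensor, torch.Tensor):
        tensor = torch.from_dlpack(tensor)
    return F.softmax(tensor, dim=dim)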
Testing the end-to-end execution pipeline#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required softmax function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Create the test data
n = 5
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
# Run the full pipeline
result = instance.main(x, w)
# Verify the result
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
assert result.dtype == torch.float32
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
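Beyond the shape and dtype checks, the output can also be compared numerically against the equivalent pure-PyTorch pipeline, which is the same comparison the data-flow test below performs step by step; a minimal sketch reusing `result`, `x`, and `w` from the cell above:

# Reference computation: matmul -> ReLU -> softmax along dim=1
expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
assert torch.allclose(result, expected, atol=1e-3)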
Testing the end-to-end execution pipeline on the GPU#
module = PyTorchIntegrationModule
if tvm.cuda().exist:
    device = tvm.cuda(0)
    instance = module(device)
    # Test basic GPU functionality (without the more involved TIR operations)
    assert isinstance(instance, BasePyModule)
    assert "cuda" in str(instance.target)
    # Test creating and using GPU tensors
    n = 5
    x = torch.randn(n, 16, dtype=torch.float32, device="cuda")
    w = torch.randn(16, 20, dtype=torch.float32, device="cuda")
    # Verify tensor devices and shapes
    assert x.device.type == "cuda"
    assert w.device.type == "cuda"
    assert x.shape == (n, 16)
    assert w.shape == (16, 20)
    # Test a basic PyTorch operation on the GPU
    result = torch.matmul(x, w)
    assert isinstance(result, torch.Tensor)
    assert result.shape == (n, 20)
    assert result.dtype == torch.float32
    assert result.device.type == "cuda"
else:
    # Skip this test if CUDA is unavailable
    pytest.skip("CUDA not available")
Warning: Failed to compile one or more TIR functions: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Warning: Failed to compile Relax VM: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Testing data flow across the different function types#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Create the test data
n = 4
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
# Execute step by step to verify the data flow
# Step 1: TIR matrix multiplication
lv = instance.call_tir(instance.matmul, [x, w], R.Tensor((n, 20), "float32"))
assert isinstance(lv, torch.Tensor)
assert lv.shape == (n, 20)
# Step 2: ReLU activation
lv1 = F.relu(lv)
assert isinstance(lv1, torch.Tensor)
assert lv1.shape == (n, 20)
# Step 3: softmax applied through call_dps_packed
lv2 = instance.call_dps_packed("my_softmax", [lv1, 1], R.Tensor((n, 20), "float32"))
assert isinstance(lv2, torch.Tensor)
assert lv2.shape == (n, 20)
# Step 4: identity function
lv3 = instance.my_identity_func(lv2)
assert isinstance(lv3, torch.Tensor)
assert lv3.shape == (n, 20)
# Verify the final result matches the expected computation
expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
assert torch.allclose(lv3, expected, atol=1e-3)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing error handling in edge cases#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Calling a function that does not exist
with pytest.raises(Exception):
    instance.call_dps_packed(
        "non_existent_function", [torch.tensor([1.0])], R.Tensor((1,), "float32")
    )
# Passing a tensor with the wrong shape
x = torch.randn(3, 16, dtype=torch.float32)
w = torch.randn(15, 20, dtype=torch.float32)  # wrong shape
with pytest.raises(Exception):
    instance.call_tir(instance.matmul, [x, w], R.Tensor((3, 20), "float32"))
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing tensor dtype preservation throughout execution#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Test the float32 dtype (the TIR function is hard-coded to float32)
test_dtype = torch.float32
n = 3
x = torch.randn(n, 16, dtype=test_dtype)
w = torch.randn(16, 20, dtype=test_dtype)
# Run the full pipeline
result = instance.main(x, w)
# Verify the dtype is preserved
assert result.dtype == test_dtype
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
assert result.dtype == torch.float32
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing batch processing of multiple inputs#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Process multiple inputs
batch_size = 5
results = []
for i in range(batch_size):
    n = 3 + i  # varying batch size
    x = torch.randn(n, 16, dtype=torch.float32)
    w = torch.randn(16, 20, dtype=torch.float32)
    result = instance.main(x, w)
    results.append(result)
    assert isinstance(result, torch.Tensor)
    assert result.shape == (n, 20)
# Verify all results are valid
assert len(results) == batch_size
for result in results:
    assert isinstance(result, torch.Tensor)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
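Because `matmul` declares its batch dimension as the symbolic `n = T.int32()`, a single kernel serves all of the varying batch sizes above. A minimal sketch that additionally checks each batched result numerically (`inputs_and_results` is a name introduced here so the inputs stay paired with their outputs):

inputs_and_results = []
for i in range(batch_size):
    n = 3 + i
    x = torch.randn(n, 16, dtype=torch.float32)
    w = torch.randn(16, 20, dtype=torch.float32)
    inputs_and_results.append((x, w, instance.main(x, w)))
# Compare each output to the pure-PyTorch reference pipeline
for x, w, result in inputs_and_results:
    expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
    assert torch.allclose(result, expected, atol=1e-3)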