Integrating PyTorch with TVM Relax#
These tests verify the following:

- Seamless tensor input/output between PyTorch and the TVM backend
- Cross-function calls among Python, TIR, and Relax functions
- Dynamic addition and execution of Python functions
- End-to-end pipeline execution
- Error handling and edge cases
# Import the test framework and the required libraries
import pytest
import torch
import torch.nn.functional as F
import tvm
from tvm import relax, tir
from tvm.script import ir as I, relax as R, tir as T
from tvm.relax import BasePyModule
import numpy as np
@I.ir_module
class PyTorchIntegrationModule(BasePyModule):
    """Test module exercising the PyTorch/TVM integration."""

    @I.pyfunc
    def main(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        """Main function demonstrating the different call paths and the data flow."""
        n = x.shape[0]  # batch size of the input tensor
        # Call a TIR function to perform the matrix multiplication
        lv = self.call_tir(self.matmul, [x, w], out_sinfo=R.Tensor((n, 20), "float32"))
        # Apply the ReLU activation
        lv1 = F.relu(lv)
        # Call a dynamically added packed function via the call_dps_packed interface
        lv2 = self.call_dps_packed("my_softmax", [lv1, 1], out_sinfo=R.Tensor((n, 20), "float32"))
        # Call a Python function
        lv3 = self.my_identity_func(lv2)
        return lv3

    @T.prim_func
    def matmul(
        var_A: T.handle,
        var_B: T.handle,
        var_C: T.handle,
    ):
        """TIR primitive function implementing matrix multiplication."""
        n = T.int32()  # symbolic variable for the batch size
        # Bind the handles to typed buffers
        A = T.match_buffer(var_A, (n, 16), "float32")
        B = T.match_buffer(var_B, (16, 20), "float32")
        C = T.match_buffer(var_C, (n, 20), "float32")
        # Triple loop nest implementing the matrix multiplication
        for i, j, k in T.grid(n, 20, 16):
            with T.block("block"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

    @I.pyfunc
    def my_identity_func(self, x: torch.Tensor) -> torch.Tensor:
        """Identity function that simply returns its input tensor."""
        return x
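For reference, the computation that `main` expresses is equivalent to the following pure-PyTorch pipeline; a minimal sketch (`reference_main` is a name introduced here for illustration, and it matches the cross-check used in the data-flow test further below):

def reference_main(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # matmul -> ReLU -> softmax(dim=1) -> identity
    return F.softmax(F.relu(torch.matmul(x, w)), dim=1)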
Testing module creation and instantiation#
module = PyTorchIntegrationModule
# Verify the module is callable (supports instantiation)
assert hasattr(module, "__call__"), "module should be callable"
# Instantiate the module on the CPU
device = tvm.cpu(0)
instance = module(device)
# Verify the instance type and the required methods
assert isinstance(instance, BasePyModule), "instance should be a BasePyModule"
required_methods = ["main", "call_tir", "call_dps_packed"]
for method in required_methods:
    assert hasattr(instance, method), f"instance should have method: {method}"
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing module creation and instantiation on the GPU#
module = PyTorchIntegrationModule
if tvm.cuda().exist:
    # Verify the module is callable
    assert hasattr(module, "__call__"), "module should be callable"
    # Instantiate the module on the GPU
    device = tvm.cuda(0)
    instance = module(device)
    # Verify the instance type, required methods, and target device
    assert isinstance(instance, BasePyModule), "instance should be a BasePyModule"
    required_methods = ["main", "call_tir", "call_dps_packed"]
    for method in required_methods:
        assert hasattr(instance, method), f"instance should have method: {method}"
    assert "cuda" in str(instance.target)
else:
    # Skip this test if CUDA is unavailable
    pytest.skip("CUDA not available")
Warning: Failed to compile one or more TIR functions: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Warning: Failed to compile Relax VM: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
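The warnings above arise because the `matmul` loop nest is never bound to CUDA thread axes, so the generated kernel would access `A`, `B`, and `C` from host code; that is exactly what the "Did you forget to bind?" hint points at. Below is a minimal sketch of one way to add the missing binding with the `tir.Schedule` API. `bind_matmul_for_cuda` is a hypothetical helper introduced here, and it assumes the `matmul` PrimFunc is available inside a plain `tvm.IRModule`:

def bind_matmul_for_cuda(mod: tvm.IRModule) -> tvm.IRModule:
    # Sketch only: bind the spatial loops of `matmul` to CUDA thread axes.
    sch = tir.Schedule(mod)
    block = sch.get_block("block", func_name="matmul")
    i, j, k = sch.get_loops(block)
    sch.bind(i, "blockIdx.x")   # one CUDA block per output row
    sch.bind(j, "threadIdx.x")  # one thread per output column
    return sch.mod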
Testing that Python functions execute correctly#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Test the identity function
input_tensor = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32)
result = instance.my_identity_func(input_tensor)
# Verify the result
assert isinstance(result, torch.Tensor)
assert torch.allclose(result, input_tensor, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing that TIR functions execute correctly#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Test the matrix multiplication function
n = 3
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
result = instance.call_tir(instance.matmul, [x, w], R.Tensor((n, 20), "float32"))
# Verify the result type and shape
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
# Check correctness against PyTorch's matrix multiplication
expected = torch.matmul(x, w)
assert torch.allclose(result, expected, atol=1e-3)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
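The same comparison can also be phrased through NumPy, which is what the otherwise unused `np` import above comes in handy for; a minimal sketch, assuming `result`, `x`, and `w` still hold the values from the cell above:

# Equivalent cross-check via NumPy; assert_allclose raises with a diff report on mismatch.
np.testing.assert_allclose(result.numpy(), torch.matmul(x, w).numpy(), atol=1e-3)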
Testing dynamic addition of Python functions#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Define a custom function
def custom_activation(x):
    return torch.sigmoid(x)
# Add the function to the module instance
instance.add_python_function("custom_activation", custom_activation)
# Verify the function has been added
assert hasattr(instance, "custom_activation")
assert "custom_activation" in instance.pyfuncs
# Test executing the function
input_tensor = torch.tensor([1.0, -1.0, 0.0], dtype=torch.float32)
result = instance.custom_activation(input_tensor)
# Verify the result
assert isinstance(result, torch.Tensor)
expected = torch.sigmoid(input_tensor)
assert torch.allclose(result, expected, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
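The same registration mechanism works for any callable. For instance, a hypothetical `leaky_activation` helper could be attached the same way; a minimal sketch using only the `add_python_function` / `pyfuncs` surface exercised above:

def leaky_activation(x):
    # Standard leaky ReLU from torch.nn.functional
    return F.leaky_relu(x, negative_slope=0.1)
instance.add_python_function("leaky_activation", leaky_activation)
assert "leaky_activation" in instance.pyfuncs
out = instance.leaky_activation(torch.tensor([-1.0, 2.0]))
assert torch.allclose(out, F.leaky_relu(torch.tensor([-1.0, 2.0]), negative_slope=0.1))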
Testing calls to dynamically added functions via call_dps_packed#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Define a softmax function
def my_softmax(tensor, dim):
    """Custom softmax used to exercise call_dps_packed."""
    # Convert a TVM NDArray to a PyTorch tensor when necessary
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
# Attach the function to the instance
instance.my_softmax = my_softmax
# Test the call_dps_packed invocation
input_tensor = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=torch.float32)
result = instance.call_dps_packed(
    "my_softmax", [input_tensor, 1], R.Tensor((2, 2), "float32")
)
# Verify the result
assert isinstance(result, torch.Tensor)
expected = F.softmax(input_tensor, dim=1)
assert torch.allclose(result, expected, atol=1e-5)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
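The numpy round-trip in `my_softmax` copies the data. A zero-copy alternative is the DLPack protocol, which PyTorch supports via `torch.from_dlpack`; a minimal sketch, assuming the TVM build at hand exposes `__dlpack__` on its NDArray type:

def my_softmax_dlpack(tensor, dim):
    # Convert via DLPack instead of a numpy copy when the input is not already a torch.Tensor.
    if not isinstance(tensor, torch.Tensor):
        tensor = torch.from_dlpack(tensor)
    return F.softmax(tensor, dim=dim)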
Testing the end-to-end execution pipeline#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required softmax function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Create the test data
n = 5
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
# Run the full pipeline
result = instance.main(x, w)
# Verify the result
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
assert result.dtype == torch.float32
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
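Beyond the shape and dtype checks, the output can also be compared numerically against the equivalent pure-PyTorch pipeline, which is the same comparison the data-flow test below performs step by step; a minimal sketch reusing `result`, `x`, and `w` from the cell above:

# Reference computation: matmul -> ReLU -> softmax along dim=1
expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
assert torch.allclose(result, expected, atol=1e-3)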
Testing the end-to-end execution pipeline on the GPU#
module = PyTorchIntegrationModule
if tvm.cuda().exist:
    device = tvm.cuda(0)
    instance = module(device)
    # Test basic GPU functionality (without the more involved TIR operations)
    assert isinstance(instance, BasePyModule)
    assert "cuda" in str(instance.target)
    # Test creating and using GPU tensors
    n = 5
    x = torch.randn(n, 16, dtype=torch.float32, device="cuda")
    w = torch.randn(16, 20, dtype=torch.float32, device="cuda")
    # Verify tensor devices and shapes
    assert x.device.type == "cuda"
    assert w.device.type == "cuda"
    assert x.shape == (n, 16)
    assert w.shape == (16, 20)
    # Test a basic PyTorch operation on the GPU
    result = torch.matmul(x, w)
    assert isinstance(result, torch.Tensor)
    assert result.shape == (n, 20)
    assert result.dtype == torch.float32
    assert result.device.type == "cuda"
else:
    # Skip this test if CUDA is unavailable
    pytest.skip("CUDA not available")
Warning: Failed to compile one or more TIR functions: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Warning: Failed to compile Relax VM: Memory verification failed with the following errors:
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `C` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `A` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable `B` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Did you forget to bind?
# from tvm.script import tir as T

@T.prim_func
def matmul(var_A: T.handle, B: T.Buffer((16, 20), "float32"), var_C: T.handle):
    T.func_attr({"target": T.target({"arch": "sm_86", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32})})
    n = T.int32()
    A = T.match_buffer(var_A, (n, 16))
    C = T.match_buffer(var_C, (n, 20))
    for i, j, k in T.grid(n, 20, 16):
        cse_v1: T.int32 = i * 20 + j
        C_1 = T.Buffer((n * 20,), data=C.data)
        if k == 0:
            C_1[cse_v1] = T.float32(0.0)
        A_1 = T.Buffer((n * 16,), data=A.data)
        B_1 = T.Buffer((320,), data=B.data)
        C_1[cse_v1] = C_1[cse_v1] + A_1[i * 16 + k] * B_1[k * 20 + j]
Testing data flow across the different function types#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Create the test data
n = 4
x = torch.randn(n, 16, dtype=torch.float32)
w = torch.randn(16, 20, dtype=torch.float32)
# Execute step by step to verify the data flow
# Step 1: TIR matrix multiplication
lv = instance.call_tir(instance.matmul, [x, w], R.Tensor((n, 20), "float32"))
assert isinstance(lv, torch.Tensor)
assert lv.shape == (n, 20)
# Step 2: ReLU activation
lv1 = F.relu(lv)
assert isinstance(lv1, torch.Tensor)
assert lv1.shape == (n, 20)
# Step 3: softmax applied through call_dps_packed
lv2 = instance.call_dps_packed("my_softmax", [lv1, 1], R.Tensor((n, 20), "float32"))
assert isinstance(lv2, torch.Tensor)
assert lv2.shape == (n, 20)
# Step 4: identity function
lv3 = instance.my_identity_func(lv2)
assert isinstance(lv3, torch.Tensor)
assert lv3.shape == (n, 20)
# Verify the final result matches the expected computation
expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
assert torch.allclose(lv3, expected, atol=1e-3)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing error handling in edge cases#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Calling a function that does not exist
with pytest.raises(Exception):
    instance.call_dps_packed(
        "non_existent_function", [torch.tensor([1.0])], R.Tensor((1,), "float32")
    )
# Passing a tensor with the wrong shape
x = torch.randn(3, 16, dtype=torch.float32)
w = torch.randn(15, 20, dtype=torch.float32)  # wrong shape
with pytest.raises(Exception):
    instance.call_tir(instance.matmul, [x, w], R.Tensor((3, 20), "float32"))
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing tensor dtype preservation throughout execution#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Test the float32 dtype (the TIR function is hard-coded to float32)
test_dtype = torch.float32
n = 3
x = torch.randn(n, 16, dtype=test_dtype)
w = torch.randn(16, 20, dtype=test_dtype)
# Run the full pipeline
result = instance.main(x, w)
# Verify the dtype is preserved
assert result.dtype == test_dtype
assert isinstance(result, torch.Tensor)
assert result.shape == (n, 20)
assert result.dtype == torch.float32
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
Testing batch processing of multiple inputs#
module = PyTorchIntegrationModule
device = tvm.cpu(0)
instance = module(device)
# Add the required function
def my_softmax(tensor, dim):
    if hasattr(tensor, "numpy"):
        tensor = torch.from_numpy(tensor.numpy())
    return F.softmax(tensor, dim=dim)
instance.my_softmax = my_softmax
# Process multiple inputs
batch_size = 5
results = []
for i in range(batch_size):
    n = 3 + i  # varying batch size
    x = torch.randn(n, 16, dtype=torch.float32)
    w = torch.randn(16, 20, dtype=torch.float32)
    result = instance.main(x, w)
    results.append(result)
    assert isinstance(result, torch.Tensor)
    assert result.shape == (n, 20)
# Verify all results are valid
assert len(results) == batch_size
for result in results:
    assert isinstance(result, torch.Tensor)
Warning: Failed to compile Relax VM: 'NoneType' object has no attribute 'kind'
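Because `matmul` declares its batch dimension as the symbolic `n = T.int32()`, a single kernel serves all of the varying batch sizes above. A minimal sketch that additionally checks each batched result numerically (`inputs_and_results` is a name introduced here so the inputs stay paired with their outputs):

inputs_and_results = []
for i in range(batch_size):
    n = 3 + i
    x = torch.randn(n, 16, dtype=torch.float32)
    w = torch.randn(16, 20, dtype=torch.float32)
    inputs_and_results.append((x, w, instance.main(x, w)))
# Compare each output to the pure-PyTorch reference pipeline
for x, w, result in inputs_and_results:
    expected = F.softmax(F.relu(torch.matmul(x, w)), dim=1)
    assert torch.allclose(result, expected, atol=1e-3)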