测试 Roofline 模型#
import set_env
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # 忽略用户警告
import csv
import json
import os
import platform
from io import StringIO
import numpy as np
import tvm.testing
import tvm.utils
from tvm import relay, rpc
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm
from tvm.runtime.profiling import Report
from tvm.script import tir as T
estimate_peak_flops_cpu#
# Estimate peak FMA vector FLOP/s on the local CPU over an RPC session,
# once per data type, and sanity-check the result against loose bounds.
for data_type in ["float32", "int8", "int32"]:
    rpc_server = rpc.Server(key="roofline_flops_cpu")
    session = rpc.connect("127.0.0.1", rpc_server.port, key="roofline_flops_cpu")
    # The estimator emits vectorized FMA code, so pin an AVX2+FMA-capable target.
    cpu_target = tvm.target.Target("llvm -mattr=+fma,+avx2")
    device = session.device(str(cpu_target))
    flops = tvm.utils.roofline.x86.estimate_peak_fma_vector_flops(
        cpu_target, device, session, data_type
    )
    # Sanity bound: assume at least ~1 GFLOP/s per thread (1 FLOP/cycle at 1 GHz)
    # and well below any physically plausible upper limit.
    assert (
        10**9 < flops < 10**14
    ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"
2024-01-19 14:00:46.799 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:46.800 INFO connected from ('127.0.0.1', 37002)
2024-01-19 14:00:46.801 INFO start serving at /tmp/tmpb673zo51
2024-01-19 14:00:46.940 INFO load_module /tmp/tmpb673zo51/peak_fma_flops.tar
2024-01-19 14:00:48.501 INFO bind to 0.0.0.0:9092
2024-01-19 14:00:48.538 INFO connected from ('127.0.0.1', 54284)
2024-01-19 14:00:48.539 INFO start serving at /tmp/tmp_2qik6wl
2024-01-19 14:00:48.684 INFO load_module /tmp/tmp_2qik6wl/peak_fma_flops.tar
2024-01-19 14:00:51.826 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:51.880 INFO connected from ('127.0.0.1', 43448)
2024-01-19 14:00:51.881 INFO start serving at /tmp/tmp_8y155vs
2024-01-19 14:00:52.016 INFO load_module /tmp/tmp_8y155vs/peak_fma_flops.tar
estimate_peak_flops_gpu#
from tvm_book.config.env import set_cudnn

# Configure the CUDA environment before touching the GPU.
set_cudnn()
gpu_server = rpc.Server(key="roofline_flops_gpu")
gpu_session = rpc.connect("127.0.0.1", gpu_server.port, key="roofline_flops_gpu")
cuda_target = tvm.target.Target("cuda")
gpu_device = gpu_session.device(str(cuda_target))
# Tensor-core path: a modern GPU should comfortably exceed 1 TFLOP/s here.
flops = tvm.utils.roofline.cuda.estimate_peak_flops_tensorcore(
    cuda_target, gpu_device, gpu_session
)
assert (
    10**12 < flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
# Plain FMA path runs on any GPU; most GPUs since 2016 reach a TFLOP/s with FMA.
flops = tvm.utils.roofline.cuda.estimate_peak_flops_fma(
    cuda_target, gpu_device, gpu_session, "float32"
)
assert (
    10**12 < flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
2024-01-19 14:00:54.378 INFO bind to 0.0.0.0:9092
2024-01-19 14:00:54.410 INFO connected from ('127.0.0.1', 55232)
2024-01-19 14:00:54.411 INFO start serving at /tmp/tmpmkby12ft
2024-01-19 14:00:56.002 INFO load_module /tmp/tmpmkby12ft/peak_mma_flops.tar
2024-01-19 14:00:56.861 INFO load_module /tmp/tmpmkby12ft/peak_fma_flops.tar
estimate_peak_bandwidth_cpu#
# Estimate peak DRAM bandwidth on the local CPU over an RPC session.
bw_server = rpc.Server(key="roofline_bandwidth_cpu")
bw_session = rpc.connect("127.0.0.1", bw_server.port, key="roofline_bandwidth_cpu")
# The estimator uses vectorized loads/stores, so pin an AVX2+FMA-capable target.
cpu_target = tvm.target.Target("llvm -mattr=+fma,+avx2")
device = bw_session.device(str(cpu_target))
bandwidth = tvm.utils.roofline.x86.estimate_peak_bandwidth_dram(
    cpu_target, device, bw_session
)
# Loose sanity bound: even old DDR2 sustains ~6 GB/s, so 1 GB/s is a safe floor.
assert (
    10**9 < bandwidth < 10**12
), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
2024-01-19 14:00:58.037 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:58.094 INFO connected from ('127.0.0.1', 43462)
2024-01-19 14:00:58.095 INFO start serving at /tmp/tmpcxaxh3kt
2024-01-19 14:00:58.229 INFO load_module /tmp/tmpcxaxh3kt/peak_bandwidth.tar
estimate_peak_bandwidth_gpu#
# Estimate peak global-memory bandwidth on the GPU over an RPC session.
server = rpc.Server(key="roofline_bandwidth_gpu")
remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_gpu")
target = tvm.target.Target("cuda")
dev = remote.device(str(target))
bandwidth = tvm.utils.roofline.cuda.estimate_peak_bandwidth_global_mem(target, dev, remote)
# Loose sanity bound: any GPU should exceed 100 GB/s — even the ancient
# GTX 280 reached 140 GB/s.
assert (
    bandwidth > 10**11 and bandwidth < 10**13
), f"Bandwidth should be between 10^11 and 10^13, but it is {bandwidth}"
# BUGFIX: the original error message claimed "between 10^9 and 10^12",
# contradicting the actual checked bounds of 10^11 and 10^13.
2024-01-19 14:01:11.762 INFO bind to 0.0.0.0:9092
2024-01-19 14:01:11.815 INFO connected from ('127.0.0.1', 48560)
2024-01-19 14:01:11.816 INFO start serving at /tmp/tmpwz5b9dc6
2024-01-19 14:01:12.493 INFO load_module /tmp/tmpwz5b9dc6/peak_bandwidth.tar
roofline_analysis#
# Run a full roofline analysis of a 512x512 dense layer and check the report.
# BUGFIX: the original line read `target, dev = "llvm -mattr=+fma,+avx2", "cuda"`;
# the `dev = "cuda"` assignment was dead code, unconditionally overwritten by
# `remote.device(target)` below, and misleadingly suggested a GPU run.
target = "llvm -mattr=+fma,+avx2"
a = relay.var("a", relay.TensorType((512, 512), "float32"))
b = relay.var("b", relay.TensorType((512, 512), "float32"))
c = relay.nn.dense(a, b)
mod = tvm.IRModule.from_expr(relay.Function([a, b], c))
params = {}
server = rpc.Server(key="roofline")
remote = rpc.connect("127.0.0.1", server.port, key="roofline")
dev = remote.device(target)
report = tvm.utils.roofline_analysis(mod, params, target, dev, remote=remote)
print(report)
# The roofline columns must be present in the rendered report.
assert "Bound" in report.table()
assert "Percent of Theoretical Optimal" in report.table()
for call in report.calls:
    if "Percent of Theoretical Optimal" in call:
        if target.startswith("llvm"):
            # Ideally we'd like a little tighter bound here, but it is hard to
            # know how well this dense will perform without tuning. And we
            # don't have an operator that uses a specific number of flops.
            assert call["Percent of Theoretical Optimal"].ratio >= 5.0
        elif target == "cuda":
            # The cuda gpu kernel is really poorly optimized
            assert 90 >= call["Percent of Theoretical Optimal"].ratio >= 0.01
Name Duration (us) Percent Device Count Argument Shapes Arithmetic Intensity Bandwidth Bound Estimated FLOPs FLOP/s Hash Loaded Bytes Percent of Theoretical Optimal VM::Argument Shapes
vm_mod_fused_nn_dense 1,911.23 95.44 cpu0 1 float32[512, 512], float32[512, 512], float32[512, 512] 18 7.7e+09 memory 268,435,456 1.4e+11 6bf92d0ede030db0 14,696,448 32
VM::AllocStorage 13.25 0.66 cpu0 1 float32[512, 512]
VM::AllocTensor 1.98 0.10 cpu0 1 float32[512, 512]
VM::UnknownOp 1.07 0.05 cpu0 3
----------
Sum 1,927.53 96.25 6 268,435,456 14,696,448
Total 2,002.57 cpu0 1
Configuration
-------------
Number of threads: 24
Estimated Peak Bandwidth (DRAM, byte/second): 2.4e+10
Executor: VM
Estimated Peak FLOP/s (float32 FMA): 2e+12
2024-01-19 14:01:28.584 INFO bind to 0.0.0.0:9091
2024-01-19 14:01:28.618 INFO connected from ('127.0.0.1', 51894)
2024-01-19 14:01:28.619 INFO start serving at /tmp/tmpmu073udv
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
2024-01-19 14:01:28.871 INFO load_module /tmp/tmpmu073udv/roofline_lib.tar
2024-01-19 14:01:30.796 INFO load_module /tmp/tmpmu073udv/peak_fma_flops.tar
2024-01-19 14:01:31.740 INFO load_module /tmp/tmpmu073udv/peak_bandwidth.tar