%%shell
# Installs the latest dev build of TVM from PyPI. If you wish to build
# from source, see https://tvm.apache.org/docs/install/from_source.html
pip install apache-tvm --pre

Deploy a Hugging Face Pruned Model on CPU#

Author: Josh Fromm

This tutorial demonstrates how to take any pruned model (in this case, PruneBert from Hugging Face) and use TVM to leverage the model's sparsity to produce real speedups. Although the primary purpose of this tutorial is to realize speedups on already pruned models, it may also be useful for estimating how fast a model would be if it were pruned. To that end, it also provides a function that takes an unpruned model and replaces its weights with random, pruned weights at a specified sparsity. This can be a handy feature when deciding whether a model is worth pruning.

Before getting into the code, it's useful to discuss sparsity and pruning, and to dig into the two different types of sparsity: structured and unstructured.

Pruning is a technique primarily used to reduce the parameter size of a model by replacing weight values with 0. Although there are many methods for choosing which weights should be set to 0, the most straightforward is to pick the weights with the smallest magnitude. Typically, weights are pruned to a desired sparsity percentage. For example, a \(95\%\) sparse model would have only \(5\%\) of its weights nonzero. Pruning to very high sparsities often requires fine-tuning or full retraining, as it tends to be a lossy approximation. Although parameter-size savings are easily obtained from a pruned model through simple compression, leveraging sparsity to yield runtime speedups is more complicated.
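To make the magnitude-based approach concrete, here is a minimal NumPy sketch (an illustration only, not part of the tutorial's pipeline) that prunes a weight tensor to a target sparsity by zeroing its smallest-magnitude entries:

import numpy as np

def magnitude_prune(weights, sparsity):
    # Zero out the `sparsity` fraction of entries with the smallest magnitude.
    w = weights.copy()
    k = int(sparsity * w.size)
    if k > 0:
        # The k-th smallest absolute value serves as the pruning threshold.
        threshold = np.partition(np.abs(w), k - 1, axis=None)[k - 1]
        w[np.abs(w) <= threshold] = 0.0
    return w

w = np.random.randn(8, 8).astype("float32")
pruned = magnitude_prune(w, 0.95)
print("Fraction of nonzero weights:", np.count_nonzero(pruned) / pruned.size)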

In structured sparsity, weights are pruned with the goal of clustering the pruned weights together. In other words, they are pruned using both their value and their location. The benefit of clustering pruned weights is that it allows an algorithm such as matrix multiplication to skip entire blocks. It turns out that some degree of block sparsity is very important for realizing significant speedups on most hardware available today. This is because when loading memory on most CPUs or GPUs, skipping the read of a single value saves no work; instead, an entire chunk or tile is read in and processed using something like vectorized instructions.

Unstructured sparse weights are those pruned based only on their original values. They may appear scattered randomly throughout a tensor rather than clustered in blocks like block-sparse weights. At low sparsities, unstructured pruning techniques are difficult to accelerate. At high sparsities, however, many blocks of all-zero values naturally emerge, making speedups possible, as the short sketch below illustrates.
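The following NumPy sketch (again, just an illustration) makes that point: even with purely random unstructured pruning at \(95\%\) sparsity, a sizable fraction of the \(4 \times 4\) blocks end up entirely zero.

import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((64, 64))
# Randomly keep ~5% of the weights, i.e. unstructured 95% sparsity.
w *= rng.random((64, 64)) > 0.95
# Partition the matrix into 4x4 blocks and count the all-zero ones.
blocks = w.reshape(16, 4, 16, 4).swapaxes(1, 2)
zero_blocks = int(np.sum(~blocks.any(axis=(2, 3))))
print("All-zero 4x4 blocks: %d of 256" % zero_blocks)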

This tutorial works with both structured and unstructured sparsity. Hugging Face's PruneBert model is unstructured but \(95\%\) sparse, so we can apply TVM's block-sparse optimizations to it, even if they are not optimal. When generating random sparse weights for an unpruned model, we use structured sparsity. A fun exercise is comparing the real speed of PruneBert with the block-sparse speed using fake weights, to see the benefit of structured sparsity.

Load Required Modules#

Other than TVM, this tutorial requires scipy, the latest version of transformers, and TensorFlow 2.2+; a pip one-liner for these is shown below.
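If any of these are missing from your environment, they can typically be installed from PyPI much like TVM above (exact version pins are left to the reader):

pip install scipy "tensorflow>=2.2" transformers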

import os
import tvm
import time
import itertools
import numpy as np
import tensorflow as tf
from tvm import relay, runtime
from tvm.contrib import graph_executor
from tvm.relay import data_dep_optimization as ddo
from tensorflow.python.framework.convert_to_constants import (
    convert_variables_to_constants_v2,
)
import scipy.sparse as sp


# Ask tensorflow to limit its GPU memory to what's actually needed
# instead of gobbling everything that's available.
# https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth
# This way this tutorial is a little more friendly to sphinx-gallery.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("tensorflow will use experimental.set_memory_growth(True)")
    except RuntimeError as e:
        print("experimental.set_memory_growth option is not available: {}".format(e))
2023-06-08 16:48:12.605960: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-08 16:48:12.658952: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-08 16:48:13.505451: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
2023-06-08 16:48:14.728585: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...

Configure Settings#

Let's start by defining some parameters that determine which model to run and what kind of sparsity to use.

# The name of the transformer model to download and run.
name = "huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad"
# The number of batches in an input.
batch_size = 1
# The length of each input sequence.
seq_len = 128
# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu
# appropriately for your specific machine. CUDA and ROCm are also supported.
target = "llvm"
# Which device to run on. Should be one of tvm.cpu() or tvm.cuda().
dev = tvm.cpu()
# If true, then a sparse variant of the network will be run and
# benchmarked.
measure_sparse = True
# The block size of structured sparsity to convert weight tensors
# into. Changing this parameter may yield speedups for some platforms.
bs_r = 1
# For models besides PruneBert (which is 95% sparse), this parameter
# determines how sparse the generated weights should be. The higher
# the sparsity, the faster the result.
sparsity = 0.85

Download and Convert Transformers Models#

Now we'll grab the model from the transformers module, download it, and convert it into TensorFlow's graphdef format, in preparation for converting that graphdef to a relay graph that we can optimize and deploy.

def load_keras_model(module, name, seq_len, batch_size, report_runtime=True):
    model = module.from_pretrained(name)
    dummy_input = tf.keras.Input(shape=[seq_len], batch_size=batch_size, dtype="int32")
    dummy_out = model(dummy_input)  # Propagate shapes through the keras model.
    if report_runtime:
        np_input = np.random.uniform(size=[batch_size, seq_len], low=0, high=seq_len).astype(
            "int32"
        )
        start = time.time()
        repeats = 50
        for i in range(repeats):
            np_out = model(np_input)
        end = time.time()
        print("Keras Runtime: %f ms." % (1000 * ((end - start) / repeats)))
    return model


def convert_to_graphdef(model, batch_size, seq_len):
    model_func = tf.function(lambda x: model(x))
    input_dict = model._saved_model_inputs_spec
    input_spec = input_dict[list(input_dict.keys())[0]]
    model_func = model_func.get_concrete_function(
        tf.TensorSpec([batch_size, seq_len], input_spec.dtype)
    )
    frozen_func = convert_variables_to_constants_v2(model_func)
    return frozen_func.graph.as_graph_def()


def download_model(name, batch_size, seq_len):
    import transformers

    module = getattr(transformers, "TFBertForSequenceClassification")
    model = load_keras_model(module, name=name, batch_size=batch_size, seq_len=seq_len)
    return convert_to_graphdef(model, batch_size, seq_len)

Convert to Relay Graph#

We now have everything needed to convert the transformers model into relay format. In the function below, the imported graph is saved in relay's JSON format so that we don't have to reimport it from tensorflow each time this script is run.

# Provide __file__ for environments (such as notebooks) where it is not defined.
__file__ = "."
def import_graphdef(
    name,
    batch_size,
    seq_len,
    save_relay=True,
    relay_file="model.json",
    relay_params="model.params",
):
    abs_path = os.path.dirname(os.path.abspath(__file__))
    shape_dict = {"input_1": (batch_size, seq_len)}
    relay_file = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_file)).replace("/", "_")
    relay_params = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_params)).replace("/", "_")
    if os.path.exists(os.path.join(abs_path, relay_file)) and os.path.exists(
        os.path.join(abs_path, relay_params)
    ):
        with open(os.path.join(abs_path, relay_file), "r") as fi:
            mod = tvm.ir.load_json(fi.read())
        with open(os.path.join(abs_path, relay_params), "rb") as fi:
            params = relay.load_param_dict(fi.read())
    else:
        graph_def = download_model(name, batch_size, seq_len)

        mod, params = relay.frontend.from_tensorflow(graph_def, shape=shape_dict)

        if save_relay:
            with open(os.path.join(abs_path, relay_file), "w") as fo:
                fo.write(tvm.ir.save_json(mod))
            with open(os.path.join(abs_path, relay_params), "wb") as fo:
                fo.write(runtime.save_param_dict(params))

    return mod, dict(params.items()), shape_dict

Run the Dense Graph#

Let's run the default version of the imported model. Note that even though the weights are sparse, we won't see any speedup here, because regular dense matrix multiplication is being used on these dense (but mostly zero) tensors rather than sparsity-aware kernels.

def run_relay_graph(mod, params, shape_dict, target, dev):
    with relay.build_config(opt_level=3):
        lib = relay.build(mod, target=target, params=params)
    input_shape = shape_dict["input_1"]
    dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32")

    m = graph_executor.GraphModule(lib["default"](dev))
    m.set_input(0, dummy_data)
    m.run()
    tvm_output = m.get_output(0)

    print(m.benchmark(dev, repeat=5, number=5))
    return tvm_output


def run_dense(mod, params, shape_dict, target, dev):
    print("Dense Model Benchmark:")
    return run_relay_graph(mod, params, shape_dict, target, dev)

Run the Sparse Graph#

Next we'll convert the graph into a sparse representation and generate fake sparse weights if needed. Then we'll use the same benchmarking script as the dense case to see our speedup! We apply a few relay passes to the graph to take advantage of sparsity. First, we use simplify_fc_transpose to fold the transposes on the weights of dense layers into the parameters themselves. This makes it easier to convert the matrix multiplies into their sparse versions. Next, we apply bsr_dense.convert to identify all weight matrices that can be made sparse and automatically replace them.

The bsr_dense.convert call below does the heavy lifting: it determines which weights can be sparsified by checking whether they are at least sparsity_threshold percent sparse. If so, those weights are converted into Block Compressed Row (BSR) format. BSR is essentially a representation that indexes the nonzero chunks of a tensor, making it easy for an algorithm to load those nonzero chunks and ignore the rest of the tensor. Once the sparse weights are in BSR format, relay.transform.DenseToSparse is applied to actually replace the relay.dense operations with relay.sparse_dense calls that run faster.
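As a point of reference, here is a tiny scipy example (illustrative only; TVM builds its own BSR tensors internally) showing the three arrays that make up the BSR layout: data holds the stored nonzero blocks, indices gives each block's block-column, and indptr marks where each block row begins.

import numpy as np
import scipy.sparse as sp

dense = np.array(
    [[1, 2, 0, 0],
     [3, 4, 0, 0],
     [0, 0, 0, 0],
     [0, 0, 5, 6]],
    dtype="float32",
)
bsr = sp.bsr_matrix(dense, blocksize=(2, 2))
print(bsr.data.shape)  # (2, 2, 2): two stored 2x2 blocks
print(bsr.indices)     # [0 1]: block-column of each stored block
print(bsr.indptr)      # [0 1 2]: where each block row's entries begin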

def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype="float32"):
    Y = np.zeros((M, N), dtype=dtype)
    assert M % BS_R == 0
    assert N % BS_C == 0
    nnz = int(density * M * N)
    num_blocks = int(nnz / (BS_R * BS_C)) + 1
    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
    chosen_blocks = candidate_blocks[
        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
    ]
    for i in range(len(chosen_blocks)):
        r, c = chosen_blocks[i]
        Y[r : r + BS_R, c : c + BS_C] = np.random.uniform(-0.1, 0.1, (BS_R, BS_C))
    s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C))
    assert s.data.shape == (num_blocks, BS_R, BS_C)
    assert s.data.size >= nnz
    assert s.indices.shape == (num_blocks,)
    assert s.indptr.shape == (M // BS_R + 1,)
    return s.todense()


def random_sparse_bert_params(func, params, density, BS_R, BS_C):
    def deepcopy(param_dic):
        ret = {}
        for k, v in param_dic.items():
            ret[k] = tvm.nd.array(v.numpy())
        return ret

    new_params = deepcopy(params)
    dense_weight_names = relay.analysis.sparse_dense._search_dense_op_weight(func)
    for item in dense_weight_names:
        name = str(item)
        shape = new_params[name].shape
        if shape[0] % BS_R == 0 and shape[1] % BS_C == 0:
            new_w = random_bsr_matrix(shape[0], shape[1], BS_R, BS_C, density)
            new_params[name] = tvm.nd.array(new_w)
    return new_params


def run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights):
    mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
    if gen_weights:
        params = random_sparse_bert_params(mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity)
    mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, 1), sparsity_threshold=0.8)
    print("Block Sparse Model with {blocksize}x1 blocks:".format(blocksize=bs_r))
    return run_relay_graph(mod, params, shape_dict, target, dev)

Run All the Code!#

And that's it! Now we'll simply call all the needed functions to benchmark the model according to the parameters set above. Note that you'll need to uncomment the last line to actually run this code.

def benchmark():
    mod, params, shape_dict = import_graphdef(name, batch_size, seq_len)
    run_dense(mod, params, shape_dict, target, dev)
    if measure_sparse:
        gen_weights = "prune" not in name
        run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights)


# benchmark()

Sample Output#

For reference, here is the output of this script when run on an AMD CPU; it shows an approximately 2.5x speedup (165.26 ms down to 67.75 ms) from using sparsity.

# Dense Model Benchmark:
# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('batch_matmul.x86', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32')). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=llvm, workload=('batch_matmul.x86', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32')). A fallback configuration is used, which may bring great performance regression.
# Runtime:             165.26 ms           (12.83 ms)
# Block Sparse Model with 1x1 blocks:
# Runtime:             67.75 ms            (8.83 ms)

# Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas".
#
# Dense Model Benchmark:
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression.
# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). A fallback configuration is used, which may bring great performance regression.
# Runtime:             10.64 ms            (0.29 ms)
# Block Sparse Model with 1x1 blocks:
# Runtime:             6.46 ms             (0.05 ms)