fake_quantization_to_integer#

import numpy as np
import tvm
from tvm import relay
from tvm.relay.transform import fake_quantization_to_integer
def compare_fq_to_int(expr, args, allow_rounding_error=False):
    """Run *expr* before and after FakeQuantizationToInteger and compare outputs.

    Parameters
    ----------
    expr : relay.Expr
        A fake-quantized Relay expression (dequantize -> float op -> quantize).
    args : list of numpy.ndarray
        Concrete inputs, passed positionally to the compiled module.
    allow_rounding_error : bool
        When True, tolerate an absolute difference of at most 1 between the
        float-path and integer-path results; otherwise require exact equality.

    Raises
    ------
    AssertionError
        If the pass left the module unchanged, or the outputs disagree.
    """

    def _run(module):
        # Compile with the VM executor on CPU and return the result as numpy.
        executor = relay.create_executor("vm", mod=module, device=tvm.cpu(), target="llvm")
        return executor.evaluate()(*args).numpy()

    mod = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expr))
    mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod)
    # The pass must have actually rewritten the graph into integer ops.
    assert not tvm.ir.structural_equal(mod, mod_int)

    reference = _run(mod)
    lowered = _run(mod_int)

    if allow_rounding_error:
        delta = np.abs(reference.astype("int32") - lowered.astype("int32"))
        assert np.all(delta <= 1)
    else:
        assert np.array_equal(reference, lowered)

Test fake_quantize_conv#

# Build a fake-quantized conv2d (dequantize -> nn.conv2d -> quantize) and
# verify the integer lowering for both output dtypes.
x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
one = relay.const(1.0)
zero = relay.const(0)
for out_dtype in ["uint8", "int8"]:
    # Dequantize both operands to float, convolve, then re-quantize the result.
    x_fp = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
    w_fp = relay.qnn.op.dequantize(w, relay.const(0.5), zero)
    conv = relay.op.nn.conv2d(x_fp, w_fp, kernel_size=[5, 5])
    op = relay.qnn.op.quantize(conv, one, zero, out_dtype=out_dtype)

    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")

    compare_fq_to_int(op, [x_np, w_np])
(runtime log) One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
# Re-run the passes on the last expression built above so the float module
# and its integer-lowered counterpart can be printed side by side.
expr = op
mod = tvm.relay.transform.InferType()(tvm.IRModule.from_expr(expr))
mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod)
mod.show()
mod_int.show()
def @main(%x: Tensor[(1, 3, 224, 224), int8] /* ty=Tensor[(1, 3, 224, 224), int8] */, %w: Tensor[(16, 3, 5, 5), int8] /* ty=Tensor[(16, 3, 5, 5), int8] */) -> Tensor[(1, 16, 220, 220), int8] {
  %0 = qnn.dequantize(%x, 2f /* ty=float32 */, 0 /* ty=int32 */, out_dtype="float32") /* ty=Tensor[(1, 3, 224, 224), float32] */;
  %1 = qnn.dequantize(%w, 0.5f /* ty=float32 */, 0 /* ty=int32 */, out_dtype="float32") /* ty=Tensor[(16, 3, 5, 5), float32] */;
  %2 = nn.conv2d(%0, %1, padding=[0, 0, 0, 0], kernel_size=[5, 5]) /* ty=Tensor[(1, 16, 220, 220), float32] */;
  qnn.quantize(%2, 1f /* ty=float32 */, 0 /* ty=int32 */, out_dtype="int8") /* ty=Tensor[(1, 16, 220, 220), int8] */
}
def @main(%x: Tensor[(1, 3, 224, 224), int8] /* ty=Tensor[(1, 3, 224, 224), int8] */, %w: Tensor[(16, 3, 5, 5), int8] /* ty=Tensor[(16, 3, 5, 5), int8] */) -> Tensor[(1, 16, 220, 220), int8] {
  %0 = qnn.conv2d(%x, %w, 0 /* ty=int32 */, 0 /* ty=int32 */, 2f /* ty=float32 */, 0.5f /* ty=float32 */, padding=[0, 0, 0, 0], kernel_size=[5, 5], out_dtype="int32") /* ty=Tensor[(1, 16, 220, 220), int32] */;
  qnn.requantize(%0, 1f /* ty=float32 */, 0 /* ty=int32 */, 1f /* ty=float32 */, 0 /* ty=int32 */, axis=1, out_dtype="int8") /* ty=Tensor[(1, 16, 220, 220), int8] */
}
mod_int.astext()
'#[version = "0.0.5"]\ndef @main(%x: Tensor[(1, 3, 224, 224), int8] /* ty=Tensor[(1, 3, 224, 224), int8] */, %w: Tensor[(16, 3, 5, 5), int8] /* ty=Tensor[(16, 3, 5, 5), int8] */) -> Tensor[(1, 16, 220, 220), int8] {\n  %0 = qnn.conv2d(%x, %w, 0 /* ty=int32 */, 0 /* ty=int32 */, 2f /* ty=float32 */, 0.5f /* ty=float32 */, padding=[0, 0, 0, 0], kernel_size=[5, 5], out_dtype="int32") /* ty=Tensor[(1, 16, 220, 220), int32] */;\n  qnn.requantize(%0, 1f /* ty=float32 */, 0 /* ty=int32 */, 1f /* ty=float32 */, 0 /* ty=int32 */, axis=1, out_dtype="int8") /* ty=Tensor[(1, 16, 220, 220), int8] */\n}\n'

fake_quantize_conv_per_channel#

# Per-channel variant: the weights are dequantized with one scale/zero-point
# per output channel (axis=0), so the lowered graph may round differently —
# a +/-1 discrepancy is tolerated.
for out_dtype in ["int8", "uint8"]:
    x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
    w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
    one = relay.const([1.0] * 16)
    zero_point = relay.const([np.random.randint(0, 255)] * 16)

    x_fp = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0))
    w_fp = relay.qnn.op.dequantize(
        w, relay.const(np.random.random([16]).astype("float32")), zero_point, axis=0
    )
    conv = relay.op.nn.conv2d(x_fp, w_fp, kernel_size=[5, 5], channels=16)
    op = relay.qnn.op.quantize(conv, relay.const(1.0), relay.const(0), out_dtype=out_dtype)

    x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8")
    w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8")

    compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)
print(op)