测试分析提取伪量化算子#

import tvm
from tvm import relay

测试伪量化卷积#

# Build a fake-quantized conv2d region: int8 tensors are dequantized to
# float32, convolved, then re-quantized to int8. The analysis pass should
# recognize the conv2d as the single op inside that region.
x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")
w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8")
zero = relay.const(0)

x_fp = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
w_fp = relay.qnn.op.dequantize(w, relay.const(0.5), zero)
conv = relay.op.nn.conv2d(x_fp, w_fp, kernel_size=[5, 5])
op = relay.qnn.op.quantize(conv, relay.const(1.0), zero, out_dtype="int8")

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"nn.conv2d": 1}
def @main(%x: Tensor[(1, 3, 224, 224), int8], %w: Tensor[(16, 3, 5, 5), int8]) {
  %0 = qnn.dequantize(%x, 2f, 0, out_dtype="float32");
  %1 = qnn.dequantize(%w, 0.5f, 0, out_dtype="float32");
  %2 = nn.conv2d(%0, %1, padding=[0, 0, 0, 0], kernel_size=[5, 5]);
  qnn.quantize(%2, 1f, 0, out_dtype="int8")
}

测试伪量化 dense#

# Same pattern as the conv2d case, but with nn.dense as the sole op in the
# dequantize -> op -> quantize region.
x = relay.var("x", shape=[128, 64], dtype="int8")
w = relay.var("w", shape=[256, 64], dtype="int8")
zero = relay.const(0)

data_fp = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
weight_fp = relay.qnn.op.dequantize(w, relay.const(0.5), zero)
dense = relay.op.nn.dense(data_fp, weight_fp)
op = relay.qnn.op.quantize(dense, relay.const(1.0), zero, out_dtype="int8")

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"nn.dense": 1}
def @main(%x: Tensor[(128, 64), int8], %w: Tensor[(256, 64), int8]) {
  %0 = qnn.dequantize(%x, 2f, 0, out_dtype="float32");
  %1 = qnn.dequantize(%w, 0.5f, 0, out_dtype="float32");
  %2 = nn.dense(%0, %1, units=None);
  qnn.quantize(%2, 1f, 0, out_dtype="int8")
}

测试伪量化多个区域#

# Chain three separate fake-quantized regions (dense, relu, dense) and then
# append a sigmoid OUTSIDE any region. The analysis should count ops per
# region and skip the trailing sigmoid.
x = relay.var("x", shape=[128, 64], dtype="int8")
w = relay.var("w", shape=[256, 64], dtype="int8")
zero = relay.const(0)

# Region 1: fake-quantized dense.
dense1 = relay.op.nn.dense(
    relay.qnn.op.dequantize(x, relay.const(2.0), zero),
    relay.qnn.op.dequantize(w, relay.const(0.5), zero),
)
q1 = relay.qnn.op.quantize(dense1, relay.const(1.0), zero, out_dtype="int8")

# Region 2: fake-quantized relu (note the non-zero zero point, 114).
relu_in = relay.qnn.op.dequantize(q1, relay.const(2.0), relay.const(114))
q2 = relay.qnn.op.quantize(
    relay.op.nn.relu(relu_in), relay.const(1.0), zero, out_dtype="int8"
)

# Region 3: a second fake-quantized dense with a new weight tensor.
w2 = relay.var("w2", shape=[64, 256], dtype="int8")
dense2 = relay.op.nn.dense(
    relay.qnn.op.dequantize(q2, relay.const(1.0), zero),
    relay.qnn.op.dequantize(w2, relay.const(0.5), zero),
)
q3 = relay.qnn.op.quantize(dense2, relay.const(1.0), zero, out_dtype="int8")

# We expect to ignore this sigmoid op since it's just outside a fake
# quantized region
op = relay.op.sigmoid(q3)

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"nn.dense": 2, "nn.relu": 1}
def @main(%x: Tensor[(128, 64), int8], %w: Tensor[(256, 64), int8], %w2: Tensor[(64, 256), int8]) {
  %0 = qnn.dequantize(%x, 2f, 0, out_dtype="float32");
  %1 = qnn.dequantize(%w, 0.5f, 0, out_dtype="float32");
  %2 = nn.dense(%0, %1, units=None);
  %3 = qnn.quantize(%2, 1f, 0, out_dtype="int8");
  %4 = qnn.dequantize(%3, 2f, 114, out_dtype="float32");
  %5 = nn.relu(%4);
  %6 = qnn.quantize(%5, 1f, 0, out_dtype="int8");
  %7 = qnn.dequantize(%6, 1f, 0, out_dtype="float32");
  %8 = qnn.dequantize(%w2, 0.5f, 0, out_dtype="float32");
  %9 = nn.dense(%7, %8, units=None);
  %10 = qnn.quantize(%9, 1f, 0, out_dtype="int8");
  sigmoid(%10)
}

测试伪量化 maxpool#

# Max-pooling operates directly on quantized values, so a single-op
# dequantize -> max_pool2d -> quantize region should be detected.
x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")

zero = relay.const(0)
x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
pooled = relay.op.nn.max_pool2d(x, pool_size=[3, 3])
op = relay.qnn.op.quantize(pooled, relay.const(2.0), zero)

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"nn.max_pool2d": 1}
def @main(%x: Tensor[(1, 3, 224, 224), int8]) {
  %0 = qnn.dequantize(%x, 2f, 0, out_dtype="float32");
  %1 = nn.max_pool2d(%0, pool_size=[3, 3], padding=[0, 0, 0, 0]);
  qnn.quantize(%1, 2f, 0, out_dtype="int8")
}

测试伪量化转置#

# Pure data-movement ops (transpose, reshape) inside one fake-quantized
# region — both should be counted.
x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8")

zero = relay.const(0)
x = relay.qnn.op.dequantize(x, relay.const(2.0), zero)
permuted = relay.op.transpose(x, [1, 0, 2, 3])
flattened = relay.op.reshape(permuted, [3, -1])
op = relay.qnn.op.quantize(flattened, relay.const(2.0), zero)

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"transpose": 1, "reshape": 1}
def @main(%x: Tensor[(1, 3, 224, 224), int8]) {
  %0 = qnn.dequantize(%x, 2f, 0, out_dtype="float32");
  %1 = transpose(%0, axes=[1, 0, 2, 3]);
  %2 = reshape(%1, newshape=[3, -1]);
  qnn.quantize(%2, 2f, 0, out_dtype="int8")
}

测试伪量化 concat#

# Concatenate four dequantized int8 inputs (each with a distinct scale)
# and re-quantize; the concatenate is the single op in the region.
zero = relay.const(0)
inputs = [
    relay.qnn.op.dequantize(
        relay.var("x%d" % i, shape=[1, 4], dtype="int8"), relay.const(i + 0.5), zero
    )
    for i in range(4)
]
concat = relay.op.concatenate(inputs, axis=1)
op = relay.qnn.op.quantize(concat, relay.const(3.5), zero)

mod = tvm.IRModule.from_expr(op)
mod.show()
fake_quantized_op_freqs = relay.analysis.list_fake_quantized_op_freqs(mod)

assert dict(fake_quantized_op_freqs) == {"concatenate": 1}
def @main(%x0: Tensor[(1, 4), int8], %x1: Tensor[(1, 4), int8], %x2: Tensor[(1, 4), int8], %x3: Tensor[(1, 4), int8]) {
  %0 = qnn.dequantize(%x0, 0.5f, 0, out_dtype="float32");
  %1 = qnn.dequantize(%x1, 1.5f, 0, out_dtype="float32");
  %2 = qnn.dequantize(%x2, 2.5f, 0, out_dtype="float32");
  %3 = qnn.dequantize(%x3, 3.5f, 0, out_dtype="float32");
  %4 = (%0, %1, %2, %3);
  %5 = concatenate(%4, axis=1);
  qnn.quantize(%5, 3.5f, 0, out_dtype="int8")
}