extern#

This section exercises te.extern, which splices externally generated TIR, or calls out to packed functions, into a tensor-expression schedule as an opaque stage.

import sys
from pathlib import Path
ROOT = Path(".").resolve().parents[2]
sys.path.extend([f"{ROOT}/tests"])  # make the helpers under tests/ importable
import tools
from d2py.utils.file import mkdir
root_dir = ".temp"
mkdir(root_dir)
import tvm
from tvm import te
import numpy as np
import tvm.testing

extern add pipeline#

The extern stage computes C = A + 1 from hand-written TIR, using vectorized float32x2 loads and stores; one generator targets the CPU, the other binds explicit GPU thread axes.

nn = 64
max_threads = 4
n = tvm.runtime.convert(nn)
A = te.placeholder((n,), name="A")

def extern_generator(ins, outs):
    """Manually write the IR for the extern "add" pipeline (CPU, vectorized)."""
    ib = tvm.tir.ir_builder.create()
    # Each iteration loads, increments, and stores one float32x2 vector,
    # covering two elements per step, hence (n + 1) // 2 iterations.
    with ib.for_range(0, (n + 1) // 2) as i:
        ib.emit(
            outs[0].vstore(
                i * 2, ins[0].vload(i * 2, "float32x2") + tvm.tir.const(1, "float32x2")
            )
        )
    return ib.get()
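
To see exactly what the CPU generator emits, you can call it by hand. A minimal sketch, assuming stand-in buffers declared with tvm.tir.decl_buffer (te.extern normally constructs and passes these itself):

a_buf = tvm.tir.decl_buffer((nn,), "float32", name="a_buf")
c_buf = tvm.tir.decl_buffer((nn,), "float32", name="c_buf")
# Prints a single For loop of vectorized load / add / store.
print(extern_generator([a_buf], [c_buf]))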

def extern_generator_gpu(ins, outs):
    """Manually write the IR for the extern "add" pipeline (GPU version)."""
    ib = tvm.tir.ir_builder.create()
    bx = te.thread_axis("blockIdx.x")
    tx = te.thread_axis("threadIdx.x")
    ib.scope_attr(bx, "thread_extent", (nn + max_threads - 1) // max_threads)
    ib.scope_attr(tx, "thread_extent", max_threads)
    idx = bx.var * max_threads + tx.var
    # Each guarded thread stores one float32x2 vector at element idx * 2.
    with ib.if_scope(ib.likely(idx < n)):
        ib.emit(
            outs[0].vstore(
                idx * 2, ins[0].vload(idx * 2, "float32x2") + tvm.tir.const(1, "float32x2")
            )
        )
    return ib.get()
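
The same trick works for the GPU generator; reusing the stand-in buffers above shows the thread-binding AttrStmt nodes wrapping the guarded vector store:

print(extern_generator_gpu([a_buf], [c_buf]))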

C_cpu = te.extern(A.shape, [A], extern_generator, name="C")
C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name="C")
s_cpu = te.create_schedule(C_cpu.op)
s_gpu = te.create_schedule(C_gpu.op)
# The lowered modules below show the hand-written bodies spliced in
# verbatim under the "extern_scope" attribute.
print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((64,), "float32"), C: T.Buffer((64,), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        T.attr(0, "extern_scope", 0)
        for i in range(32):
            cse_var_1: T.int32 = i * 2
            C[cse_var_1:cse_var_1 + 2] = A[cse_var_1:cse_var_1 + 2] + T.Broadcast(T.float32(1), 2)
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((64,), "float32"), C: T.Buffer((64,), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        T.attr(0, "extern_scope", 0)
        blockIdx_x = T.launch_thread("blockIdx.x", 16)
        threadIdx_x = T.launch_thread("threadIdx.x", 4)
        C[blockIdx_x * 8 + threadIdx_x * 2:blockIdx_x * 8 + threadIdx_x * 2 + 2] = A[blockIdx_x * 8 + threadIdx_x * 2:blockIdx_x * 8 + threadIdx_x * 2 + 2] + T.Broadcast(T.float32(1), 2)

def check_target(target):
    if not tvm.testing.device_enabled(target):
        return
    s = s_gpu if target in ["opencl", "cuda"] else s_cpu
    C = C_gpu if target in ["opencl", "cuda"] else C_cpu
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], target)
    dev = tvm.device(target, 0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    f(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)

check_target("llvm")
check_target("opencl")
check_target("cuda")

extern pack buffer#

Here the extern stage contains no TIR computation at all: tvm.tir.call_packed emits a call back into a Python function registered as a global PackedFunc, with the buffers packed into DLTensor arguments at runtime.

nn = 1024
n = tvm.runtime.convert(nn)
A = te.placeholder((n,), name="A")

def extern_generator(ins, outs):
    """Write the extern IR: a single call into a registered packed function."""
    return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0])

C = te.extern(A.shape, [A], extern_generator, name="C")
s = te.create_schedule(C.op)

@tvm.register_func
def my_extern_array_func1(aa, bb):
    # aa and bb arrive as NDArrays; this "extern" just copies input to output.
    aa.copyto(bb)
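
As a quick sanity check (not part of the original test), the function registered above is visible in TVM's global registry and can be invoked directly on NDArrays:

fcheck = tvm.get_global_func("my_extern_array_func1")
x = tvm.nd.array(np.arange(4, dtype="float32"))
y = tvm.nd.array(np.zeros(4, dtype="float32"))
fcheck(x, y)  # copies x into y via my_extern_array_func1
tvm.testing.assert_allclose(y.numpy(), x.numpy())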

def check_target(target):
    if not tvm.testing.device_enabled(target):
        return
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], target)
    dev = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

    f(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy())

check_target("stackvm")
check_target("llvm")

extern pack buffer intermediate#

Same mechanism, but the extern stage consumes the intermediate tensor B = A + 1 rather than a placeholder, so the packed function observes the already-computed intermediate buffer.

nn = 1024
n = tvm.runtime.convert(nn)
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1, name="B")

def extern_generator(ins, outs):
    """Write the extern IR: forward the intermediate buffer to a packed function."""
    return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0])

C = te.extern(B.shape, [B], extern_generator, name="C")
s = te.create_schedule(C.op)
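
Lowering makes the staging explicit: B is realized as an internal buffer, computed from A, and only then handed to the packed call. A one-line sketch to inspect it:

print(tvm.lower(s, [A, C], simple_mode=True))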

def check_target(target):
    if not tvm.testing.device_enabled(target):
        return
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], target)
    dev = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

    @tvm.register_func
    def my_extern_array_func2(aa, bb):
        # The extern stage's input is the intermediate B, so aa must equal a + 1.
        assert aa.shape == a.shape
        tvm.testing.assert_allclose(aa.numpy(), a.numpy() + 1)
        aa.copyto(bb)

    f(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)

check_target("llvm")