"""Benchmark the per-call overhead of tvm ffi against torch/numpy/cupy baselines."""

import torch
import numpy as np
from tvm import ffi as tvm_ffi
import time


def print_speed(name, speed):
    print(f"{name:<40}{speed} sec/call")


def print_error(name, error):
    print(f"{name:<40}{error}")


def baseline_torch_add(repeat):
    """Run torch.add with one element."""

    def run_bench(device):
        x = torch.arange(1, device=device)
        y = torch.arange(1, device=device)
        z = torch.arange(1, device=device)
        torch.add(x, y, out=z)
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        for i in range(repeat):
            torch.add(x, y, out=z)
        # note: we deliberately do not use torch.cuda.synchronize()
        # because we want to see the overhead of the FFI call.
        end = time.time()
        print_speed(f"torch.add[{device}]", (end - start) / repeat)

    # rough takeaway: add on cuda takes roughly 3e-6 sec/call
    run_bench("cpu")
    run_bench("cuda")


def baseline_numpy_add(repeat):
    """Run numpy.add with one element."""
    x = np.arange(1)
    y = np.arange(1)
    z = np.arange(1)
    np.add(x, y, out=z)
    start = time.time()
    for i in range(repeat):
        np.add(x, y, out=z)
    end = time.time()
    speed = (end - start) / repeat
    print_speed("numpy.add", speed)


def baseline_cupy_add(repeat):
    """Run cupy.add with one element."""
    try:
        import cupy
    except ImportError:
        # skip if cupy is not installed
        return
    x = cupy.arange(1)
    y = cupy.arange(1)
    z = cupy.arange(1)
    cupy.add(x, y, out=z)
    start = time.time()
    for i in range(repeat):
        cupy.add(x, y, out=z)
    end = time.time()
    speed = (end - start) / repeat
    print_speed("cupy.add", speed)


def tvm_ffi_nop(repeat):
    """Overhead of a tvm ffi python call via calling a NOP.

    testing.nop is defined in C++ and does nothing.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    x = tvm_ffi.from_dlpack(torch.arange(1))
    y = tvm_ffi.from_dlpack(torch.arange(1))
    z = tvm_ffi.from_dlpack(torch.arange(1))
    nop(x, y, z)
    start = time.time()
    for i in range(repeat):
        nop(x, y, z)
    end = time.time()
    print_speed("tvm.ffi.nop", (end - start) / repeat)


def bench_ffi_nop_from_dlpack(name, x, y, z, repeat):
    """Run dlpack conversion + tvm.ffi.nop.

    Measures the overhead of running a dlpack conversion for each arg,
    then invoking the function.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    tx = tvm_ffi.from_dlpack(x)
    ty = tvm_ffi.from_dlpack(y)
    tz = tvm_ffi.from_dlpack(z)
    nop(tx, ty, tz)
    start = time.time()
    for i in range(repeat):
        tx = tvm_ffi.from_dlpack(x)
        ty = tvm_ffi.from_dlpack(y)
        tz = tvm_ffi.from_dlpack(z)
        nop(tx, ty, tz)
    end = time.time()
    print_speed(name, (end - start) / repeat)


def tvm_ffi_nop_from_torch_dlpack(repeat):
    """Run dlpack conversion + tvm.ffi.nop, starting from torch tensors."""
    x = torch.arange(1)
    y = torch.arange(1)
    z = torch.arange(1)
    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(torch)", x, y, z, repeat)


def tvm_ffi_nop_from_numpy_dlpack(repeat):
    """Run dlpack conversion + tvm.ffi.nop, starting from numpy arrays."""
    x = np.arange(1)
    y = np.arange(1)
    z = np.arange(1)
    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(numpy)", x, y, z, repeat)


def tvm_ffi_self_dlpack_nop(repeat):
    """Run dlpack conversion + tvm.ffi.nop, starting from tvm tensors."""
    x = tvm_ffi.from_dlpack(torch.arange(1))
    y = tvm_ffi.from_dlpack(torch.arange(1))
    z = tvm_ffi.from_dlpack(torch.arange(1))
    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(tvm)", x, y, z, repeat)
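
# For reference, the two conversion paths exercised in this file differ only
# in who produces the DLPack capsule (illustrative sketch, not part of the
# benchmark; both forms appear in the functions below):
#
#   capsule = torch.utils.dlpack.to_dlpack(t)  # legacy API: explicit PyCapsule
#   tx = tvm_ffi.from_dlpack(capsule)
#
#   tx = tvm_ffi.from_dlpack(t)                # protocol API: calls t.__dlpack__()
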
def tvm_ffi_nop_from_torch_utils_to_dlpack(repeat):
    """Run dlpack conversion + tvm.ffi.nop via the legacy torch API.

    Measures the overhead of running a dlpack conversion for each arg and
    then invoking the function, but uses the legacy
    torch.utils.dlpack.to_dlpack API. This helps measure possible
    implementation overhead on the torch side.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    x = torch.arange(1)
    y = torch.arange(1)
    z = torch.arange(1)
    tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x))
    ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y))
    tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z))
    nop(tx, ty, tz)
    start = time.time()
    for i in range(repeat):
        tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x))
        ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y))
        tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z))
        nop(tx, ty, tz)
    end = time.time()
    speed = (end - start) / repeat
    print_speed("tvm.ffi.nop+from_dlpack(torch.utils)", speed)


def bench_tvm_ffi_nop_autodlpack(name, x, y, z, repeat):
    """Measure the overhead of automatic dlpack conversion that happens
    when tensors are passed directly as arguments to a tvm ffi function.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    nop(x, y, z)
    start = time.time()
    for i in range(repeat):
        nop(x, y, z)
    end = time.time()
    speed = (end - start) / repeat
    print_speed(name, speed)


def tvm_ffi_nop_autodlpack_from_torch(repeat, device="cpu"):
    """Measure auto-dlpack overhead with torch.Tensor inputs."""
    # use a larger size to ensure the alignment requirement is met
    x = torch.arange(256, device=device)
    y = torch.arange(256, device=device)
    z = torch.arange(256, device=device)
    bench_tvm_ffi_nop_autodlpack(f"tvm.ffi.nop.autodlpack(torch[{device}])", x, y, z, repeat)


def tvm_ffi_nop_autodlpack_from_numpy(repeat):
    """Measure auto-dlpack overhead with numpy.ndarray inputs."""
    # use a larger size to ensure the alignment requirement is met
    x = np.arange(256)
    y = np.arange(256)
    z = np.arange(256)
    bench_tvm_ffi_nop_autodlpack("tvm.ffi.nop.autodlpack(numpy)", x, y, z, repeat)


def bench_to_dlpack(x, name, repeat):
    """Measure the overhead of x.__dlpack__()."""
    x.__dlpack__()
    start = time.time()
    for i in range(repeat):
        x.__dlpack__()
    end = time.time()
    speed = (end - start) / repeat
    print_speed(name, speed)
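
# __dlpack__(max_version=...) is the versioned form of the protocol introduced
# with DLPack 1.0: the consumer advertises the highest spec version it can
# handle, and the producer may return a versioned capsule. Producers that only
# implement the legacy, unversioned protocol typically raise TypeError on the
# unknown keyword, which is why the helper below reports an error instead of a
# timing in that case.
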
"""try:x.__dlpack__(max_version=max_version)start=time.time()foriinrange(repeat):x.__dlpack__(max_version=max_version)end=time.time()speed=(end-start)/repeatprint_speed(name,speed)exceptExceptionase:print_error(name,e)defbench_torch_utils_to_dlpack(repeat):""" Measures overhead of running torch.utils.dlpack.to_dlpack """x=torch.arange(1)torch.utils.dlpack.to_dlpack(x)start=time.time()foriinrange(repeat):torch.utils.dlpack.to_dlpack(x)end=time.time()speed=(end-start)/repeatprint_speed("torch.utils.dlpack.to_dlpack",speed)defmain():repeat=10000print("-----------------------------")print("Benchmark f(x, y, z) overhead")print("-----------------------------")baseline_numpy_add(repeat)baseline_torch_add(repeat)baseline_cupy_add(repeat)tvm_ffi_nop(repeat)tvm_ffi_nop_from_torch_dlpack(repeat)tvm_ffi_nop_from_numpy_dlpack(repeat)tvm_ffi_self_dlpack_nop(repeat)tvm_ffi_nop_from_torch_utils_to_dlpack(repeat)tvm_ffi_nop_autodlpack_from_torch(repeat,"cpu")tvm_ffi_nop_autodlpack_from_torch(repeat,"cuda")tvm_ffi_nop_autodlpack_from_numpy(repeat)print("-------------------------------")print("Benchmark x.__dlpack__ overhead")print("-------------------------------")bench_torch_utils_to_dlpack(repeat)bench_to_dlpack(torch.arange(1),"torch.__dlpack__",repeat)bench_to_dlpack(np.arange(1),"numpy.__dlpack__",repeat)bench_to_dlpack(tvm_ffi.from_dlpack(torch.arange(1)),"tvm.__dlpack__",repeat)print("---------------------------------------------------")print("Benchmark x.__dlpack__(max_version=(1,1)) overhead")print("---------------------------------------------------")bench_to_dlpack_versioned(torch.arange(1),"torch.__dlpack__(max_version=(1,1))",repeat)bench_to_dlpack_versioned(np.arange(1),"numpy.__dlpack__(max_version=(1,1))",repeat)bench_to_dlpack_versioned(tvm_ffi.from_dlpack(torch.arange(1)),"tvm.__dlpack__(max_version=(1,1))",repeat)if__name__=="__main__":main()