包含以下内容:
- hgemv_k32_f16_kernel
- hgemv_k128_f16x4_kernel
- hgemv_k16_f16_kernel
- PyTorch bindings
# 只测试 Ada 架构。若不指定 TORCH_CUDA_ARCH_LIST，默认会编译所有架构（Volta、Ampere、Ada、Hopper 等），耗时较长。
export TORCH_CUDA_ARCH_LIST=Ada
python3 hgemv.py
输出:
--------------------------------------------------------------------------------
out_k32f16: [-0.00585938, -1.91015625, 10.8281250], time:0.00373125ms
out_k128f16x4: [-0.01171875, -1.89648438, 10.8203125], time:0.00373244ms
out_f16_th: [-0.00727463, -1.89648438, 10.8281250], time:0.00845909ms
--------------------------------------------------------------------------------
out_k16f16: [6.03125, -2.15234375, 3.140625], time:0.00377536ms
out_f16_th: [6.03125, -2.15234375, 3.140625], time:0.00835538ms
--------------------------------------------------------------------------------