From e69d23be553760fdedbd2e698e5295d081fca755 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sun, 19 May 2024 11:37:34 -0400 Subject: [PATCH 001/154] [Kernel] Add marlin_24 unit tests (#4901) --- tests/kernels/test_marlin_gemm.py | 87 ++++- .../layers/quantization/gptq_marlin_24.py | 27 +- .../layers/quantization/utils/format_24.py | 308 ++++++++++++++++++ .../quantization/utils/marlin_24_perms.py | 58 ++++ .../layers/quantization/utils/marlin_perms.py | 58 ++++ .../layers/quantization/utils/marlin_utils.py | 214 +++++++----- 6 files changed, 649 insertions(+), 103 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/format_24.py create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_24_perms.py create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_perms.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index b0ad85c25c572..587fc3901eb7c 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -7,23 +7,32 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) +from vllm.model_executor.layers.quantization.utils.marlin_perms import ( + marlin_perm) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - MarlinWorkspace, is_marlin_supported, marlin_quantize, marlin_weights) + MarlinWorkspace, compute_max_diff, is_marlin_supported, marlin_24_quantize, + marlin_quantize, marlin_weights) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] -K_CHUNKS = [128, 256] -N_CHUNKS = [64, 128, 256] +MARLIN_K_CHUNKS = [128] +MARLIN_N_CHUNKS = [64, 128, 256] + +MARLIN_24_K_CHUNKS = [128] +MARLIN_24_N_CHUNKS = [256] MNK_FACTORS = [ (1, 1, 1), (1, 4, 8), (1, 7, 5), - (1, 7 * 4, 5 * 1), (13, 17, 67), (26, 37, 13), (67, 13, 11), @@ -31,14 +40,13 @@ def rand_data(shape): - data = torch.rand(shape).to(torch.half).cuda() - return data + return torch.randn(shape, dtype=torch.half, device="cuda") @pytest.mark.skipif(not is_marlin_supported(), reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", K_CHUNKS) -@pytest.mark.parametrize("n_chunk", N_CHUNKS) +@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) +@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("num_bits", GPTQ_MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", GPTQ_MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @@ -82,7 +90,8 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order, q_w, g_idx, sort_indices = sort_weights(q_w, g_idx) # Pack to Marlin format - marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits) + marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits, + marlin_perm[num_bits]) # Run Marlin repack GPU kernel marlin_q_w_2 = ops.gptq_marlin_repack( @@ -99,8 +108,8 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order, @pytest.mark.skipif(not is_marlin_supported(), reason="Marlin is 
not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", K_CHUNKS) -@pytest.mark.parametrize("n_chunk", N_CHUNKS) +@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) +@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("num_bits", GPTQ_MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", GPTQ_MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @@ -136,7 +145,8 @@ def test_marlin_gemm( w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( b_weight, num_bits, group_size, act_order) - workspace = MarlinWorkspace(size_n) + workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) output = ops.gptq_marlin_gemm( a_input, @@ -155,4 +165,55 @@ def test_marlin_gemm( torch.cuda.synchronize() - assert torch.allclose(output, output_ref, rtol=1e-2) + max_diff = compute_max_diff(output, output_ref) + print("max_diff = {}".format(max_diff)) + + assert max_diff < 0.04 + + +@pytest.mark.skipif(not is_marlin_supported(), + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS) +@pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS) +@pytest.mark.parametrize("num_bits", GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) +@pytest.mark.parametrize("group_size", GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES) +@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) +def test_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size, mnk_factors): + m_factor, n_factor, k_factor = mnk_factors + + size_m = m_factor + size_k = k_chunk * k_factor + size_n = n_chunk * n_factor + + print(f"MNK = {size_m} {size_n} {size_k}") + print(f"groupsize = {group_size}") + + a_input = rand_data((size_m, size_k)) + b_weight = rand_data((size_k, size_n)) + + (w_24_ref, marlin_24_q_w_comp, marlin_24_meta, + marlin_24_s) = marlin_24_quantize(b_weight, num_bits, group_size) + + workspace_24 = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_MAX_PARALLEL) + + output_ref = torch.matmul(a_input, w_24_ref) + + output = ops.gptq_marlin_24_gemm( + a_input, + marlin_24_q_w_comp, + marlin_24_meta, + marlin_24_s, + workspace_24.scratch, + num_bits, + a_input.shape[0], + b_weight.shape[1], + a_input.shape[1], + ) + + torch.cuda.synchronize() + + max_diff = compute_max_diff(output, output_ref) + print("max_diff = {}".format(max_diff)) + + assert max_diff < 0.04 diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 1bd6127104654..f5345c0443029 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -12,6 +12,15 @@ logger = init_logger(__name__) +GPTQ_MARLIN_24_TILE = 16 +GPTQ_MARLIN_24_MIN_THREAD_N = 128 +GPTQ_MARLIN_24_MIN_THREAD_K = 128 +GPTQ_MARLIN_24_MAX_PARALLEL = 16 + +GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8] +GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128] +GPTQ_MARLIN_24_SUPPORTED_SYM = [True] + class GPTQMarlin24Config(QuantizationConfig): """Config class for Marlin24. @@ -25,15 +34,17 @@ def __init__( self.weight_bits = weight_bits self.group_size = group_size - if self.weight_bits != 4 and self.weight_bits != 8: - raise ValueError("weight_bits must be 4 or 8. 
Got = {}".format( - self.weight_bits)) - - if self.group_size != 128 and self.group_size != -1: + # Verify + if self.weight_bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS: + raise ValueError( + f"Marlin_24 does not support weight_bits = {self.weight_bits}. " + f"Only weight_bits = {GPTQ_MARLIN_24_SUPPORTED_NUM_BITS} " + "are supported.") + if self.group_size not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) " - "is supported for Marlin24, but got group_size of " - f"{self.group_size}") + f"Marlin_24 does not support group_size = {self.group_size}. " + f"Only group_sizes = {GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES} " + "are supported.") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // self.weight_bits diff --git a/vllm/model_executor/layers/quantization/utils/format_24.py b/vllm/model_executor/layers/quantization/utils/format_24.py new file mode 100644 index 0000000000000..01c8cf789204b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/format_24.py @@ -0,0 +1,308 @@ +# +# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es). +# + +import torch + + +# This is PyTorch implementation of main part of reorder_meta() +# function, from tools/util/include/cutlass/util/host_reorder.h file +# of CUTLASS source tree. Furthermore, CUTLASS template for sparse +# GEMM decides upon layout of this matrix, and at the moment for the +# sparse GEMM executed on tensor cores, this is layout described by +# ColumnMajorInterleaved<2> data structure, in +# include/cutlass/layout/matrix.h of CUTLASS source tree. The +# reordering of meta matrix into meta_reordered matrix calculated +# according to these segments of CUTLASS code is re-implemented here. +# Note that this calculation produces offsets for scattering metadata +# matrix elements into reordered metadata matrix elements (or, +# equivalently, for gathering reordered metadata matrix element back +# into metadata matrix elements). +def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype, + device): + dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols) + dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1) + + # Reorder the rows, then swizzle the 2x2 blocks. + group_x = 64 + group_y = 32 if meta_dtype.itemsize == 2 else 16 + + dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 + + (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 + + ((dst_rows % group_x) // 8) * 4) + + topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8) + bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8) + dst_rows += topright - bottomleft + dst_cols -= topright - bottomleft + + # Assumed that meta tensor is to be stored in CUTLASS + # InterleavedColumnMajor layout, and reverse engineered + # corresponding code to store values into this tensor. + interleave = 2 + cols_maj = dst_cols // interleave + cols_min = dst_cols % interleave + return (cols_maj * m * interleave + dst_rows * interleave + + cols_min).view(-1) + + +# This function converts dense matrix into sparse semi-structured +# representation, producing "compressed" matrix, in the layout used by +# CUTLASS backend, and corresponding metadata matrix. 
+def sparse_semi_structured_from_dense_cutlass(dense): + if dense.dim() != 2: + raise RuntimeError( + f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor" # noqa: E501 + ) + + m, k = dense.shape + device = dense.device + + meta_dtype = torch.int8 + if dense.dtype == torch.int8: + meta_dtype = torch.int32 + elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]: + meta_dtype = torch.int16 + else: + raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + if quadbits_per_meta_elem not in (4, 8): + raise RuntimeError( + "Invalid number of elements per meta element calculated") + + if meta_dtype == torch.int32: + if m % 16 != 0: + raise RuntimeError( + f"Number of rows of dense matrix {m} must be divisible by 16") + else: + if m % 32 != 0: + raise RuntimeError( + f"Number of rows of dense matrix {m} must be divisible by 32") + if k % (4 * quadbits_per_meta_elem) != 0: + raise RuntimeError( + f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}" # noqa: E501 + ) + + if dense.dtype != torch.float: + ksparse = 4 + dense_4 = dense.view(-1, k // ksparse, ksparse) + m0, m1, m2, m3 = (dense_4 != 0).unbind(-1) + else: + ksparse = 2 + dense_2 = dense.view(-1, k // ksparse, ksparse) + m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1) + meta_ncols = k // (ksparse * quadbits_per_meta_elem) + + # Encoding quadruples of True/False values as follows: + # [True, True, False, False] -> 0b0100 + # [True, False, True, False] -> 0b1000 + # [False, True, True, False] -> 0b1001 + # [True, False, False, True ] -> 0b1100 + # [False, True, False, True ] -> 0b1101 + # [False, False, True, True ] -> 0b1110 + # Thus, lower two bits in the encoding are index of the True value + # at the lowest index in the quadruple, and the higher two bits in + # the encoding are index of the other True value in the quadruple. + # In case there are less than two True values, than False value or + # values at some index or indices are considered True for the + # encoding. In case there are more than two True values, then the + # excess True value(s) at some indices are considered False for + # the encoding. The exact encodings used for these cases are as + # follows: + # [False, False, False, False] -> 0b1110 + # [False, False, False, True ] -> 0b1110 + # [False, False, True, False] -> 0b1110 + # [False, True, False, False] -> 0b1001 + # [False, True, True, True ] -> 0b1101 + # [True, False, False, False] -> 0b1000 + # [True, False, True, True ] -> 0b1100 + # [True, True, False, True ] -> 0b0100 + # [True, True, True, False] -> 0b0100 + # [True, True, True, True ] -> 0b0100 + # These particular encodings are chosen, with the help of Espresso + # logic minimizer software, for the purpose of minimization of + # corresponding Boolean functions, that translate non-zero flags + # into encoding bits. Note also possible choices for the first + # and last of these encodings were limited only to (0b0100, + # 0b1110), in order to produce valid encodings for 1:2 sparsity + # case. 
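+    # Worked example of the table above: for the quadruple
+    # [True, False, True, False] the lowest-index True is at position 0 and
+    # the other True is at position 2, so idxs0 = 0b00, idxs1 = 0b10 and the
+    # 4-bit code is 0b1000. For [False, True, False, True] the kept positions
+    # are 1 and 3, so idxs0 = 0b01, idxs1 = 0b11 and the code is 0b1101. The
+    # boolean expressions below derive these two 2-bit indices (idxs0, idxs1)
+    # from the non-zero flags m0..m3.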
+ + expr0 = m0 & m1 + expr1 = ~m0 & m1 + expr2 = ~m0 & ~m1 + bit0 = expr1 + bit1 = expr2 + bit2 = expr0 | expr2 | m3 + bit3 = expr1 | ~m1 + idxs0 = bit0 | (bit1.to(torch.int64) << 1) + idxs1 = bit2 | (bit3.to(torch.int64) << 1) + + if dense.dtype != torch.float: + sparse0 = dense_4.gather( + -1, idxs0.unsqueeze(-1)) # type: ignore[possibly-undefined] + sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1)) + sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2) + else: + sparse = dense_2.gather(-1, + idxs0.unsqueeze(-1) // 2).view( + m, + k // 2) # type: ignore[possibly-undefined] + + meta_4 = idxs0 | (idxs1 << 2) + meta_n = meta_4.view( + (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype) + + if quadbits_per_meta_elem == 4: + meta = (meta_n[:, :, 0] + | (meta_n[:, :, 1] << 4) + | (meta_n[:, :, 2] << 8) + | (meta_n[:, :, 3] << 12)) + elif quadbits_per_meta_elem == 8: + meta = (meta_n[:, :, 0] + | (meta_n[:, :, 1] << 4) + | (meta_n[:, :, 2] << 8) + | (meta_n[:, :, 3] << 12) + | (meta_n[:, :, 4] << 16) + | (meta_n[:, :, 5] << 20) + | (meta_n[:, :, 6] << 24) + | (meta_n[:, :, 7] << 28)) + + # Reorder meta tensor elements. + meta_reordered = meta.new_empty( + (m * meta_ncols, )) # type: ignore[possibly-undefined] + meta_offsets = _calculate_meta_reordering_scatter_offsets( + m, meta_ncols, meta_dtype, device) + meta_reordered.scatter_(0, meta_offsets, meta.view(-1)) + + return (sparse, meta_reordered.view(m, meta_ncols)) + + +# This function performs reverse of the function above - it +# reconstructs dense matrix from a pair of "compressed" matrix, given +# in the layout used by CUTLASS backend, and accompanying metadata +# matrix. +def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered): + if sparse.dim() != 2: + raise RuntimeError( + f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor" # noqa: E501 + ) + + m, k = sparse.shape + device = sparse.device + + if meta_reordered.dim() != 2: + raise RuntimeError( + f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor" # noqa: E501 + ) + if meta_reordered.device != device: + raise RuntimeError( + f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device" # noqa: E501 + ) + + meta_dtype = meta_reordered.dtype + if meta_dtype not in (torch.int16, torch.int32): + raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + + ksparse = 4 if sparse.dtype != torch.float else 2 + + meta_nrows, meta_ncols = meta_reordered.shape + if meta_nrows != m: + raise RuntimeError( + f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}" # noqa: E501 + ) + if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k: + raise RuntimeError( + f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, " # noqa: E501 + "expected according to the number of columns of meta matrix") + + # Undo meta tensor elements reordering. + meta_offsets = _calculate_meta_reordering_scatter_offsets( + m, meta_ncols, meta_dtype, device) + meta = torch.gather(meta_reordered.view(-1), 0, + meta_offsets).view(m, meta_ncols) + + # Unpack sparse tensor back to original dense tensor, using + # information provided by meta tensor. 
Note that torch.float + # datatype is handled pretty much the same as + # torch.half/torch.bfloat16, as metadata for a pair of torch.float + # value is encoded as if underlying 8 bytes contain four + # torch.half/torch.bfloat16 values, where either first two or last + # two are zeros. + meta_2 = torch.empty( + (m, meta_ncols, 2 * quadbits_per_meta_elem), + dtype=meta_dtype, + device=device, + ) + if quadbits_per_meta_elem == 4: + meta_2[:, :, 0] = meta & 0b11 + meta_2[:, :, 1] = (meta >> 2) & 0b11 + meta_2[:, :, 2] = (meta >> 4) & 0b11 + meta_2[:, :, 3] = (meta >> 6) & 0b11 + meta_2[:, :, 4] = (meta >> 8) & 0b11 + meta_2[:, :, 5] = (meta >> 10) & 0b11 + meta_2[:, :, 6] = (meta >> 12) & 0b11 + meta_2[:, :, 7] = (meta >> 14) & 0b11 + elif quadbits_per_meta_elem == 8: + meta_2[:, :, 0] = meta & 0b11 + meta_2[:, :, 1] = (meta >> 2) & 0b11 + meta_2[:, :, 2] = (meta >> 4) & 0b11 + meta_2[:, :, 3] = (meta >> 6) & 0b11 + meta_2[:, :, 4] = (meta >> 8) & 0b11 + meta_2[:, :, 5] = (meta >> 10) & 0b11 + meta_2[:, :, 6] = (meta >> 12) & 0b11 + meta_2[:, :, 7] = (meta >> 14) & 0b11 + meta_2[:, :, 8] = (meta >> 16) & 0b11 + meta_2[:, :, 9] = (meta >> 18) & 0b11 + meta_2[:, :, 10] = (meta >> 20) & 0b11 + meta_2[:, :, 11] = (meta >> 22) & 0b11 + meta_2[:, :, 12] = (meta >> 24) & 0b11 + meta_2[:, :, 13] = (meta >> 26) & 0b11 + meta_2[:, :, 14] = (meta >> 28) & 0b11 + meta_2[:, :, 15] = (meta >> 30) & 0b11 + + dense_offsets = meta_2.view(-1) + ( + torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view( + -1, 1).repeat(1, 2).view(-1) + + dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device) + if sparse.dtype != torch.float: + # dense.scatter_(0, dense_offsets, sparse.view(-1)) + dense.scatter_(0, dense_offsets, sparse.reshape(-1)) + else: + dense.view(torch.half).scatter_(0, dense_offsets, + sparse.view(torch.half).view(-1)) + + return dense.view(m, 2 * k) + + +def mask_creator(tensor): + """ + Class for creating N:M sparsity masks. + Masks will be created using the N:M ratio, where for every block of + M weights, N will be pruned based on ranked weight value. Each mask + will correspond to the given tensor. + + :param N: The number of weights in a group to keep + :param M: The size of a weight group + """ + N = 2 + M = 4 + + mask = None + # for i, tensor in enumerate(tensors): + if tensor.numel() % M != 0: + raise ValueError( + f"Tensor of size {tensor.shape} can't be evenly divided into " + f"{M} groups") + + num_groups = tensor.numel() // M + + # N:M sparsity for linear layers + tensor_temp = tensor.detach().abs().reshape(num_groups, M) + index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)] + + w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device) + mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape) + + return mask diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py new file mode 100644 index 0000000000000..12e77cb710687 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py @@ -0,0 +1,58 @@ +"""This file is used for /tests and /benchmarks""" +import numpy +import torch + + +# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501 +# +# Marlin works on [16*2,64] tiles. 
The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501 +# with the tensor-core format that is described here: +# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501 +# +# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 +# (without the need to use ldmatrix instructions) # noqa: E501 +def get_perms_24(num_bits): + perm_list = [] + for i in range(32): + perm1 = [] + col = i // 4 + col_o = col // 2 + for block in [0, 1]: + for row in [ + 2 * (i % 4), + 2 * (i % 4) + 1, + 2 * (i % 4 + 4), + 2 * (i % 4 + 4) + 1, + ]: + perm1.append(16 * row + col_o * 256 + 8 * (col % 2) + + 4 * block) + for j in range(4): + perm_list.extend([p + 1 * j for p in perm1]) + perm = numpy.array(perm_list) + + if num_bits == 4: + interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits == 8: + interleave = numpy.array([0, 2, 1, 3]) + else: + raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits)) + + perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() + perm = torch.from_numpy(perm) + scale_perm = [] + for i in range(8): + scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) + scale_perm_single = [] + for i in range(8): + scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) + return perm, scale_perm, scale_perm_single + + +marlin_24_perm = {} +marlin_24_scale_perm = {} +marlin_24_scale_perm_single = {} +for num_bits in [4, 8]: + perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits) + marlin_24_perm[num_bits] = perm_24 + marlin_24_scale_perm[num_bits] = scale_perm_24 + marlin_24_scale_perm_single[num_bits] = scale_perm_single_24 diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py new file mode 100644 index 0000000000000..76bd2ff7c724e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py @@ -0,0 +1,58 @@ +"""This file is used for /tests and /benchmarks""" +import numpy +import torch + + +# Precompute permutations for Marlin weight and scale shuffling # noqa: E501 +# +# Marlin works on [16,64] tiles. 
The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501 +# with the tensor-core format that is described here: +# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501 +# +# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 +# (without the need to use ldmatrix instructions) # noqa: E501 +def get_perms(num_bits): + perm_list = [] + for i in range(32): + perm1 = [] + col = i // 4 + for block in [0, 1]: + for row in [ + 2 * (i % 4), + 2 * (i % 4) + 1, + 2 * (i % 4 + 4), + 2 * (i % 4 + 4) + 1, + ]: + perm1.append(16 * row + col + 8 * block) + for j in range(4): + perm_list.extend([p + 256 * j for p in perm1]) + + perm = numpy.array(perm_list) + + if num_bits == 4: + interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits == 8: + interleave = numpy.array([0, 2, 1, 3]) + else: + raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) + + perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() + perm = torch.from_numpy(perm) + scale_perm = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single = [] + for i in range(4): + scale_perm_single.extend( + [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + return perm, scale_perm, scale_perm_single + + +marlin_perm = {} +marlin_scale_perm = {} +marlin_scale_perm_single = {} +for num_bits in [4, 8]: + perm, scale_perm, scale_perm_single = get_perms(num_bits) + marlin_perm[num_bits] = perm + marlin_scale_perm[num_bits] = scale_perm + marlin_scale_perm_single[num_bits] = scale_perm_single diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 33b3169983475..0d027d0620ab3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,79 +1,28 @@ """This file is used for /tests and /benchmarks""" +import random + import numpy import torch -from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_TILE) +from vllm.model_executor.layers.quantization.utils.format_24 import ( + mask_creator, sparse_semi_structured_from_dense_cutlass) +from vllm.model_executor.layers.quantization.utils.marlin_24_perms import ( + marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single) +from vllm.model_executor.layers.quantization.utils.marlin_perms import ( + marlin_perm, marlin_scale_perm, marlin_scale_perm_single) from vllm.model_executor.layers.quantization.utils.quant_utils import ( get_pack_factor, quantize_weights, sort_weights) __cuda_arch = torch.cuda.get_device_capability() +MARLIN_TILE = 16 + def is_marlin_supported(): return __cuda_arch[0] >= 8 -# Precompute permutations for Marlin weight and scale shuffling # noqa: E501 -# -# Marlin works on [16,64] tiles. 
The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501 -# with the tensor-core format that is described here: -# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501 -# -# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 -# (without the need to use ldmatrix instructions) # noqa: E501 -def _get_perms(num_bits): - perm_list = [] - for i in range(32): - perm1 = [] - col = i // 4 - for block in [0, 1]: - for row in [ - 2 * (i % 4), - 2 * (i % 4) + 1, - 2 * (i % 4 + 4), - 2 * (i % 4 + 4) + 1, - ]: - perm1.append(16 * row + col + 8 * block) - for j in range(4): - perm_list.extend([p + 256 * j for p in perm1]) - - perm = numpy.array(perm_list) - - if num_bits == 4: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - elif num_bits == 8: - interleave = numpy.array([0, 2, 1, 3]) - else: - raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) - - perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() - perm = torch.from_numpy(perm) - scale_perm = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return perm, scale_perm, scale_perm_single - - -_perm = {} -_scale_perm = {} -_scale_perm_single = {} -for num_bits in [4, 8]: - perm, scale_perm, scale_perm_single = _get_perms(num_bits) - _perm[num_bits] = perm - _scale_perm[num_bits] = scale_perm - _scale_perm_single[num_bits] = scale_perm_single - - -def marlin_permute_weights(q_w, - size_k, - size_n, - num_bits, - tile=GPTQ_MARLIN_TILE): +def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE): assert q_w.shape == (size_k, size_n) assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}" assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}" @@ -83,15 +32,14 @@ def marlin_permute_weights(q_w, q_w = q_w.permute((0, 2, 1, 3)) q_w = q_w.reshape((size_k // tile, size_n * tile)) - q_w = q_w.reshape( - (-1, _perm[num_bits].numel()))[:, _perm[num_bits]].reshape(q_w.shape) + q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape) return q_w -def marlin_weights(q_w, size_k, size_n, num_bits): +def marlin_weights(q_w, size_k, size_n, num_bits, perm): # Permute - q_w = marlin_permute_weights(q_w, size_k, size_n, num_bits) + q_w = marlin_permute_weights(q_w, size_k, size_n, perm) # Pack pack_factor = get_pack_factor(num_bits) @@ -101,7 +49,6 @@ def marlin_weights(q_w, size_k, size_n, num_bits): q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=numpy.uint32) - for i in range(pack_factor): q_packed |= q_w[:, i::pack_factor] << num_bits * i @@ -110,15 +57,12 @@ def marlin_weights(q_w, size_k, size_n, num_bits): return q_packed -def marlin_permute_scales(s, size_k, size_n, group_size, num_bits): +def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm, + scale_perm_single): if group_size < size_k and group_size != -1: - s = s.reshape((-1, len(_scale_perm[num_bits])))[:, - _scale_perm[num_bits]] + s = s.reshape((-1, len(scale_perm)))[:, scale_perm] else: - s = s.reshape( - (-1, - len(_scale_perm_single[num_bits])))[:, - _scale_perm_single[num_bits]] + s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] s = s.reshape((-1, size_n)).contiguous() return s @@ -148,8 +92,11 @@ def marlin_quantize( 
q_w, g_idx, sort_indices = sort_weights(q_w, g_idx) # Reformat to marlin - marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits) - marlin_s = marlin_permute_scales(s, size_k, size_n, group_size, num_bits) + marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, + marlin_perm[num_bits]) + marlin_s = marlin_permute_scales(s, size_k, size_n, group_size, + marlin_scale_perm[num_bits], + marlin_scale_perm_single[num_bits]) # Create result res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm] @@ -159,15 +106,118 @@ def marlin_quantize( return res_list +def inject_24(w, size_k, size_n): + assert w.shape == (size_k, size_n) + + mask = mask_creator(w.t()).t().cuda().bool() + + return (mask * w).contiguous(), mask.contiguous() + + +def check_24(w, num_rows_to_sample=50, _verbose=False): + BLOCK_SIZE = 4 + MAX_NON_ZEROS = 2 + + w = w.t().contiguous() + + print("check_24: w.shape = {}".format(w.shape)) + + num_rows, num_cols = w.shape + sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample) + if _verbose: + print(f"Sampled row idxs = {sampled_row_idxs}") + + total_segments = 0 + non_24_segments = 0 + for i in sampled_row_idxs: + for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE): + total_segments += 1 + block = w[i, j:j + BLOCK_SIZE] + num_nonzero = torch.count_nonzero(block) + if num_nonzero > MAX_NON_ZEROS: + print("i = {} j = {} block = {}".format(i, j, block)) + non_24_segments += 1 + + print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.") + + +def compress_quantized_24_weight(q_24, size_k, size_n, num_bits): + assert q_24.shape == (size_k, size_n) + + # Remove zp to normalize over 0 + max_q_val = (1 << num_bits) - 1 + zp = (max_q_val + 1) // 2 + q_24_no_zp = q_24 - zp + + # Compress + q_24_no_zp = q_24_no_zp.t().contiguous() + q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass( + q_24_no_zp) + q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous() + + # Restore zp + q_24_comp = q_24_no_zp_comp + zp + + # Resize meta to its actual shape (without moving any data) + meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2) + + return q_24_comp, meta + + +def marlin_24_quantize( + w: torch.Tensor, + num_bits: int, + group_size: int, +): + size_k, size_n = w.shape + + # Normalize group_size + if group_size == -1: + group_size = size_k + assert group_size <= size_k + + # Inject 2:4 sparsity + w_24, mask_24 = inject_24(w, size_k, size_n) + + # Quantize + w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24, + num_bits, + group_size, + act_order=False) + + # Compress quantized weight + q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n, + num_bits) + size_k_comp = size_k // 2 + + # Reformat to marlin + marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n, + num_bits, marlin_24_perm[num_bits]) + marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size, + marlin_24_scale_perm[num_bits], + marlin_24_scale_perm_single[num_bits]) + + # Create result + res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s] + for i in range(len(res_list)): + res_list[i] = res_list[i].to(w.device) + + return res_list + + +def compute_max_diff(output, output_ref): + return torch.mean(torch.abs(output - output_ref)) / torch.mean( + torch.abs(output_ref)) + + class MarlinWorkspace: - def __init__(self, out_features): - assert (out_features % GPTQ_MARLIN_MIN_THREAD_N == 0), ( - "out_features = {} is undivisible by GPTQ_MARLIN_MIN_THREAD_N = {}" - .format(out_features, GPTQ_MARLIN_MIN_THREAD_N)) 
+ def __init__(self, out_features, min_thread_n, max_parallel): + assert (out_features % min_thread_n == 0), ( + "out_features = {} is undivisible by min_thread_n = {}".format( + out_features, min_thread_n)) - max_workspace_size = ((out_features // GPTQ_MARLIN_MIN_THREAD_N) * - GPTQ_MARLIN_MAX_PARALLEL) + max_workspace_size = ((out_features // min_thread_n) * max_parallel) self.scratch = torch.zeros(max_workspace_size, dtype=torch.int, From 81ec16bc76d7de608c82bd3db2541c35e5fd26e0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 19 May 2024 18:11:30 -0700 Subject: [PATCH 002/154] [Kernel] Add flash-attn back (#4907) --- requirements-cuda.txt | 2 +- tests/kernels/test_flash_attn.py | 208 ++++++++++++++++++++++++++ tests/models/test_fp8.py | 10 +- vllm/attention/backends/flash_attn.py | 129 +++++++++------- vllm/attention/selector.py | 14 ++ 5 files changed, 303 insertions(+), 60 deletions(-) create mode 100644 tests/kernels/test_flash_attn.py diff --git a/requirements-cuda.txt b/requirements-cuda.txt index ba8c614d205d2..acb0164007dba 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -7,4 +7,4 @@ nvidia-ml-py # for pynvml package vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library torch == 2.3.0 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 -vllm-flash-attn == 2.5.8.post1 # Requires PyTorch 2.3.0 +vllm-flash-attn == 2.5.8.post2 # Requires PyTorch 2.3.0 diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py new file mode 100644 index 0000000000000..22772d4ea4422 --- /dev/null +++ b/tests/kernels/test_flash_attn.py @@ -0,0 +1,208 @@ +from typing import List, Optional, Tuple + +import pytest +import torch +from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache + +NUM_HEADS = [(16, 16), (32, 8), (64, 8)] +HEAD_SIZES = [128, 256] +BLOCK_SIZES = [16, 32] +DTYPES = [torch.float16, torch.bfloat16] +NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
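+
+# ref_paged_attn below is a pure-PyTorch reference implementation: for each
+# sequence it gathers the KV blocks listed in its block table, applies a
+# causal (and, when requested, sliding-window) mask, and computes softmax
+# attention, so the flash-attn kernel outputs can be checked against it.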
+ + +def ref_paged_attn( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + query_lens: List[int], + kv_lens: List[int], + block_tables: torch.Tensor, + scale: float, + sliding_window: Optional[int] = None, +) -> torch.Tensor: + num_seqs = len(query_lens) + block_tables = block_tables.cpu().numpy() + _, block_size, num_kv_heads, head_size = key_cache.shape + + outputs = [] + start_idx = 0 + for i in range(num_seqs): + query_len = query_lens[i] + kv_len = kv_lens[i] + q = query[start_idx:start_idx + query_len] + q *= scale + + num_kv_blocks = (kv_len + block_size - 1) // block_size + block_indices = block_tables[i, :num_kv_blocks] + + k = key_cache[block_indices].view(-1, num_kv_heads, head_size) + k = k[:kv_len] + v = value_cache[block_indices].view(-1, num_kv_heads, head_size) + v = v[:kv_len] + + if q.shape[1] != k.shape[1]: + k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1) + v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1) + attn = torch.einsum("qhd,khd->hqk", q, k).float() + empty_mask = torch.ones(query_len, kv_len) + mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() + if sliding_window is not None: + sliding_window_mask = torch.triu(empty_mask, + diagonal=kv_len - + (query_len + sliding_window) + + 1).bool().logical_not() + mask |= sliding_window_mask + attn.masked_fill_(mask, float("-inf")) + attn = torch.softmax(attn, dim=-1).to(v.dtype) + out = torch.einsum("hqk,khd->qhd", attn, v) + + outputs.append(out) + start_idx += query_len + + return torch.cat(outputs, dim=0) + + +@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode +def test_flash_attn_with_paged_kv( + kv_lens: List[Tuple[int, int]], + num_heads: Tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, +) -> None: + torch.set_default_device("cuda") + torch.cuda.manual_seed_all(0) + num_seqs = len(kv_lens) + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_kv_len = max(kv_lens) + scale = head_size**-0.5 + + query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) + key_cache = torch.randn(NUM_BLOCKS, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + value_cache = torch.randn_like(key_cache) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + output = flash_attn_with_kvcache( + q=query.unsqueeze(1), + k_cache=key_cache, + v_cache=value_cache, + softmax_scale=scale, + causal=True, + block_table=block_tables, + cache_seqlens=kv_lens_tensor, + ).squeeze(1) + + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + ) + assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - ref_output))}" + + +@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) 
+@pytest.mark.parametrize("sliding_window", [None]) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode +def test_varlen_with_paged_kv( + seq_lens: List[Tuple[int, int]], + num_heads: Tuple[int, int], + head_size: int, + sliding_window: Optional[int], + dtype: torch.dtype, + block_size: int, +) -> None: + torch.set_default_device("cuda") + torch.cuda.manual_seed_all(0) + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_query_len = max(query_lens) + max_kv_len = max(kv_lens) + window_size = ((sliding_window, + sliding_window) if sliding_window is not None else + (-1, -1)) + scale = head_size**-0.5 + + query = torch.randn(sum(query_lens), + num_query_heads, + head_size, + dtype=dtype) + key_cache = torch.randn(NUM_BLOCKS, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + value_cache = torch.randn_like(key_cache) + # Normalize the scale of the key and value caches to mitigate + # numerical instability. + key_cache /= head_size**0.5 + value_cache /= head_size**0.5 + cu_query_lens = torch.tensor([0] + query_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + cu_kv_lens = torch.tensor([0] + kv_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + output = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_query_lens, + cu_seqlens_k=cu_kv_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len, + softmax_scale=scale, + causal=True, + window_size=window_size, + block_table=block_tables, + ) + + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=query_lens, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + sliding_window=sliding_window, + ) + assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - ref_output))}" diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index e87a1783a83f1..664e951a89f2a 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -25,18 +25,18 @@ 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', + 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', + 'Zeta-5, a highly advanced robot designed for menial labor, whirred to a', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**' + 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', ], "meta-llama/Meta-Llama-3-8B-Instruct": [ 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of', + 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 856f399741375..0361dd3bd4ead 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,19 +1,15 @@ -"""Attention layer with Flash and PagedAttention. - -NOTE(woosuk): At the moment, this file includes a lot of duplicated code from -XFormers backend. The duplicated code will be removed once we use flash-attn or -flashinfer for all the attention operations. -""" +"""Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import List, Optional, Tuple, Type import torch -from vllm_flash_attn import flash_attn_varlen_func +from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache +from vllm._C import cache_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) + +_SUPPORTED_HEAD_SIZES = [32, 64, 96, 128, 160, 192, 224, 256] class FlashAttentionBackend(AttentionBackend): @@ -37,8 +33,9 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -46,18 +43,26 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) @dataclass -class FlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): +class FlashAttentionMetadata(AttentionMetadata): """Metadata for 
FlashAttentionBackend. NOTE: Any python object stored here is not updated when it is @@ -99,6 +104,14 @@ class FlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # so far). context_lens_tensor: Optional[torch.Tensor] + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. @@ -219,11 +232,15 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - suppored_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in suppored_head_sizes: + if sliding_window is not None: + # NOTE(woosuk): flash-attn's sliding window does not work with + # paged KV cache. + raise ValueError( + "Sliding window is not supported in FlashAttention.") + if head_size not in _SUPPORTED_HEAD_SIZES: raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {suppored_head_sizes}.") + f"Head size {head_size} is not supported by FlashAttention. " + f"Supported head sizes are: {_SUPPORTED_HEAD_SIZES}.") def forward( self, @@ -234,17 +251,20 @@ def forward( attn_metadata: FlashAttentionMetadata, kv_scale: float = 1.0, ) -> torch.Tensor: - """Forward pass with FlashAttention and PagedAttention. + """Forward pass with FlashAttention. Args: query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] """ + # NOTE(woosuk): FlashAttention does not support FP8 KV cache. + assert kv_scale == 1.0, "kv_scale is not supported in FlashAttention." + num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) @@ -252,16 +272,20 @@ def forward( value = value.view(-1, self.num_kv_heads, self.head_size) if kv_cache is not None: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) + key_cache = kv_cache[0] + value_cache = kv_cache[1] # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, kv_scale) + cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping.flatten(), + self.kv_cache_dtype, + ) num_prefill_tokens = attn_metadata.num_prefill_tokens num_decode_tokens = attn_metadata.num_decode_tokens @@ -281,7 +305,8 @@ def forward( if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. 
- if kv_cache is None or prefill_meta.block_tables.numel() == 0: + if (kv_cache is None or prefill_meta.block_tables is None + or prefill_meta.block_tables.numel() == 0): # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. @@ -302,38 +327,34 @@ def forward( output[:num_prefill_tokens] = out else: # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue (broke) to - # deal with different data types between KV and FP8 KV cache, - # to be addressed separately. - output[:num_prefill_tokens] = PagedAttention.forward_prefix( - query, - key, - value, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.context_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window[0], + assert prefill_meta.seq_lens is not None + max_seq_len = max(prefill_meta.seq_lens) + output[:num_prefill_tokens] = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + cu_seqlens_k=prefill_meta.seq_start_loc, + max_seqlen_k=max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + block_table=prefill_meta.block_tables, ) + if decode_meta := attn_metadata.decode_metadata: # Decoding run. - output[num_prefill_tokens:] = PagedAttention.forward_decode( - decode_query, + output[num_prefill_tokens:] = flash_attn_with_kvcache( + decode_query.unsqueeze(1), key_cache, value_cache, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, - decode_meta.max_decode_seq_len, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - kv_scale, - ) + block_table=decode_meta.block_tables, + cache_seqlens=decode_meta.seq_lens_tensor, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + ).squeeze(1) # Reshape the output tensor. 
return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 06f99718a4dee..5140c3cc86a31 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -93,6 +93,20 @@ def _which_attn_to_use( "torch.float16 or torch.bfloat16.") return _Backend.XFORMERS + if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"): + logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.") + return _Backend.XFORMERS + + if block_size % 16 != 0: + logger.info("Cannot use FlashAttention-2 backend for block size not " + "divisible by 16.") + return _Backend.XFORMERS + + if sliding_window is not None: + logger.info( + "Cannot use FlashAttention-2 backend due to sliding window.") + return _Backend.XFORMERS + try: import vllm_flash_attn # noqa: F401 except ImportError: From 550097550ccf6efe0f088296ca1bc088cf124de2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 20 May 2024 16:11:25 +0800 Subject: [PATCH 003/154] [Model] LLaVA model refactor (#4910) --- vllm/model_executor/models/llava.py | 137 ++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index e8a5b6237d4db..fbd7638097286 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch from torch import nn @@ -67,6 +67,21 @@ def _merge_vision_embeddings(input_ids: torch.Tensor, return inputs_embeds +class LlavaImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +class LlavaImageFeatureInputs(TypedDict): + type: Literal["image_features"] + data: torch.Tensor + """Shape: (batch_size, image_feature_size, hidden_size)""" + + +LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs] + + class LlavaForConditionalGeneration(VisionLanguageModelBase): def __init__(self, @@ -102,6 +117,90 @@ def __init__(self, config.vocab_size, logit_scale) self.sampler = Sampler() + def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != list( + self.vision_language_config.image_input_shape[1:]): + raise ValueError( + f"The expected image tensor shape is batch dimension plus " + f"{self.vision_language_config.image_input_shape[1:]}. " + f"You supplied {data.shape}. 
" + f"If you are using vLLM's entrypoint, make sure your " + f"supplied image input is consistent with " + f"image_input_shape in engine args.") + + return data + + def _parse_and_validate_image_input( + self, data: object) -> Optional[LlavaImageInputs]: + expected_input_type = self.vision_language_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if data is None: + return None + + if expected_input_type == ImageInputType.PIXEL_VALUES: + if not isinstance(data, torch.Tensor): + raise TypeError("Image pixel vector should be a tensor, " + f"but received type: {type(data)}") + + return LlavaImagePixelInputs( + type="pixel_values", + data=self._validate_image_data(data), + ) + elif expected_input_type == ImageInputType.IMAGE_FEATURES: + if not isinstance(data, torch.Tensor): + raise TypeError("Image feature vector should be a tensor, " + f"but received type: {type(data)}") + + return LlavaImageFeatureInputs( + type="image_features", + data=self._validate_image_data(data), + ) + + return None + + def _select_image_features(self, image_features: torch.Tensor, *, + strategy: str) -> torch.Tensor: + # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa + if strategy == "default": + return image_features[:, 1:] + elif strategy == "full": + return image_features + + raise ValueError(f"Unexpected select feature strategy: {strategy}") + + def _image_pixels_to_features(self, vision_tower: CLIPVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. + image_outputs = vision_tower(pixel_values.to(vision_tower.device), + output_hidden_states=True) + + image_features = image_outputs.hidden_states[ + self.config.vision_feature_layer] + + return self._select_image_features( + image_features, + strategy=self.config.vision_feature_select_strategy, + ) + + def _process_image_pixels(self, + inputs: LlavaImagePixelInputs) -> torch.Tensor: + assert self.vision_tower is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_tower, pixel_values) + + def _process_image_input(self, + image_input: LlavaImageInputs) -> torch.Tensor: + if image_input["type"] == "pixel_values": + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) + else: + image_features = image_input["data"] + + return self.multi_modal_projector(image_features) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, @@ -144,42 +243,20 @@ def forward(self, For PIXEL_VALUES, expecting [1, 3, 336, 336]. For IMAGE_FEATURES, expecting [1, 576, 1024]. """ - if image_input is not None: - if list(image_input.shape[1:]) != list( - self.vision_language_config.image_input_shape[1:]): - raise ValueError( - f"The expected image tensor shape is batch dimension " - f"plus " - f"{self.vision_language_config.image_input_shape[1:]}." - f" You supplied {image_input.shape}. " - f"If you are using vLLM's entrypoint, make sure your " - f"supplied image input is consistent with " - f"image_input_shape in engine args.") - if self.vision_tower is not None: - # TODO(xwjiang): Maybe port minimal CLIPVisionModel over. 
- image_outputs = self.vision_tower(image_input, - output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.config.vision_feature_layer] - # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa - if self.config.vision_feature_select_strategy == "default": - image_features = image_features[:, 1:] - elif self.config.vision_feature_select_strategy == "full": - image_features = image_features - else: - raise ValueError( - f"Unexpected select feature strategy: " - f"{self.config.vision_feature_select_strategy}") - else: - image_features = image_input - vision_embeddings = self.multi_modal_projector(image_features) + parsed_image_input = self._parse_and_validate_image_input(image_input) + + if parsed_image_input is not None: + vision_embeddings = self._process_image_input(parsed_image_input) inputs_embeds = self.language_model.get_input_embeddings(input_ids) + inputs_embeds = _merge_vision_embeddings( input_ids, inputs_embeds, vision_embeddings, self.vision_language_config.image_token_id) + input_ids = None else: inputs_embeds = None + hidden_states = self.language_model(input_ids, positions, kv_caches, From b913d04cc5365103d5da5ad3e1e54e289dee8089 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Mon, 20 May 2024 10:55:34 -0400 Subject: [PATCH 004/154] Remove marlin warning (#4918) --- csrc/quantization/gptq_marlin/gptq_marlin.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index fdc0ebef4672e..34950a5d13cf5 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -1519,10 +1519,6 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, } } - printf("WARNING: Marlin kernel is reducing max_m_blocks due to small SM " - "GPU cache. This may " - "hurt performance. Consider upgrading your GPU.\n"); - max_m_blocks--; // Process less M blocks per invocation to reduce cache // usage } From 683a30b117da4990f15c314a1874bff02b4575af Mon Sep 17 00:00:00 2001 From: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Date: Tue, 21 May 2024 01:45:06 +0800 Subject: [PATCH 005/154] [Misc]: allow user to specify port in distributed setting (#4914) --- vllm/envs.py | 7 +++++++ vllm/utils.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/vllm/envs.py b/vllm/envs.py index 68d8a074d0914..56ff79e0cdea9 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -3,6 +3,7 @@ if TYPE_CHECKING: VLLM_HOST_IP: str = "" + VLLM_PORT: Optional[int] = None VLLM_USE_MODELSCOPE: bool = False VLLM_INSTANCE_ID: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None @@ -96,6 +97,12 @@ 'VLLM_HOST_IP': lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), + # used in distributed environment to manually set the communication port + # '0' is used to make mypy happy + 'VLLM_PORT': + lambda: int(os.getenv('VLLM_PORT', '0')) + if 'VLLM_PORT' in os.environ else None, + # If true, will load models from ModelScope instead of Hugging Face Hub. 
# note that the value is true or false, not numbers "VLLM_USE_MODELSCOPE": diff --git a/vllm/utils.py b/vllm/utils.py index 9a65dbaafc56b..bd47ab055b7b5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -282,6 +282,9 @@ def get_distributed_init_method(ip: str, port: int) -> str: def get_open_port() -> int: + port = envs.VLLM_PORT + if port is not None: + return port # try ipv4 try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: From c8794c342daea16554b9179312b57b8a49d47b34 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Mon, 20 May 2024 13:29:28 -0500 Subject: [PATCH 006/154] [Build/CI] Enabling AMD Entrypoints Test (#4834) Co-authored-by: Alexey Kondratiev --- .buildkite/test-pipeline.yaml | 3 ++- Dockerfile.rocm | 8 ++++++-- requirements-rocm.txt | 3 ++- tests/spec_decode/e2e/conftest.py | 8 ++++++-- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f5c46e23779f..def8a460e84a7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -60,7 +60,8 @@ steps: command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py - label: Entrypoints Test - #mirror_hardwares: [amd] + mirror_hardwares: [amd] + commands: # these tests have to be separated, because each one will allocate all posible GPU memory - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py diff --git a/Dockerfile.rocm b/Dockerfile.rocm index eefad79e79d83..9bfe8446a519d 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -92,19 +92,23 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ WORKDIR /vllm-workspace COPY . . +#RUN python3 -m pip install pynvml # to be removed eventually RUN python3 -m pip install --upgrade pip numba # make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 +# Workaround for ray >= 2.10.0 +ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + +ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so RUN --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \ + && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \ && cd .. 
-RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3 CMD ["/bin/bash"] diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 903845b64d98f..cc42839a975d0 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -2,4 +2,5 @@ -r requirements-common.txt # Dependencies for AMD GPUs -ray == 2.9.3 +ray >= 2.10.0 +pytest-asyncio diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index da8b92711380e..7c5840baf3593 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -6,8 +6,12 @@ import pytest import ray import torch -from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) + +from vllm.utils import is_hip + +if (not is_hip()): + from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, + nvmlInit) from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs From 5b6a7b506a2047f2dbe42bb0d0154e61b3399c39 Mon Sep 17 00:00:00 2001 From: Mor Zusman Date: Mon, 20 May 2024 21:44:25 +0300 Subject: [PATCH 007/154] [Bugfix] Fix dummy weight for fp8 (#4916) Allow dummy load format for fp8, torch.uniform_ doesn't support FP8 at the moment Co-authored-by: Mor Zusman --- vllm/model_executor/model_loader/weight_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index ed6e2f12adb2f..4e826256bdba7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -388,4 +388,11 @@ def initialize_dummy_weights( """ for param in model.state_dict().values(): if torch.is_floating_point(param): - param.data.uniform_(low, high) + if torch.finfo(param.data.dtype).bits < 16: + # uniform_ doesn't support < 16-bit datatypes (FP8) + dtype = param.data.dtype + tmp_param = param.data.to(torch.float16) + tmp_param = tmp_param.uniform_(low, high).to(dtype) + param.data.copy_(tmp_param) + else: + param.uniform_(low, high) From a5e66c791746fad4f199c74abe3fee2b6bb96e1c Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Mon, 20 May 2024 14:46:12 -0400 Subject: [PATCH 008/154] [Core] Sharded State Loader download from HF (#4889) --- vllm/model_executor/model_loader/loader.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index dbc975c5e1d83..4b02fb7fcf984 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -444,6 +444,16 @@ def get_end_ptr(tensor: torch.Tensor) -> int: result[k] = t return result + def _prepare_weights(self, model_name_or_path: str, + revision: Optional[str]): + if os.path.isdir(model_name_or_path): + return model_name_or_path + else: + allow_patterns = ["*.safetensors"] + return download_weights_from_hf(model_name_or_path, + self.load_config.download_dir, + allow_patterns, revision) + def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], @@ -454,6 +464,10 @@ def load_model(self, *, model_config: ModelConfig, from safetensors.torch import safe_open from vllm.distributed import get_tensor_model_parallel_rank + + local_model_path = self._prepare_weights(model_config.model, + model_config.revision) + with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = 
_initialize_model(model_config, self.load_config, @@ -461,7 +475,7 @@ def load_model(self, *, model_config: ModelConfig, cache_config) rank = get_tensor_model_parallel_rank() pattern = os.path.join( - model_config.model, + local_model_path, self.pattern.format(rank=rank, part="*"), ) filepaths = glob.glob(pattern) From 8a78ed88d237e53f5a06fd4e397d509d839a11a0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 20 May 2024 13:16:57 -0700 Subject: [PATCH 009/154] [Doc]Add documentation to benchmarking script when running TGI (#4920) --- benchmarks/benchmark_serving.py | 4 ++++ benchmarks/launch_tgi_server.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3df71ffa5662d..0efb4dba06964 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -19,6 +19,10 @@ --dataset-path \ --request-rate \ # By default is inf --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. """ import argparse import asyncio diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh index 64d3c4f4b3889..f491c90d0683e 100755 --- a/benchmarks/launch_tgi_server.sh +++ b/benchmarks/launch_tgi_server.sh @@ -4,7 +4,7 @@ PORT=8000 MODEL=$1 TOKENS=$2 -docker run --gpus all --shm-size 1g -p $PORT:80 \ +docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ -v $PWD/data:/data \ ghcr.io/huggingface/text-generation-inference:1.4.0 \ --model-id $MODEL \ From 6b46dcf5ea42934dedf3f2ff639a521ecec68bbe Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 20 May 2024 17:48:32 -0700 Subject: [PATCH 010/154] [Core] Fix scheduler considering "no LoRA" as "LoRA" (#4897) --- vllm/core/scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c8da54f2889eb..7c70b1b244f7d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -744,8 +744,8 @@ def _schedule_default(self) -> SchedulerOutputs: budget.add_num_seqs(seq_group.request_id, seq_group.get_max_num_running_seqs()) curr_loras = set( - seq_group.lora_int_id - for seq_group in self.running) if self.lora_enabled else None + seq_group.lora_int_id for seq_group in self.running + if seq_group.lora_int_id > 0) if self.lora_enabled else None remaining_waiting, prefills = (self.waiting, SchedulerPrefillOutputs.create_empty()) From 907d48a427d663d3c4f031dee4be43b328289da8 Mon Sep 17 00:00:00 2001 From: HUANG Fei Date: Tue, 21 May 2024 13:22:22 +0800 Subject: [PATCH 011/154] [Model] add rope_scaling support for qwen2 (#4930) --- vllm/model_executor/models/qwen2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 31ba6441f9f7a..97ab6168c3230 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -89,7 +89,8 @@ def __init__(self, use_sliding_window: bool = False, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None) -> None: + sliding_window: Optional[int] = None, + rope_scaling: Optional[Tuple] = None) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -133,6 +134,7 @@ def __init__(self, rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, + rope_scaling=rope_scaling, ) self.attn 
= Attention(self.num_heads, self.head_dim, @@ -169,6 +171,7 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) use_sliding_window = (config.use_sliding_window and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( @@ -180,7 +183,8 @@ def __init__( use_sliding_window=use_sliding_window, cache_config=cache_config, quant_config=quant_config, - sliding_window=config.sliding_window) + sliding_window=config.sliding_window, + rope_scaling=rope_scaling) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, From 11d6f7e88d698f2928e1656910553643aaa7834e Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Tue, 21 May 2024 13:24:17 +0800 Subject: [PATCH 012/154] [Model] Add Phi-2 LoRA support (#4886) --- docs/source/models/supported_models.rst | 2 +- tests/lora/conftest.py | 5 ++ tests/lora/test_phi.py | 67 +++++++++++++++++++++++++ vllm/model_executor/models/phi.py | 33 +++++++++--- 4 files changed, 100 insertions(+), 7 deletions(-) create mode 100644 tests/lora/test_phi.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 142c8f8573e2f..31d4b53bd4409 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -118,7 +118,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - - + - ✅︎ * - :code:`Phi3ForCausalLM` - Phi-3 - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc. diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 5c648f72d8ddd..95fc65cdd1a8f 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -165,6 +165,11 @@ def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") +@pytest.fixture(scope="session") +def phi2_lora_files(): + return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") + + @pytest.fixture(scope="session") def long_context_lora_files_16k_1(): return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1") diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py new file mode 100644 index 0000000000000..a2b42ce4cb96f --- /dev/null +++ b/tests/lora/test_phi.py @@ -0,0 +1,67 @@ +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "microsoft/phi-2" + +PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 + + +def do_sample(llm, lora_path: str, lora_id: int) -> str: + prompts = [ + PROMPT_TEMPLATE.format( + sql_prompt= + "Which catalog publisher has published the most catalogs?", + context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"), + PROMPT_TEMPLATE.format( + sql_prompt= + "Which trip started from the station with the largest dock count? 
Give me the trip id.", # noqa: E501 + context= + "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501 + ), + PROMPT_TEMPLATE.format( + sql_prompt= + "How many marine species are found in the Southern Ocean?", # noqa: E501 + context= + "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501 + ), + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=64, + stop="### End") + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None, + ) + # Print the outputs. + generated_texts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def test_phi2_lora(phi2_lora_files): + # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, + # Otherwise, the lora-test will fail due to CUDA OOM. + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=2, + enforce_eager=True) + + expected_lora_output = [ + "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 + "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);", # noqa: E501 + "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';", # noqa: E501 + ] + + output1 = do_sample(llm, phi2_lora_files, lora_id=1) + for i in range(len(expected_lora_output)): + assert output1[i].startswith(expected_lora_output[i]) + output2 = do_sample(llm, phi2_lora_files, lora_id=2) + for i in range(len(expected_lora_output)): + assert output2[i].startswith(expected_lora_output[i]) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index ed25a232f4208..193a29d20c894 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -42,7 +42,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -229,11 +229,32 @@ def forward( class PhiForCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ] + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "dense", + "fc1", + "fc2", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ): + del lora_config # Unused. 
super().__init__() self.config = config self.quant_config = quant_config From 5d989891e270b3c82aac3797f6c1e20b5ffb8b64 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 21 May 2024 00:17:25 -0700 Subject: [PATCH 013/154] [Docs] Add acknowledgment for sponsors (#4925) --- docs/source/community/sponsors.md | 24 ++++++++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 25 insertions(+) create mode 100644 docs/source/community/sponsors.md diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md new file mode 100644 index 0000000000000..532ce77beb7b8 --- /dev/null +++ b/docs/source/community/sponsors.md @@ -0,0 +1,24 @@ +# Sponsors + +vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! + + + + +- a16z +- AMD +- Anyscale +- AWS +- Crusoe Cloud +- Databricks +- DeepInfra +- Lambda Lab +- NVIDIA +- Replicate +- Roblox +- RunPod +- Trainy +- UC Berkeley +- UC San Diego + +We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index bab00e28e4018..5db1c9346c45d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -118,6 +118,7 @@ Documentation :caption: Community community/meetups + community/sponsors Indices and tables ================== From 58a235b3b59a1aca2f92b4133ff81b53b54b2cb5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 21 May 2024 12:06:10 -0400 Subject: [PATCH 014/154] [CI/Build] Codespell ignore `build/` directory (#4945) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1c61a9e955b61..96f78c37cfefb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ exclude = [ [tool.codespell] ignore-words-list = "dout, te, indicies" -skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data" +skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" [tool.isort] use_parentheses = true From 253d8fb42125cc4d417e7e16b9a68f1312dc86c8 Mon Sep 17 00:00:00 2001 From: Kante Yin Date: Wed, 22 May 2024 00:30:52 +0800 Subject: [PATCH 015/154] [Bugfix] Fix flag name for `max_seq_len_to_capture` (#4935) Signed-off-by: kerthcet --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 07a4d743c2dd2..8019888df9acb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -356,9 +356,9 @@ def add_cli_args( help='Maximum context length covered by CUDA ' 'graphs. When a sequence has context length ' 'larger than this, we fall back to eager mode. ' - '(DEPRECATED. Use --max-seq_len-to-capture instead' + '(DEPRECATED. 
Use --max-seq-len-to-capture instead' ')') - parser.add_argument('--max-seq_len-to-capture', + parser.add_argument('--max-seq-len-to-capture', type=int, default=EngineArgs.max_seq_len_to_capture, help='Maximum sequence length covered by CUDA ' From f744125b82f801bb95d0718144cf9756bc4ac117 Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Wed, 22 May 2024 03:33:25 +0800 Subject: [PATCH 016/154] [Bugfix][Kernel] Add head size check for attention backend selection (#4944) --- vllm/attention/backends/flash_attn.py | 12 ++++++++---- vllm/attention/selector.py | 16 +++++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 0361dd3bd4ead..0f4568070cfc4 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -9,11 +9,13 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) -_SUPPORTED_HEAD_SIZES = [32, 64, 96, 128, 160, 192, 224, 256] - class FlashAttentionBackend(AttentionBackend): + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + @staticmethod def get_name() -> str: return "flash-attn" @@ -237,10 +239,12 @@ def __init__( # paged KV cache. raise ValueError( "Sliding window is not supported in FlashAttention.") - if head_size not in _SUPPORTED_HEAD_SIZES: + + support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() + if head_size not in support_head_sizes: raise ValueError( f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {_SUPPORTED_HEAD_SIZES}.") + f"Supported head sizes are: {support_head_sizes}.") def forward( self, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 5140c3cc86a31..51c25a81b4130 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -34,11 +34,21 @@ def get_attn_backend( sliding_window, dtype, kv_cache_dtype, block_size) if backend == _Backend.FLASH_ATTN: - logger.info("Using FlashAttention-2 backend.") from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) - return FlashAttentionBackend - elif backend == _Backend.XFORMERS: + + # We check it here not in _which_attn_to_use because we cannot know + # the head size until we import FlashAttentionBackend. + supported_head_sizes = FlashAttentionBackend.get_supported_head_sizes() + if head_size in supported_head_sizes: + logger.info("Using FlashAttention-2 backend.") + return FlashAttentionBackend + logger.info( + "Cannot use FlashAttention-2 backend for head size %d. 
" + "Using XFormers backend instead.", head_size) + backend = _Backend.XFORMERS + + if backend == _Backend.XFORMERS: logger.info("Using XFormers backend.") from vllm.attention.backends.xformers import ( # noqa: F401 XFormersBackend) From c1672a94e6b47224c7cffe8857ed30c9450fdd89 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Wed, 22 May 2024 05:32:35 +0000 Subject: [PATCH 017/154] [Frontend] Dynamic RoPE scaling (#4638) --- tests/test_config.py | 56 ++++++++++++++++++++++++++++++- vllm/config.py | 7 +++- vllm/engine/arg_utils.py | 35 +++++++++---------- vllm/engine/llm_engine.py | 9 ++--- vllm/transformers_utils/config.py | 10 +++++- 5 files changed, 90 insertions(+), 27 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 19db10630bbae..6bc51a53dc07c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -36,4 +36,58 @@ def test_get_sliding_window(): assert mistral_model_config.get_sliding_window() is None mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW - assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW \ No newline at end of file + assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW + + +def test_rope_scaling(): + TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} + LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} + + llama_model_config = ModelConfig( + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None + assert llama_model_config.max_model_len == 8192 + + llama_model_config = ModelConfig( + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + rope_scaling=TEST_ROPE_SCALING, + ) + assert getattr(llama_model_config.hf_config, "rope_scaling", + None) == TEST_ROPE_SCALING + assert llama_model_config.max_model_len == 16384 + + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + assert getattr(longchat_model_config.hf_config, "rope_scaling", + None) == LONGCHAT_ROPE_SCALING + assert longchat_model_config.max_model_len == 16384 + + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + rope_scaling=TEST_ROPE_SCALING, + ) + assert getattr(longchat_model_config.hf_config, "rope_scaling", + None) == TEST_ROPE_SCALING + assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config.py b/vllm/config.py index d1212bc7f81d4..773655aa6c793 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,6 +45,9 @@ class ModelConfig: code_revision: The specific revision to use for the model code on Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. + rope_scaling: Dictionary containing the scaling configuration for the + RoPE embeddings. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
@@ -84,6 +87,7 @@ def __init__( seed: int, revision: Optional[str] = None, code_revision: Optional[str] = None, + rope_scaling: Optional[dict] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, quantization: Optional[str] = None, @@ -104,6 +108,7 @@ def __init__( self.seed = seed self.revision = revision self.code_revision = code_revision + self.rope_scaling = rope_scaling self.tokenizer_revision = tokenizer_revision self.quantization = quantization self.quantization_param_path = quantization_param_path @@ -120,7 +125,7 @@ def __init__( self.skip_tokenizer_init = skip_tokenizer_init self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision) + code_revision, rope_scaling) self.hf_text_config = get_hf_text_config(self.hf_config) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) self.max_model_len = _get_and_verify_max_len(self.hf_text_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8019888df9acb..803e1836e654e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -2,6 +2,7 @@ import argparse import dataclasses +import json from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -51,6 +52,7 @@ class EngineArgs: disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None + rope_scaling: Optional[dict] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None # UPSTREAM SYNC: keep sparsity argument @@ -345,6 +347,11 @@ def add_cli_args( 'None, we first check the `sparsity_config` attribute ' 'in the model config file. If that is None we assume ' 'the model weights are dense') + parser.add_argument('--rope-scaling', + default=None, + type=json.loads, + help='RoPE scaling configuration in JSON format. ' + 'For example, {"type":"dynamic","factor":2.0}') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. 
If False, ' @@ -561,26 +568,14 @@ def from_cli_args(cls, args: argparse.Namespace): def create_engine_config(self, ) -> EngineConfig: device_config = DeviceConfig(self.device) model_config = ModelConfig( - self.model, - self.tokenizer, - self.tokenizer_mode, - self.trust_remote_code, - self.dtype, - self.seed, - self.revision, - self.code_revision, - self.tokenizer_revision, - self.max_model_len, - self.quantization, - self.quantization_param_path, - # UPSTREAM SYNC: keep sparsity argument - self.sparsity, - self.enforce_eager, - self.max_context_len_to_capture, - self.max_seq_len_to_capture, - self.max_logprobs, - self.skip_tokenizer_init, - self.served_model_name) + self.model, self.tokenizer, self.tokenizer_mode, + self.trust_remote_code, self.dtype, self.seed, self.revision, + self.code_revision, self.rope_scaling, + self.tokenizer_revision, self.max_model_len, + self.quantization, self.quantization_param_path, + self.enforce_eager, self.max_context_len_to_capture, + self.max_seq_len_to_capture, self.max_logprobs, + self.skip_tokenizer_init, self.served_model_name) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4e84a19198021..db26779c86b51 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -104,10 +104,10 @@ def __init__( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, " - "max_seq_len=%d, download_dir=%r, load_format=%s, " - "tensor_parallel_size=%d, disable_custom_all_reduce=%s, " - "quantization=%s, sparsity=%s, " + "rope_scaling=%r, tokenizer_revision=%s, " + "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " + "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " + "disable_custom_all_reduce=%s, quantization=%s, sparsity=%s, " "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, seed=%d, served_model_name=%s)", @@ -118,6 +118,7 @@ def __init__( model_config.skip_tokenizer_init, model_config.tokenizer_mode, model_config.revision, + model_config.rope_scaling, model_config.tokenizer_revision, model_config.trust_remote_code, model_config.dtype, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1756c91a612f0..f36d84dbdf7f9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -2,9 +2,12 @@ from transformers import AutoConfig, PretrainedConfig +from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MPTConfig, RWConfig) +logger = init_logger(__name__) + _CONFIG_REGISTRY: Dict[str, PretrainedConfig] = { "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, @@ -18,7 +21,8 @@ def get_config(model: str, trust_remote_code: bool, revision: Optional[str] = None, - code_revision: Optional[str] = None) -> PretrainedConfig: + code_revision: Optional[str] = None, + rope_scaling: Optional[dict] = None) -> PretrainedConfig: try: config = AutoConfig.from_pretrained( model, @@ -41,6 +45,10 @@ def get_config(model: str, config = config_class.from_pretrained(model, revision=revision, code_revision=code_revision) + if rope_scaling is not None: + logger.info("Updating rope_scaling from %r to %r", + getattr(config, "rope_scaling", None), rope_scaling) + config.update({"rope_scaling": 
rope_scaling}) return config From 4b6c96163bb9749d4f84d1158970dd0535e06bda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 22 May 2024 03:18:41 -0400 Subject: [PATCH 018/154] [CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) --- .clang-format | 26 + .github/workflows/clang-format.yml | 42 + csrc/activation_kernels.cu | 139 +- csrc/attention/attention_generic.cuh | 19 +- csrc/attention/attention_kernels.cu | 636 ++- csrc/attention/attention_utils.cuh | 11 +- csrc/attention/dtype_bfloat16.cuh | 74 +- csrc/attention/dtype_float16.cuh | 92 +- csrc/attention/dtype_float32.cuh | 88 +- csrc/attention/dtype_fp8.cuh | 32 +- csrc/cache.h | 44 +- csrc/cache_kernels.cu | 288 +- csrc/cpu/activation.cpp | 60 +- csrc/cpu/attention.cpp | 411 +- csrc/cpu/cache.cpp | 53 +- csrc/cpu/layernorm.cpp | 32 +- csrc/cpu/pos_encoding.cpp | 66 +- csrc/cpu/pybind.cpp | 75 +- csrc/cuda_compat.h | 9 +- csrc/cuda_utils.h | 7 +- csrc/cuda_utils_kernels.cu | 40 +- csrc/custom_all_reduce.cu | 55 +- csrc/custom_all_reduce.cuh | 105 +- csrc/custom_all_reduce_test.cu | 38 +- csrc/dispatch_utils.h | 42 +- csrc/layernorm_kernels.cu | 242 +- csrc/moe/moe_ops.cpp | 3 +- csrc/moe/moe_ops.h | 8 +- csrc/moe_align_block_size_kernels.cu | 211 +- csrc/ops.h | 330 +- csrc/pos_encoding_kernels.cu | 229 +- csrc/pybind.cpp | 142 +- csrc/quantization/aqlm/gemm_kernels.cu | 536 +-- csrc/quantization/awq/dequantize.cuh | 138 +- csrc/quantization/awq/gemm_kernels.cu | 611 +-- .../cutlass_w8a8/scaled_mm_dq_c2x.cu | 38 +- .../cutlass_w8a8/scaled_mm_dq_c3x.cu | 22 +- .../cutlass_w8a8/scaled_mm_dq_entry.cu | 47 +- csrc/quantization/fp8/amd/hip_float8.h | 216 +- csrc/quantization/fp8/amd/hip_float8_impl.h | 520 +-- csrc/quantization/fp8/amd/quant_utils.cuh | 711 ++-- csrc/quantization/fp8/common.cu | 86 +- csrc/quantization/fp8/nvidia/quant_utils.cuh | 138 +- csrc/quantization/gptq/compat.cuh | 70 +- csrc/quantization/gptq/matrix_view.cuh | 503 +-- csrc/quantization/gptq/q_gemm.cu | 3441 ++++++++--------- csrc/quantization/gptq/qdq_2.cuh | 107 +- csrc/quantization/gptq/qdq_3.cuh | 246 +- csrc/quantization/gptq/qdq_4.cuh | 203 +- csrc/quantization/gptq/qdq_8.cuh | 34 +- csrc/quantization/gptq/qdq_util.cuh | 58 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 696 ++-- csrc/quantization/gptq_marlin/gptq_marlin.cuh | 50 +- .../gptq_marlin/gptq_marlin_dtypes.cuh | 89 +- .../gptq_marlin/gptq_marlin_repack.cu | 94 +- .../marlin/dense/marlin_cuda_kernel.cu | 460 ++- csrc/quantization/marlin/sparse/common/base.h | 12 +- csrc/quantization/marlin/sparse/common/mem.h | 64 +- csrc/quantization/marlin/sparse/common/mma.h | 107 +- .../marlin/sparse/marlin_24_cuda_kernel.cu | 446 ++- .../squeezellm/quant_cuda_kernel.cu | 63 +- csrc/reduction_utils.cuh | 20 +- format.sh | 57 +- requirements-dev.txt | 1 + 64 files changed, 6571 insertions(+), 6962 deletions(-) create mode 100644 .clang-format create mode 100644 .github/workflows/clang-format.yml diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000000..7f9e6d720fae5 --- /dev/null +++ b/.clang-format @@ -0,0 +1,26 @@ +BasedOnStyle: Google +UseTab: Never +IndentWidth: 2 +ColumnLimit: 80 + +# Force pointers to the type for C++. 
+DerivePointerAlignment: false +PointerAlignment: Left + +# Reordering #include statements can (and currently will) introduce errors +SortIncludes: false + +# Style choices +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +IndentPPDirectives: BeforeHash + +IncludeCategories: + - Regex: '^<' + Priority: 4 + - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' + Priority: 3 + - Regex: '^"(qoda|\.\.)/' + Priority: 2 + - Regex: '.*' + Priority: 1 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 0000000000000..e9b6e28fa6bcb --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,42 @@ +name: clang-format + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + clang-format: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install clang-format==18.1.5 + - name: Running clang-format + run: | + EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu' + 'csrc/punica/bgmv/bgmv_config.h' + 'csrc/punica/bgmv/bgmv_impl.cuh' + 'csrc/punica/bgmv/vec_dtypes.cuh' + 'csrc/punica/punica_ops.cu' + 'csrc/punica/type_convert.h' + ) + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ + | xargs clang-format --dry-run --Werror \ No newline at end of file diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 24d972702c858..867f63f12de4b 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -10,11 +10,11 @@ namespace vllm { // Activation and gating kernel template. -template +template __global__ void act_and_mul_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] - const int d) { + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d) { const int64_t token_idx = blockIdx.x; for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); @@ -23,72 +23,66 @@ __global__ void act_and_mul_kernel( } } -template +template __device__ __forceinline__ T silu_kernel(const T& x) { // x * sigmoid(x) - return (T) (((float) x) / (1.0f + expf((float) -x))); + return (T)(((float)x) / (1.0f + expf((float)-x))); } -template +template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. // Refer to: // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 - const float f = (float) x; + const float f = (float)x; constexpr float ALPHA = M_SQRT1_2; - return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); + return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); } -template +template __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { // Equivalent to PyTorch GELU with 'tanh' approximation. 
// Refer to: // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 - const float f = (float) x; + const float f = (float)x; constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; constexpr float KAPPA = 0.044715; float x_cube = f * f * f; float inner = BETA * (f + KAPPA * x_cube); - return (T) (0.5f * f * (1.0f + ::tanhf(inner))); + return (T)(0.5f * f * (1.0f + ::tanhf(inner))); } -} // namespace vllm +} // namespace vllm // Launch activation and gating kernel. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), \ - "act_and_mul_kernel", \ - [&] { \ - vllm::act_and_mul_kernel><<>>( \ - out.data_ptr(), \ - input.data_ptr(), \ - d); \ - }); - -void silu_and_mul( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); + +void silu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] { LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); } -void gelu_and_mul( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] +void gelu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] { LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } -void gelu_tanh_and_mul( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] +void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] { LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); } @@ -96,11 +90,11 @@ void gelu_tanh_and_mul( namespace vllm { // Element-wise activation kernel template. -template +template __global__ void activation_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., d] - const int d) { + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., d] + const int d) { const int64_t token_idx = blockIdx.x; for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); @@ -108,54 +102,49 @@ __global__ void activation_kernel( } } -} // namespace vllm +} // namespace vllm // Launch element-wise activation kernel. 
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ - int d = input.size(-1); \ - int64_t num_tokens = input.numel() / d; \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), \ - "activation_kernel", \ - [&] { \ - vllm::activation_kernel><<>>( \ - out.data_ptr(), \ - input.data_ptr(), \ - d); \ - }); +#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ + int d = input.size(-1); \ + int64_t num_tokens = input.numel() / d; \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ + vllm::activation_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); namespace vllm { -template +template __device__ __forceinline__ T gelu_new_kernel(const T& x) { - const float x3 = (float) (x * x * x); - const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); - return ((T) 0.5) * x * (((T) 1.0) + t); + const float x3 = (float)(x * x * x); + const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3)))); + return ((T)0.5) * x * (((T)1.0) + t); } -template +template __device__ __forceinline__ T gelu_fast_kernel(const T& x) { - const float f = (float) x; - const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); - return ((T) 0.5) * x * (((T) 1.0) + t); + const float f = (float)x; + const T t = + (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x)); + return ((T)0.5) * x * (((T)1.0) + t); } -} // namespace vllm +} // namespace vllm -void gelu_new( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] +void gelu_new(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); } -void gelu_fast( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] +void gelu_fast(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh index 31fb401cbe2c1..62409c0cce93e 100644 --- a/csrc/attention/attention_generic.cuh +++ b/csrc/attention/attention_generic.cuh @@ -1,5 +1,6 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -22,31 +23,31 @@ namespace vllm { // A vector type to store Q, K, V elements. -template +template struct Vec {}; // A vector type to store FP32 accumulators. -template +template struct FloatVec {}; // Template vector operations. 
-template +template inline __device__ Acc mul(A a, B b); -template +template inline __device__ float sum(T v); -template +template inline __device__ float dot(T a, T b) { return sum(mul(a, b)); } -template +template inline __device__ float dot(T a, T b) { return sum(mul(a, b)); } -template +template inline __device__ void zero(T& dst) { constexpr int WORDS = sizeof(T) / 4; union { @@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) { dst = tmp.raw; } -} // namespace vllm +} // namespace vllm diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 41b337dd91d36..d6203174e7275 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -1,5 +1,6 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -27,15 +28,15 @@ #ifdef USE_ROCM #include #include "../quantization/fp8/amd/quant_utils.cuh" - typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat16 __nv_bfloat16; #else #include "../quantization/fp8/nvidia/quant_utils.cuh" #endif #ifndef USE_ROCM -#define WARP_SIZE 32 + #define WARP_SIZE 32 #else -#define WARP_SIZE warpSize + #define WARP_SIZE warpSize #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -45,7 +46,7 @@ namespace vllm { // Utility function for attention softmax. -template +template inline __device__ float block_sum(float* red_smem, float sum) { // Decompose the thread index into warp / lane. int warp = threadIdx.x / WARP_SIZE; @@ -82,31 +83,28 @@ inline __device__ float block_sum(float* red_smem, float sum) { // TODO(woosuk): Merge the last two dimensions of the grid. // Grid: (num_heads, num_seqs, max_num_partitions). -template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - vllm::Fp8KVCacheDataType KV_DTYPE, - int PARTITION_SIZE = 0> // Zero means no partitioning. +template // Zero means no partitioning. 
__device__ void paged_attention_kernel( - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride, - const float kv_scale) { + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float kv_scale) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -118,22 +116,29 @@ __device__ void paged_attention_kernel( } const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); - const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks; + const int num_blocks_per_partition = + USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks; // [start_block_idx, end_block_idx) is the range of blocks to process. - const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0; - const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks); + const int start_block_idx = + USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0; + const int end_block_idx = + MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks); const int num_blocks = end_block_idx - start_block_idx; // [start_token_idx, end_token_idx) is the range of tokens to process. 
const int start_token_idx = start_block_idx * BLOCK_SIZE; - const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len); + const int end_token_idx = + MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len); const int num_tokens = end_token_idx - start_token_idx; constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); - constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS + constexpr int NUM_THREAD_GROUPS = + NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE + // divides NUM_THREADS assert(NUM_THREADS % THREAD_GROUP_SIZE == 0); - constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); + constexpr int NUM_TOKENS_PER_THREAD_GROUP = + DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int thread_idx = threadIdx.x; const int warp_idx = thread_idx / WARP_SIZE; @@ -143,13 +148,14 @@ __device__ void paged_attention_kernel( const int num_heads = gridDim.x; const int num_queries_per_kv = num_heads / num_kv_heads; const int kv_head_idx = head_idx / num_queries_per_kv; - const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx]; + const float alibi_slope = + alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx]; // A vector type to store a part of a key or a query. - // The vector size is configured in such a way that the threads in a thread group - // fetch or compute 16 bytes at a time. - // For example, if the size of a thread group is 4 and the data type is half, - // then the vector size is 16 / (4 * sizeof(half)) == 2. + // The vector size is configured in such a way that the threads in a thread + // group fetch or compute 16 bytes at a time. For example, if the size of a + // thread group is 4 and the data type is half, then the vector size is 16 / + // (4 * sizeof(half)) == 2. constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); using K_vec = typename Vec::Type; using Q_vec = typename Vec::Type; @@ -163,18 +169,21 @@ __device__ void paged_attention_kernel( // Load the query to registers. // Each thread in a thread group has a different part of the query. - // For example, if the the thread group size is 4, then the first thread in the group - // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ... - // th vectors of the query, and so on. - // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous. + // For example, if the the thread group size is 4, then the first thread in + // the group has 0, 4, 8, ... th vectors of the query, and the second thread + // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because + // q is split from a qkv tensor, it may not be contiguous. 
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; #pragma unroll - for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) { + for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; + i += NUM_THREAD_GROUPS) { const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; - q_vecs[thread_group_offset][i] = *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); + q_vecs[thread_group_offset][i] = + *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); } - __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs + __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a + // memory wall right before we use q_vecs // Memory planning. extern __shared__ char shared_mem[]; @@ -193,44 +202,50 @@ __device__ void paged_attention_kernel( // Each thread group in a warp fetches a key from the block, and computes // dot product with the query. const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; - for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) { - // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64 - // because int32 can lead to overflow when this variable is multiplied by large numbers - // (e.g., kv_block_stride). - const int64_t physical_block_number = static_cast(block_table[block_idx]); + for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; + block_idx += NUM_WARPS) { + // NOTE(woosuk): The block number is stored in int32. However, we cast it to + // int64 because int32 can lead to overflow when this variable is multiplied + // by large numbers (e.g., kv_block_stride). + const int64_t physical_block_number = + static_cast(block_table[block_idx]); // Load a key to registers. // Each thread in a thread group has a different part of the key. - // For example, if the the thread group size is 4, then the first thread in the group - // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th - // vectors of the key, and so on. + // For example, if the the thread group size is 4, then the first thread in + // the group has 0, 4, 8, ... th vectors of the key, and the second thread + // has 1, 5, 9, ... th vectors of the key, and so on. for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { - const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; + const int physical_block_offset = + (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; K_vec k_vecs[NUM_VECS_PER_THREAD]; #pragma unroll for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { - const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; + const cache_t* k_ptr = + k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + physical_block_offset * x; const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { - k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = *reinterpret_cast( + k_ptr + offset1 * BLOCK_SIZE * x + offset2); } else { // Vector conversion from Quant_vec to K_vec. 
Quant_vec k_vec_quant = *reinterpret_cast( - k_ptr + offset1 * BLOCK_SIZE * x + offset2); - k_vecs[j] = fp8::scaled_convert(k_vec_quant, kv_scale); + k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = fp8::scaled_convert( + k_vec_quant, kv_scale); } } // Compute dot product. // This includes a reduction across the threads in the same thread group. - float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); + float qk = scale * Qk_dot::dot( + q_vecs[thread_group_offset], k_vecs); // Add the ALiBi bias if slopes are given. qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0; @@ -285,13 +300,12 @@ __device__ void paged_attention_kernel( // If partitioning is enabled, store the max logit and exp_sum. if (USE_PARTITIONING && thread_idx == 0) { - float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions - + partition_idx; + float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; *max_logits_ptr = qk_max; - float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions - + partition_idx; + float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; *exp_sums_ptr = exp_sum; } @@ -304,7 +318,8 @@ __device__ void paged_attention_kernel( constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW; - constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER); + constexpr int NUM_ROWS_PER_THREAD = + DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER); // NOTE(woosuk): We use FP32 for the accumulator for better accuracy. float accs[NUM_ROWS_PER_THREAD]; @@ -315,18 +330,21 @@ __device__ void paged_attention_kernel( scalar_t zero_value; zero(zero_value); - for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) { - // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64 - // because int32 can lead to overflow when this variable is multiplied by large numbers - // (e.g., kv_block_stride). - const int64_t physical_block_number = static_cast(block_table[block_idx]); + for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; + block_idx += NUM_WARPS) { + // NOTE(woosuk): The block number is stored in int32. However, we cast it to + // int64 because int32 can lead to overflow when this variable is multiplied + // by large numbers (e.g., kv_block_stride). 
+ const int64_t physical_block_number = + static_cast(block_table[block_idx]); const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; L_vec logits_vec; - from_float(logits_vec, *reinterpret_cast(logits + token_idx - start_token_idx)); + from_float(logits_vec, *reinterpret_cast(logits + token_idx - + start_token_idx)); - const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride; + const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride; #pragma unroll for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; @@ -337,14 +355,17 @@ __device__ void paged_attention_kernel( if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { v_vec = *reinterpret_cast(v_ptr + offset); } else { - V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + V_quant_vec v_quant_vec = + *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. - v_vec = fp8::scaled_convert(v_quant_vec, kv_scale); + v_vec = fp8::scaled_convert(v_quant_vec, + kv_scale); } if (block_idx == num_seq_blocks - 1) { - // NOTE(woosuk): When v_vec contains the tokens that are out of the context, - // we should explicitly zero out the values since they may contain NaNs. - // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 + // NOTE(woosuk): When v_vec contains the tokens that are out of the + // context, we should explicitly zero out the values since they may + // contain NaNs. See + // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 scalar_t* v_vec_ptr = reinterpret_cast(&v_vec); #pragma unroll for (int j = 0; j < V_VEC_SIZE; j++) { @@ -367,8 +388,8 @@ __device__ void paged_attention_kernel( accs[i] = acc; } - // NOTE(woosuk): A barrier is required because the shared memory space for logits - // is reused for the output. + // NOTE(woosuk): A barrier is required because the shared memory space for + // logits is reused for the output. __syncthreads(); // Perform reduction across warps. @@ -405,9 +426,9 @@ __device__ void paged_attention_kernel( // Write the final output. if (warp_idx == 0) { - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; + scalar_t* out_ptr = + out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE; #pragma unroll for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; @@ -419,79 +440,75 @@ __device__ void paged_attention_kernel( } // Grid: (num_heads, num_seqs, 1). 
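Both NOTE(woosuk) comments above widen the int32 block-table entry to int64 before it is multiplied by kv_block_stride. A tiny standalone example (hypothetical sizes, not taken from the kernel) of why the widening must be applied to the operand rather than to the finished product:

// Widen before multiplying: a 32-bit multiply of these two values would not
// fit in int, so casting the *result* to int64_t would already be too late.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  int block_number = 300000;        // hypothetical block-table entry (stored as int32)
  int64_t kv_block_stride = 40960;  // hypothetical elements per physical block

  int64_t offset = static_cast<int64_t>(block_number) * kv_block_stride;

  std::printf("offset = %lld, INT_MAX = %d\n", static_cast<long long>(offset), INT_MAX);
  return 0;
}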
-template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - vllm::Fp8KVCacheDataType KV_DTYPE> +template __global__ void paged_attention_v1_kernel( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride, - const float kv_scale) { - paged_attention_kernel( - /* exp_sums */ nullptr, /* max_logits */ nullptr, - out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, - max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale); + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float kv_scale) { + paged_attention_kernel( + /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, + v_cache, num_kv_heads, scale, block_tables, seq_lens, + max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, + kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs, max_num_partitions). 
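paged_attention_v1_kernel above is only a thin wrapper: it forwards to the shared paged_attention_kernel with null exp_sums/max_logits, so v1 is the single-partition special case of the same implementation that v2 partitions. A reduced sketch of that wrapper pattern (illustrative names only, compiled as a .cu file; not vLLM's actual signatures):

// One __device__ implementation shared by two __global__ entry points; the
// per-partition statistics pointer is simply null on the non-partitioned path.
__device__ void attention_impl(float* out, float* partition_stats /* may be nullptr */,
                               const float* in, int n) {
  float acc = 0.f;
  for (int i = 0; i < n; ++i) acc += in[i];
  if (partition_stats != nullptr) partition_stats[blockIdx.x] = acc;  // v2-style bookkeeping
  out[blockIdx.x] = acc;
}

__global__ void attention_v1(float* out, const float* in, int n) {
  attention_impl(out, /*partition_stats=*/nullptr, in, n);  // single pass, nothing to merge
}

__global__ void attention_v2(float* out, float* stats, const float* in, int n) {
  attention_impl(out, stats, in, n);  // stats are merged later by a reduce kernel
}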
-template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - vllm::Fp8KVCacheDataType KV_DTYPE, - int PARTITION_SIZE> +template __global__ void paged_attention_v2_kernel( - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride, - const float kv_scale) { - paged_attention_kernel( - exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, - block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, - q_stride, kv_block_stride, kv_head_stride, kv_scale); + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, // [num_heads] + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const float kv_scale) { + paged_attention_kernel( + exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, + block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, + kv_block_stride, kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs). 
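paged_attention_v2_kernel above writes, per (sequence, head, partition), a partial output into tmp_out together with that partition's max logit and exp-sum; the reduce kernel below recombines them with a numerically stable rescaling. A host-side sketch of the same merge for a single output element (hypothetical values; each partial output is assumed to be softmax-normalised within its own partition, as in the kernel):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Per-partition results: partial output, local max logit m_j, local exp-sum s_j.
  std::vector<float> partial_out = {0.20f, 0.70f};
  std::vector<float> max_logit   = {1.50f, 3.00f};
  std::vector<float> exp_sum     = {8.00f, 5.00f};

  float global_max = -INFINITY;
  for (float m : max_logit) global_max = std::max(global_max, m);

  // Rescale each partition's exp-sum to the global max, as the reduce kernel does.
  float global_exp_sum = 0.f;
  std::vector<float> rescaled(exp_sum.size());
  for (size_t j = 0; j < exp_sum.size(); ++j) {
    rescaled[j] = exp_sum[j] * std::exp(max_logit[j] - global_max);
    global_exp_sum += rescaled[j];
  }

  // Weighted combination of the partial outputs.
  float out = 0.f;
  for (size_t j = 0; j < partial_out.size(); ++j)
    out += partial_out[j] * rescaled[j] / global_exp_sum;

  std::printf("merged output = %f\n", out);
  return 0;
}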
-template< - typename scalar_t, - int HEAD_SIZE, - int NUM_THREADS, - int PARTITION_SIZE> +template __global__ void paged_attention_v2_reduce_kernel( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_partitions) { + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_partitions) { const int num_heads = gridDim.x; const int head_idx = blockIdx.x; const int seq_idx = blockIdx.y; @@ -499,9 +516,11 @@ __global__ void paged_attention_v2_reduce_kernel( const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); if (num_partitions == 1) { // No need to reduce. Only copy tmp_out to out. - scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; - const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE; + scalar_t* out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE; for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) { out_ptr[i] = tmp_out_ptr[i]; } @@ -520,8 +539,9 @@ __global__ void paged_attention_v2_reduce_kernel( // Load max logits to shared memory. float* shared_max_logits = reinterpret_cast(shared_mem); - const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions; + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; float max_logit = -FLT_MAX; for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { const float l = max_logits_ptr[i]; @@ -550,9 +570,11 @@ __global__ void paged_attention_v2_reduce_kernel( max_logit = VLLM_SHFL_SYNC(max_logit, 0); // Load rescaled exp sums to shared memory. - float* shared_exp_sums = reinterpret_cast(shared_mem + sizeof(float) * num_partitions); - const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions; + float* shared_exp_sums = + reinterpret_cast(shared_mem + sizeof(float) * num_partitions); + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; float global_exp_sum = 0.0f; for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { float l = shared_max_logits[i]; @@ -565,61 +587,45 @@ __global__ void paged_attention_v2_reduce_kernel( const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f); // Aggregate tmp_out to out. 
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE; - scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE; + scalar_t* out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; #pragma unroll for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) { float acc = 0.0f; for (int j = 0; j < num_partitions; ++j) { - acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum; + acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * + inv_global_exp_sum; } from_float(out_ptr[i], acc); } } -} // namespace vllm - -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ - vllm::paged_attention_v1_kernel<<>>( \ - out_ptr, \ - query_ptr, \ - key_cache_ptr, \ - value_cache_ptr, \ - num_kv_heads, \ - scale, \ - block_tables_ptr, \ - seq_lens_ptr, \ - max_num_blocks_per_seq, \ - alibi_slopes_ptr, \ - q_stride, \ - kv_block_stride, \ - kv_head_stride, \ - kv_scale); +} // namespace vllm + +#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel< \ + T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE>), \ + shared_mem_size); \ + vllm::paged_attention_v1_kernel \ + <<>>( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ + scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ + kv_scale); // TODO(woosuk): Tune NUM_THREADS. -template< - typename T, - typename CACHE_T, - int BLOCK_SIZE, - vllm::Fp8KVCacheDataType KV_DTYPE, - int NUM_THREADS = 128> +template void paged_attention_v1_launcher( - torch::Tensor& out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& seq_lens, - int max_seq_len, - const c10::optional& alibi_slopes, - float kv_scale) { + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -632,9 +638,10 @@ void paged_attention_v1_launcher( assert(head_size % thread_group_size == 0); // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = alibi_slopes ? - reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; + const float* alibi_slopes_ptr = + alibi_slopes + ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; T* out_ptr = reinterpret_cast(out.data_ptr()); T* query_ptr = reinterpret_cast(query.data_ptr()); @@ -644,7 +651,8 @@ void paged_attention_v1_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; + int padded_max_seq_len = + DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; int logits_size = padded_max_seq_len * sizeof(float); int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len @@ -683,19 +691,10 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ - paged_attention_v1_launcher( \ - out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - seq_lens, \ - max_seq_len, \ - alibi_slopes, \ - kv_scale); +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ + paged_attention_v1_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes, kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. @@ -716,74 +715,45 @@ void paged_attention_v1_launcher( } void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, - float kv_scale) { - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, \ - max_logits_ptr, \ - tmp_out_ptr, \ - query_ptr, \ - key_cache_ptr, \ - value_cache_ptr, \ - num_kv_heads, \ - scale, \ - block_tables_ptr, \ - seq_lens_ptr, \ - max_num_blocks_per_seq, \ - alibi_slopes_ptr, \ - q_stride, \ - kv_block_stride, \ - kv_head_stride, \ - kv_scale); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, \ - exp_sums_ptr, \ - max_logits_ptr, \ - tmp_out_ptr, \ - seq_lens_ptr, \ - max_num_partitions); - -template< - typename T, - typename CACHE_T, - int BLOCK_SIZE, - vllm::Fp8KVCacheDataType KV_DTYPE, - int NUM_THREADS = 128, - int PARTITION_SIZE = 512> + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int num_kv_heads, // [num_heads] + float scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int block_size, int max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale){ + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE)} +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, 
tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, kv_scale); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template void paged_attention_v2_launcher( - torch::Tensor& out, - torch::Tensor& exp_sums, - torch::Tensor& max_logits, - torch::Tensor& tmp_out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& seq_lens, - int max_seq_len, - const c10::optional& alibi_slopes, - float kv_scale) { + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -796,9 +766,10 @@ void paged_attention_v2_launcher( assert(head_size % thread_group_size == 0); // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = alibi_slopes ? - reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; T* out_ptr = reinterpret_cast(out.data_ptr()); float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); @@ -853,59 +824,50 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ - paged_attention_v2_launcher( \ - out, \ - exp_sums, \ - max_logits, \ - tmp_out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - seq_lens, \ - max_seq_len, \ - alibi_slopes, \ - kv_scale); +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
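The NOTE above (repeated for both launchers) points out that only block sizes 8, 16 and 32 are compiled in, to keep build time and binary size down. Because block_size is a run-time value, the CALL_V1_/CALL_V2_LAUNCHER_BLOCK_SIZE macros pick the matching compile-time specialisation with a switch; a stripped-down sketch of that dispatch shape (hypothetical names, no real kernel launched):

#include <stdexcept>

template <int BLOCK_SIZE>
void launch_attention() {
  // a kernel specialised for BLOCK_SIZE would be launched here
}

void launch_attention_dispatch(int block_size) {
  switch (block_size) {
    case 8:  launch_attention<8>();  break;
    case 16: launch_attention<16>(); break;
    case 32: launch_attention<32>(); break;
    default: throw std::invalid_argument("Unsupported block size");
  }
}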
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, - float kv_scale) { - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, CALL_V2_LAUNCHER_BLOCK_SIZE) + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int num_kv_heads, // [num_heads] + float scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int block_size, int max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale) { + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) } #undef WARP_SIZE diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh index ff64c4bd8f80c..cdcee42748998 100644 --- a/csrc/attention/attention_utils.cuh +++ b/csrc/attention/attention_utils.cuh @@ -1,5 +1,6 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -26,7 +27,7 @@ namespace vllm { // Q*K^T operation. 
-template +template inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { using A_vec = typename FloatVec::Type; // Compute the parallel products for Q*K^T (treat vector lanes separately). @@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { return qk; } -template +template struct Qk_dot { - template + template static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { return qk_dot_(q, k); } }; -} // namespace vllm +} // namespace vllm diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh index 31e0cee01d2e1..3cdcb95e08099 100644 --- a/csrc/attention/dtype_bfloat16.cuh +++ b/csrc/attention/dtype_bfloat16.cuh @@ -1,6 +1,8 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -28,8 +30,8 @@ #include #include - typedef __hip_bfloat162 __nv_bfloat162; - typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat162 __nv_bfloat162; +typedef __hip_bfloat16 __nv_bfloat16; #endif #include @@ -50,37 +52,37 @@ struct bf16_8_t { }; // BF16 vector types for Q, K, V. -template<> +template <> struct Vec<__nv_bfloat16, 1> { using Type = __nv_bfloat16; }; -template<> +template <> struct Vec<__nv_bfloat16, 2> { using Type = __nv_bfloat162; }; -template<> +template <> struct Vec<__nv_bfloat16, 4> { using Type = bf16_4_t; }; -template<> +template <> struct Vec<__nv_bfloat16, 8> { using Type = bf16_8_t; }; // FP32 accumulator vector types corresponding to Vec. -template<> +template <> struct FloatVec<__nv_bfloat16> { using Type = float; }; -template<> +template <> struct FloatVec<__nv_bfloat162> { using Type = float2; }; -template<> +template <> struct FloatVec { using Type = Float4_; }; -template<> +template <> struct FloatVec { using Type = Float8_; }; @@ -108,9 +110,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { assert(false); #else #ifndef USE_ROCM - return a + b; + return a + b; #else - return __hadd(a, b); + return __hadd(a, b); #endif #endif } @@ -161,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) { } // Vector multiplication. 
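Qk_dot::dot, used earlier with the note that it "includes a reduction across the threads in the same thread group", has to combine the per-lane partial sums produced by qk_dot_ above. A standalone sketch of the usual shuffle-based way to finish such a group reduction (generic CUDA, not the kernel's exact code):

// Butterfly (shuffle-XOR) reduction across an aligned group of lanes; no shared
// memory needed, and every lane in the group ends up holding the group's sum.
__device__ float group_sum(float partial, int group_size) {
  for (int mask = group_size / 2; mask >= 1; mask >>= 1)
    partial += __shfl_xor_sync(0xffffffffu, partial, mask);
  return partial;
}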
-template<> +template <> inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); @@ -170,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) { #endif } -template<> +template <> inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); @@ -179,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) { #endif } -template<> +template <> inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) { return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); } -template<> +template <> inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { bf16_4_t c; c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); @@ -192,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { return c; } -template<> +template <> inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { __nv_bfloat162 s = bf162bf162(a); bf16_4_t c; @@ -201,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { return c; } -template<> +template <> inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { bf16_8_t c; c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); @@ -211,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { return c; } -template<> +template <> inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { __nv_bfloat162 s = bf162bf162(a); bf16_8_t c; @@ -222,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { return c; } -template<> +template <> inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) { float fa = __bfloat162float(a); float fb = __bfloat162float(b); return fa * fb; } -template<> +template <> inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) { float2 fa = bf1622float2(a); float2 fb = bf1622float2(b); return mul(fa, fb); } -template<> +template <> inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) { return mul(bf162bf162(a), b); } -template<> +template <> inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { Float4_ fc; fc.x = mul(a.x, b.x); @@ -249,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { return fc; } -template<> +template <> inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { __nv_bfloat162 s = bf162bf162(a); Float4_ fc; @@ -258,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { return fc; } -template<> +template <> inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { Float8_ fc; fc.x = mul(a.x, b.x); @@ -268,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { return fc; } -template<> +template <> inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { __nv_bfloat162 s = bf162bf162(a); Float8_ fc; @@ -280,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { } // Vector fused multiply-add. 
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, + __nv_bfloat162 c) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); #else @@ -288,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf #endif } -inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) { +inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, + __nv_bfloat162 c) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); #else @@ -379,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) { } // Vector sum. -template<> +template <> inline __device__ float sum(__nv_bfloat16 v) { return __bfloat162float(v); } -template<> +template <> inline __device__ float sum(__nv_bfloat162 v) { float2 vf = bf1622float2(v); return vf.x + vf.y; } -template<> +template <> inline __device__ float sum(bf16_4_t v) { return sum(v.x) + sum(v.y); } -template<> +template <> inline __device__ float sum(bf16_8_t v) { return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); } @@ -448,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) { #endif } -} // namespace vllm +} // namespace vllm diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh index d3271e69cd69d..3a1815f0ed4fc 100644 --- a/csrc/attention/dtype_float16.cuh +++ b/csrc/attention/dtype_float16.cuh @@ -1,6 +1,8 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -30,37 +32,37 @@ namespace vllm { // FP16 vector types for Q, K, V. -template<> +template <> struct Vec { using Type = uint16_t; }; -template<> +template <> struct Vec { using Type = uint32_t; }; -template<> +template <> struct Vec { using Type = uint2; }; -template<> +template <> struct Vec { using Type = uint4; }; // FP32 accumulator vector types corresponding to Vec. 
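These dtype headers all follow the same trait pattern: Vec maps a scalar type and a vector width to a packed storage type, and FloatVec maps that packed type to the FP32 type used for accumulation, which is what lets the attention kernel stay type-generic. A reduced sketch of the pattern (using __half/__half2 for readability, whereas the header above stores halves as uint16_t/uint32_t/uint2/uint4):

#include <cuda_fp16.h>  // compile as a .cu file so float2 is also available

template <typename T, int VEC_SIZE> struct Vec {};
template <> struct Vec<__half, 2> { using Type = __half2; };  // two halves in 32 bits
template <> struct Vec<float, 2>  { using Type = float2;  };

template <typename V> struct FloatVec {};
template <> struct FloatVec<__half2> { using Type = float2; };  // accumulate halves in fp32
template <> struct FloatVec<float2>  { using Type = float2; };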
-template<> +template <> struct FloatVec { using Type = float; }; -template<> +template <> struct FloatVec { using Type = float2; }; -template<> +template <> struct FloatVec { using Type = Float4_; }; -template<> +template <> struct FloatVec { using Type = Float8_; }; @@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) { return b; #else union { - uint32_t u32; - uint16_t u16[2]; + uint32_t u32; + uint16_t u16[2]; } tmp; tmp.u16[0] = a; tmp.u16[1] = a; @@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) { } tmp; #ifndef USE_ROCM #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" + : "=r"(tmp.u32) + : "f"(f.y), "f"(f.x)); #else - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); #endif #else tmp.u16[0] = float_to_half(f.x); @@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) { } // Vector multiplication. -template<> +template <> inline __device__ uint16_t mul(uint16_t a, uint16_t b) { uint16_t c; #ifndef USE_ROCM @@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) { return c; } -template<> +template <> inline __device__ uint32_t mul(uint32_t a, uint32_t b) { uint32_t c; #ifndef USE_ROCM @@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) { return c; } -template<> +template <> inline __device__ uint32_t mul(uint16_t a, uint32_t b) { return mul(h0_h0(a), b); } -template<> +template <> inline __device__ uint2 mul(uint2 a, uint2 b) { uint2 c; c.x = mul(a.x, b.x); @@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) { return c; } -template<> +template <> inline __device__ uint2 mul(uint16_t a, uint2 b) { uint32_t s = h0_h0(a); uint2 c; @@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) { return c; } -template<> +template <> inline __device__ uint4 mul(uint4 a, uint4 b) { uint4 c; c.x = mul(a.x, b.x); @@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) { return c; } -template<> +template <> inline __device__ uint4 mul(uint16_t a, uint4 b) { uint32_t s = h0_h0(a); uint4 c; @@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) { return c; } -template<> +template <> inline __device__ float mul(uint16_t a, uint16_t b) { float fa = half_to_float(a); float fb = half_to_float(b); return fa * fb; } -template<> +template <> inline __device__ float2 mul(uint32_t a, uint32_t b) { float2 fa = half2_to_float2(a); float2 fb = half2_to_float2(b); return mul(fa, fb); } -template<> +template <> inline __device__ float2 mul(uint16_t a, uint32_t b) { return mul(h0_h0(a), b); } -template<> +template <> inline __device__ Float4_ mul(uint2 a, uint2 b) { Float4_ fc; fc.x = mul(a.x, b.x); @@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) { return fc; } -template<> +template <> inline __device__ Float4_ mul(uint16_t a, uint2 b) { uint32_t s = h0_h0(a); Float4_ fc; @@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) { return fc; } -template<> +template <> inline __device__ Float8_ mul(uint4 a, uint4 b) { Float8_ fc; fc.x = mul(a.x, b.x); @@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) { return fc; } -template<> +template <> inline 
__device__ Float8_ mul(uint16_t a, uint4 b) { uint32_t s = h0_h0(a); Float8_ fc; @@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) { inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { uint32_t d; #ifndef USE_ROCM - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); #else - asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); + asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" + : "=v"(d) + : "v"(a), "v"(b), "v"(c)); #endif return d; } @@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) { } // Vector sum. -template<> +template <> inline __device__ float sum(uint16_t v) { return half_to_float(v); } -template<> +template <> inline __device__ float sum(uint32_t v) { float2 tmp = half2_to_float2(v); return tmp.x + tmp.y; } -template<> +template <> inline __device__ float sum(uint2 v) { uint32_t c = add(v.x, v.y); return sum(c); } -template<> +template <> inline __device__ float sum(uint4 v) { uint32_t c = add(v.x, v.y); c = add(c, v.z); @@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) { } // From float16 to float32. -inline __device__ float to_float(uint16_t u) { - return half_to_float(u); -} +inline __device__ float to_float(uint16_t u) { return half_to_float(u); } -inline __device__ float2 to_float(uint32_t u) { - return half2_to_float2(u); -} +inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); } inline __device__ Float4_ to_float(uint2 u) { Float4_ tmp; @@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) { } // Zero-out a variable. -inline __device__ void zero(uint16_t& dst) { - dst = uint16_t(0); -} +inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); } -} // namespace vllm +} // namespace vllm diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh index b200d2d226eb0..7c6a686db3ba9 100644 --- a/csrc/attention/dtype_float32.cuh +++ b/csrc/attention/dtype_float32.cuh @@ -1,6 +1,8 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * and + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * @@ -38,37 +40,35 @@ struct Float8_ { }; // FP32 vector types for Q, K, V. -template<> +template <> struct Vec { using Type = float; }; -template<> +template <> struct Vec { using Type = float2; }; -template<> +template <> struct Vec { using Type = float4; }; // FP32 accumulator vector types corresponding to Vec. -template<> +template <> struct FloatVec { using Type = float; }; -template<> +template <> struct FloatVec { using Type = float2; }; -template<> +template <> struct FloatVec { using Type = float4; }; // Vector addition. 
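The packed fma overload above is written as inline PTX (fma.rn.f16x2 on NVIDIA, v_pk_fma_f16 on ROCm) so that two half-precision multiply-adds are fused into one instruction. With CUDA's half intrinsics the same operation can be expressed without assembly; a sketch (not what the header uses, and it needs a half-arithmetic-capable target such as sm_53 or newer):

#include <cuda_fp16.h>

// Two packed half FMAs at once: d.x = a.x*b.x + c.x, d.y = a.y*b.y + c.y.
__device__ __half2 packed_fma(__half2 a, __half2 b, __half2 c) {
  return __hfma2(a, b, c);
}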
-inline __device__ float add(float a, float b) { - return a + b; -} +inline __device__ float add(float a, float b) { return a + b; } inline __device__ float2 add(float2 a, float2 b) { float2 c; @@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) { } // Vector multiplication. -template<> +template <> inline __device__ float mul(float a, float b) { return a * b; } -template<> +template <> inline __device__ float2 mul(float2 a, float2 b) { float2 c; c.x = a.x * b.x; @@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) { return c; } -template<> +template <> inline __device__ float2 mul(float a, float2 b) { float2 c; c.x = a * b.x; @@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) { return c; } -template<> +template <> inline __device__ float4 mul(float4 a, float4 b) { float4 c; c.x = a.x * b.x; @@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) { return c; } -template<> +template <> inline __device__ float4 mul(float a, float4 b) { float4 c; c.x = a * b.x; @@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) { } // Vector fused multiply-add. -inline __device__ float fma(float a, float b, float c) { - return a * b + c; -} +inline __device__ float fma(float a, float b, float c) { return a * b + c; } inline __device__ float2 fma(float2 a, float2 b, float2 c) { float2 d; @@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { } // Vector sum. -template<> +template <> inline __device__ float sum(float v) { return v; } -template<> +template <> inline __device__ float sum(float2 v) { return v.x + v.y; } -template<> +template <> inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } -template<> +template <> inline __device__ float sum(Float4_ v) { return v.x.x + v.x.y + v.y.x + v.y.y; } -template<> +template <> inline __device__ float sum(Float8_ v) { return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; } // Vector dot product. -inline __device__ float dot(float a, float b) { - return a * b; -} +inline __device__ float dot(float a, float b) { return a * b; } inline __device__ float dot(float2 a, float2 b) { float2 c = mul(a, b); @@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) { } // From float to float. -inline __device__ void from_float(float& dst, float src) { - dst = src; -} +inline __device__ void from_float(float& dst, float src) { dst = src; } -inline __device__ void from_float(float2& dst, float2 src) { - dst = src; -} +inline __device__ void from_float(float2& dst, float2 src) { dst = src; } -inline __device__ void from_float(float4& dst, float4 src) { - dst = src; -} +inline __device__ void from_float(float4& dst, float4 src) { dst = src; } // From float to float. -inline __device__ float to_float(float u) { - return u; -} +inline __device__ float to_float(float u) { return u; } -inline __device__ float2 to_float(float2 u) { - return u; -} +inline __device__ float2 to_float(float2 u) { return u; } -inline __device__ float4 to_float(float4 u) { - return u; -} +inline __device__ float4 to_float(float4 u) { return u; } -inline __device__ Float4_ to_float(Float4_ u) { - return u; -} +inline __device__ Float4_ to_float(Float4_ u) { return u; } -inline __device__ Float8_ to_float(Float8_ u) { - return u; -} +inline __device__ Float8_ to_float(Float8_ u) { return u; } // Zero-out a variable. 
-inline __device__ void zero(float& dst) { - dst = 0.f; -} +inline __device__ void zero(float& dst) { dst = 0.f; } -} // namespace vllm +} // namespace vllm diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh index 2b32ce372a64f..e714e321b0beb 100644 --- a/csrc/attention/dtype_fp8.cuh +++ b/csrc/attention/dtype_fp8.cuh @@ -4,38 +4,38 @@ #include #ifdef ENABLE_FP8 -#ifndef USE_ROCM -#include -#endif // USE_ROCM -#endif // ENABLE_FP8 + #ifndef USE_ROCM + #include + #endif // USE_ROCM +#endif // ENABLE_FP8 namespace vllm { enum class Fp8KVCacheDataType { - kAuto = 0, - kFp8E4M3 = 1, - kFp8E5M2 = 2, + kAuto = 0, + kFp8E4M3 = 1, + kFp8E5M2 = 2, }; // fp8 vector types for quantization of kv cache -template<> +template <> struct Vec { - using Type = uint8_t; + using Type = uint8_t; }; -template<> +template <> struct Vec { - using Type = uint16_t; + using Type = uint16_t; }; -template<> +template <> struct Vec { - using Type = uint32_t; + using Type = uint32_t; }; -template<> +template <> struct Vec { - using Type = uint2; + using Type = uint2; }; -} // namespace vllm +} // namespace vllm diff --git a/csrc/cache.h b/csrc/cache.h index 8c176c452425e..435ae3e57f555 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -5,36 +5,24 @@ #include #include -void swap_blocks( - torch::Tensor& src, - torch::Tensor& dst, - const torch::Tensor& block_mapping); +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, + const torch::Tensor& block_mapping); -void copy_blocks( - std::vector& key_caches, - std::vector& value_caches, - const torch::Tensor& block_mapping); +void copy_blocks(std::vector& key_caches, + std::vector& value_caches, + const torch::Tensor& block_mapping); -void reshape_and_cache( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - const float kv_scale); +void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, const float kv_scale); -void reshape_and_cache_flash( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); +void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, + torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype); // Just for unittest -void convert_fp8( - torch::Tensor& dst_cache, - torch::Tensor& src_cache, - const float scale, - const std::string& kv_cache_dtype); +void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, + const float scale, const std::string& kv_cache_dtype); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index e5b74da6ad068..d924ac39b89ca 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -6,9 +6,9 @@ #include "dispatch_utils.h" #ifdef USE_ROCM -#include "quantization/fp8/amd/quant_utils.cuh" + #include "quantization/fp8/amd/quant_utils.cuh" #else -#include "quantization/fp8/nvidia/quant_utils.cuh" + #include "quantization/fp8/nvidia/quant_utils.cuh" #endif #include @@ -18,20 +18,17 @@ #ifdef USE_ROCM #include - typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat16 __nv_bfloat16; #endif -void swap_blocks( - torch::Tensor& src, - torch::Tensor& dst, - const torch::Tensor& block_mapping) { +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, 
+ const torch::Tensor& block_mapping) { torch::Device src_device = src.device(); torch::Device dst_device = dst.device(); cudaMemcpyKind memcpy_type; if (src_device.is_cuda() && dst_device.is_cuda()) { - TORCH_CHECK( - src_device.index() == dst_device.index(), - "src and dst must be on the same GPU"); + TORCH_CHECK(src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); memcpy_type = cudaMemcpyDeviceToDevice; } else if (src_device.is_cuda() && dst_device.is_cpu()) { memcpy_type = cudaMemcpyDeviceToHost; @@ -41,16 +38,17 @@ void swap_blocks( TORCH_CHECK(false, "Invalid device combination"); } - // NOTE(youkaichao): keep in mind that `block_mapping` should be + // NOTE(youkaichao): keep in mind that `block_mapping` should be // a cpu tensor, otherwise every `item` call will require a gpu-cpu // synchronization. TORCH_CHECK(block_mapping.device().is_cpu(), "block_mapping must be on CPU"); - char *src_ptr = static_cast(src.data_ptr()); - char *dst_ptr = static_cast(dst.data_ptr()); + char* src_ptr = static_cast(src.data_ptr()); + char* dst_ptr = static_cast(dst.data_ptr()); const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); - const at::cuda::OptionalCUDAGuard device_guard(src_device.is_cuda() ? src_device : dst_device); + const at::cuda::OptionalCUDAGuard device_guard( + src_device.is_cuda() ? src_device : dst_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); // NOTE(woosuk): This can be slow if the number of blocks is large. const int64_t num_blocks = block_mapping.size(0); @@ -59,29 +57,25 @@ void swap_blocks( int64_t dst_block_number = block_mapping[i][1].item(); int64_t src_offset = src_block_number * block_size_in_bytes; int64_t dst_offset = dst_block_number * block_size_in_bytes; - cudaMemcpyAsync( - dst_ptr + dst_offset, - src_ptr + src_offset, - block_size_in_bytes, - memcpy_type, - stream); + cudaMemcpyAsync(dst_ptr + dst_offset, src_ptr + src_offset, + block_size_in_bytes, memcpy_type, stream); } } namespace vllm { // Grid: (num_layers, num_pairs) -template -__global__ void copy_blocks_kernel( - int64_t* key_cache_ptrs, - int64_t* value_cache_ptrs, - const int64_t* __restrict__ block_mapping, - const int numel_per_block) { +template +__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, + int64_t* value_cache_ptrs, + const int64_t* __restrict__ block_mapping, + const int numel_per_block) { const int layer_idx = blockIdx.x; const int pair_idx = blockIdx.y; scalar_t* key_cache = reinterpret_cast(key_cache_ptrs[layer_idx]); - scalar_t* value_cache = reinterpret_cast(value_cache_ptrs[layer_idx]); + scalar_t* value_cache = + reinterpret_cast(value_cache_ptrs[layer_idx]); int64_t src_block_number = block_mapping[2 * pair_idx]; int64_t dst_block_number = block_mapping[2 * pair_idx + 1]; @@ -99,12 +93,11 @@ __global__ void copy_blocks_kernel( } } -} // namespace vllm +} // namespace vllm -void copy_blocks( - std::vector& key_caches, - std::vector& value_caches, - const torch::Tensor& block_mapping) { +void copy_blocks(std::vector& key_caches, + std::vector& value_caches, + const torch::Tensor& block_mapping) { int num_layers = key_caches.size(); TORCH_CHECK(num_layers == value_caches.size()); if (num_layers == 0) { @@ -118,8 +111,10 @@ void copy_blocks( int64_t key_cache_ptrs[num_layers]; int64_t value_cache_ptrs[num_layers]; for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - key_cache_ptrs[layer_idx] = reinterpret_cast(key_caches[layer_idx].data_ptr()); - value_cache_ptrs[layer_idx] = 
reinterpret_cast(value_caches[layer_idx].data_ptr()); + key_cache_ptrs[layer_idx] = + reinterpret_cast(key_caches[layer_idx].data_ptr()); + value_cache_ptrs[layer_idx] = + reinterpret_cast(value_caches[layer_idx].data_ptr()); } // block_mapping is a 2D tensor with shape (num_pairs, 2). @@ -127,10 +122,12 @@ void copy_blocks( // Move the data structures to the GPU. // NOTE: This synchronizes the CPU and GPU. - torch::Tensor key_cache_ptrs_tensor = torch::from_blob( - key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); - torch::Tensor value_cache_ptrs_tensor = torch::from_blob( - value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); + torch::Tensor key_cache_ptrs_tensor = + torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) + .to(cache_device); + torch::Tensor value_cache_ptrs_tensor = + torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) + .to(cache_device); // Launch the kernel. const int numel_per_block = key_caches[0][0].numel(); @@ -139,31 +136,28 @@ void copy_blocks( const at::cuda::OptionalCUDAGuard device_guard(cache_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { - vllm::copy_blocks_kernel<<>>( - key_cache_ptrs_tensor.data_ptr(), - value_cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), - numel_per_block); - })); + key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { + vllm::copy_blocks_kernel<<>>( + key_cache_ptrs_tensor.data_ptr(), + value_cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), numel_per_block); + })); } namespace vllm { -template +template __global__ void reshape_and_cache_kernel( - const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] - const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x, - const float kv_scale) { + const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] + const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] + cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, + // block_size, x] + cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, + // block_size] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int key_stride, const int value_stride, const int num_heads, + const int head_size, const int block_size, const int x, + const float kv_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -184,40 +178,39 @@ __global__ void reshape_and_cache_kernel( const int x_idx = head_offset / x; const int x_offset = head_offset % x; - const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; + const int64_t tgt_key_idx = + block_idx * num_heads * (head_size / x) * block_size * x + + head_idx * (head_size / x) * block_size * 
x + x_idx * block_size * x + + block_offset * x + x_offset; + const int64_t tgt_value_idx = + block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + head_offset * block_size + + block_offset; scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { key_cache[tgt_key_idx] = tgt_key; value_cache[tgt_value_idx] = tgt_value; } else { - key_cache[tgt_key_idx] = fp8::scaled_convert(tgt_key, kv_scale); - value_cache[tgt_value_idx] = fp8::scaled_convert(tgt_value, kv_scale); + key_cache[tgt_key_idx] = + fp8::scaled_convert(tgt_key, kv_scale); + value_cache[tgt_value_idx] = + fp8::scaled_convert(tgt_value, kv_scale); } } } -template +template __global__ void reshape_and_cache_flash_kernel( - const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] - const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, head_size] - scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, head_size] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int block_stride, - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size) { + const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] + const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] + scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, + // head_size] + scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, + // head_size] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int block_stride, const int key_stride, const int value_stride, + const int num_heads, const int head_size, const int block_size) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -232,43 +225,37 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t src_value_idx = token_idx * value_stride + i; const int head_idx = i / head_size; const int head_offset = i % head_size; - const int64_t tgt_value_idx = block_idx * block_stride - + block_offset * num_heads * head_size - + head_idx * head_size - + head_offset; + const int64_t tgt_value_idx = block_idx * block_stride + + block_offset * num_heads * head_size + + head_idx * head_size + head_offset; k_cache[tgt_value_idx] = key[src_key_idx]; v_cache[tgt_value_idx] = value[src_value_idx]; } } -} // namespace vllm +} // namespace vllm // KV_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors. // KV_DTYPE is the real data type of kv-cache. 
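reshape_and_cache_kernel above flattens (block_idx, head_idx, head_offset, block_offset) into the key cache's [num_blocks, num_heads, head_size/x, block_size, x] layout by splitting head_offset into an x-sized chunk index and an offset inside that chunk. A small standalone example of the same index arithmetic with hypothetical sizes:

#include <cstdint>
#include <cstdio>

int main() {
  const int num_heads = 8, head_size = 128, block_size = 16, x = 8;

  // One cached element: cache slot 37, head 3, element 45 of the head.
  const int64_t slot_idx = 37;
  const int64_t block_idx = slot_idx / block_size;     // physical block 2
  const int64_t block_offset = slot_idx % block_size;  // position 5 inside it
  const int head_idx = 3, head_offset = 45;
  const int x_idx = head_offset / x;                   // chunk 5 of head_size/x = 16
  const int x_offset = head_offset % x;                // element 5 inside the chunk

  const int64_t tgt_key_idx =
      block_idx * num_heads * (head_size / x) * block_size * x +
      head_idx * (head_size / x) * block_size * x +
      x_idx * block_size * x + block_offset * x + x_offset;

  std::printf("tgt_key_idx = %lld\n", static_cast<long long>(tgt_key_idx));  // 39597
  return 0;
}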
-#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ - vllm::reshape_and_cache_kernel<<>>( \ - reinterpret_cast(key.data_ptr()), \ - reinterpret_cast(value.data_ptr()), \ - reinterpret_cast(key_cache.data_ptr()), \ - reinterpret_cast(value_cache.data_ptr()), \ - slot_mapping.data_ptr(), \ - key_stride, \ - value_stride, \ - num_heads, \ - head_size, \ - block_size, \ - x, \ - kv_scale); +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), key_stride, value_stride, \ + num_heads, head_size, block_size, x, kv_scale); void reshape_and_cache( - torch::Tensor& key, // [num_tokens, num_heads, head_size] - torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, - const float kv_scale) -{ + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + torch::Tensor& slot_mapping, // [num_tokens] + const std::string& kv_cache_dtype, const float kv_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -283,17 +270,17 @@ void reshape_and_cache( const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, CALL_RESHAPE_AND_CACHE) + DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, + CALL_RESHAPE_AND_CACHE) } void reshape_and_cache_flash( - torch::Tensor& key, // [num_tokens, num_heads, head_size] - torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size] - torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size] - torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) -{ + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& slot_mapping, // [num_tokens] + const std::string& kv_cache_dtype) { // FIXME: only support auto datatype, does not support fp8 if (kv_cache_dtype != "auto") { TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); @@ -313,62 +300,47 @@ void reshape_and_cache_flash( const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), - "reshape_and_cache_flash", - [&] { - vllm::reshape_and_cache_flash_kernel<<>>( - key.data_ptr(), - value.data_ptr(), - k_cache.data_ptr(), - v_cache.data_ptr(), - slot_mapping.data_ptr(), - block_stride, - key_stride, - value_stride, - num_heads, - head_size, - block_size); - }); + key.scalar_type(), "reshape_and_cache_flash", [&] { + vllm::reshape_and_cache_flash_kernel + <<>>( + 
key.data_ptr(), value.data_ptr(), + k_cache.data_ptr(), v_cache.data_ptr(), + slot_mapping.data_ptr(), block_stride, key_stride, + value_stride, num_heads, head_size, block_size); + }); } namespace vllm { -template -__global__ void convert_fp8_kernel( - const Tin* __restrict__ src_cache, - Tout* __restrict__ dst_cache, - const float kv_scale, - const int64_t block_stride) { +template +__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, + Tout* __restrict__ dst_cache, + const float kv_scale, + const int64_t block_stride) { const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; - dst_cache[idx] = fp8::scaled_convert(src_cache[idx], kv_scale); + dst_cache[idx] = + fp8::scaled_convert(src_cache[idx], kv_scale); } } -} // namespace vllm +} // namespace vllm -#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ - vllm::convert_fp8_kernel<<>>( \ - reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst_cache.data_ptr()), \ - kv_scale, \ - block_stride); +#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ + vllm::convert_fp8_kernel<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst_cache.data_ptr()), kv_scale, block_stride); // Only for testing. -void convert_fp8( - torch::Tensor& dst_cache, - torch::Tensor& src_cache, - const float kv_scale, - const std::string& kv_cache_dtype) -{ +void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, + const float kv_scale, const std::string& kv_cache_dtype) { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU") - TORCH_CHECK( - src_device.index() == dst_device.index(), - "src and dst must be on the same GPU"); + TORCH_CHECK(src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); at::cuda::OptionalCUDAGuard device_guard(src_device); int64_t num_blocks = src_cache.size(0); @@ -398,13 +370,15 @@ void convert_fp8( } else if (src_cache.dtype() == at::ScalarType::Half) { CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kFp8E4M3); } else if (src_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, vllm::Fp8KVCacheDataType::kFp8E4M3); + CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, + vllm::Fp8KVCacheDataType::kFp8E4M3); } else if (dst_cache.dtype() == at::ScalarType::Float) { CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); } else if (dst_cache.dtype() == at::ScalarType::Half) { CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); + CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3); } } else { TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype); diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index 1bd24eb79d129..becd2ac42f17a 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -1,10 +1,10 @@ #include "cpu_types.hpp" namespace { -template -void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input, - scalar_t *__restrict__ output) { +void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input, + scalar_t* __restrict__ output) { using scalar_vec_t = vec_op::vec_t; constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); 
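
The remaining hunks in this file reformat the CPU activation entry points. For context, the gated variants further down (silu_and_mul, gelu_and_mul, gelu_tanh_and_mul) treat the input as [..., 2 * d] and multiply the activated first half of each row by the second half, writing a [..., d] output. The sketch below is a plain scalar reference for that contract, ignoring the vec_op vectorization used by the real kernel; the names are invented for illustration and are not part of this patch.

// Scalar reference for the gated activation contract (illustration only).
#include <cmath>
#include <cstddef>

inline float silu_ref(float x) { return x / (1.0f + std::exp(-x)); }

// input:  num_tokens rows of 2 * d values; output: num_tokens rows of d values
// output[t][i] = silu(input[t][i]) * input[t][d + i]
void silu_and_mul_ref(int num_tokens, int d, const float* input,
                      float* output) {
  for (int t = 0; t < num_tokens; ++t) {
    const float* x = input + static_cast<std::size_t>(t) * 2 * d;
    const float* y = x + d;
    float* out = output + static_cast<std::size_t>(t) * d;
    for (int i = 0; i < d; ++i) {
      out[i] = silu_ref(x[i]) * y[i];
    }
  }
}
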
@@ -34,13 +34,13 @@ void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input, } } -FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) { +FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 zeros(0.0); const vec_op::FP32Vec8 ones(1.0); return x / (ones + (zeros - x).exp()); } -FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) { +FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(0.79788456f); const vec_op::FP32Vec8 w2(0.044715f); @@ -50,7 +50,7 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) { return w3 * x * (ones + t); } -FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) { +FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(0.79788456f); const vec_op::FP32Vec8 w2(0.044715f); @@ -59,14 +59,14 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) { return w3 * x * (ones + t); } -FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) { +FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); const vec_op::FP32Vec8 w2(0.5); return x * w2 * (ones + (x * w1).er()); } -FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) { +FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5); const vec_op::FP32Vec8 w2(0.5); @@ -75,40 +75,36 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) { const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3); return x * w2 * (ones + inner.tanh()); } -}; // namespace +}; // namespace -void silu_and_mul(torch::Tensor &out, torch::Tensor &input) { +void silu_and_mul(torch::Tensor& out, torch::Tensor& input) { int num_tokens = input.numel() / input.size(-1); int d = input.size(-1) / 2; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "silu_and_mul_impl", [&] { - CPU_KERNEL_GUARD_IN(silu_and_mul_impl) - activation_kernel(num_tokens, d, - input.data_ptr(), - out.data_ptr()); - CPU_KERNEL_GUARD_OUT(silu_and_mul_impl) - }); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] { + CPU_KERNEL_GUARD_IN(silu_and_mul_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(silu_and_mul_impl) + }); } -void gelu_and_mul(torch::Tensor &out, // [..., d] - torch::Tensor &input) // [..., 2 * d] +void gelu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] { int num_tokens = input.numel() / input.size(-1); int d = input.size(-1) / 2; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "gelu_and_mul_impl", [&] { - CPU_KERNEL_GUARD_IN(gelu_and_mul_impl) - activation_kernel(num_tokens, d, - input.data_ptr(), - out.data_ptr()); - CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl) - }); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_and_mul_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl) + }); } -void gelu_tanh_and_mul(torch::Tensor &out, // [..., d] - torch::Tensor &input) // [..., 2 * d] +void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] { int num_tokens = input.numel() / 
input.size(-1); int d = input.size(-1) / 2; @@ -123,7 +119,7 @@ void gelu_tanh_and_mul(torch::Tensor &out, // [..., d] }); } -void gelu_new(torch::Tensor &out, torch::Tensor &input) { +void gelu_new(torch::Tensor& out, torch::Tensor& input) { int num_tokens = input.numel() / input.size(-1); int d = input.size(-1); @@ -135,7 +131,7 @@ void gelu_new(torch::Tensor &out, torch::Tensor &input) { }); } -void gelu_fast(torch::Tensor &out, torch::Tensor &input) { +void gelu_fast(torch::Tensor& out, torch::Tensor& input) { int num_tokens = input.numel() / input.size(-1); int d = input.size(-1); diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index c1d765be05598..54df69b7379d6 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -2,7 +2,8 @@ namespace { -template struct KernelVecType { +template +struct KernelVecType { using q_load_vec_type = void; using q_vec_type = void; using k_load_vec_type = void; @@ -11,7 +12,8 @@ template struct KernelVecType { using v_load_vec_type = void; }; -template <> struct KernelVecType { +template <> +struct KernelVecType { using q_load_vec_type = vec_op::FP32Vec4; using q_vec_type = vec_op::FP32Vec16; using k_load_vec_type = vec_op::FP32Vec16; @@ -21,7 +23,8 @@ template <> struct KernelVecType { }; #ifdef __AVX512BF16__ -template <> struct KernelVecType { +template <> +struct KernelVecType { using q_load_vec_type = vec_op::BF16Vec8; using q_vec_type = vec_op::BF16Vec32; using k_load_vec_type = vec_op::BF16Vec32; @@ -30,7 +33,8 @@ template <> struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else -template <> struct KernelVecType { +template <> +struct KernelVecType { using q_load_vec_type = vec_op::BF16Vec8; using q_vec_type = vec_op::FP32Vec16; using k_load_vec_type = vec_op::BF16Vec16; @@ -41,7 +45,7 @@ template <> struct KernelVecType { #endif template -FORCE_INLINE std::pair reduceSoftmax(T *data, const int size, +FORCE_INLINE std::pair reduceSoftmax(T* data, const int size, const int capacity) { T max = data[0]; for (int i = 1; i < size; ++i) { @@ -67,10 +71,11 @@ FORCE_INLINE std::pair reduceSoftmax(T *data, const int size, } template -FORCE_INLINE std::pair -reduceSoftmaxAlibi(T *data, const int size, const int capacity, - const float alibi_slope, const int start_index, - const int seq_len) { +FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, + const int capacity, + const float alibi_slope, + const int start_index, + const int seq_len) { data[0] += alibi_slope * (start_index - seq_len + 1); T max = data[0]; for (int i = 1; i < size; ++i) { @@ -98,7 +103,7 @@ reduceSoftmaxAlibi(T *data, const int size, const int capacity, } template -FORCE_INLINE void reducePartitonSoftmax(const T *max_data, T *sum_data, +FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data, const int size) { T max = max_data[0]; for (int i = 1; i < size; ++i) { @@ -132,9 +137,9 @@ struct reduceQKBlockKernel { static_assert(k_load_vec_type::get_elem_num() % x == 0); static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16); - FORCE_INLINE static void call(const scalar_t *__restrict__ q, - const scalar_t *__restrict__ k_block, - float *__restrict__ logits, float scale, + FORCE_INLINE static void call(const scalar_t* __restrict__ q, + const scalar_t* __restrict__ k_block, + float* __restrict__ logits, float scale, const int token_num) { const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; @@ -196,8 +201,8 @@ struct reduceQKBlockKernel { template -FORCE_INLINE void 
reduceValueBlock(const float *prob, const scalar_t *v_block, - acc_t &&acc) { +FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block, + acc_t&& acc) { using v_load_vec_type = typename KernelVecType::v_load_vec_type; constexpr int ELEM_NUM = v_load_vec_type::get_elem_num(); static_assert(BLOCK_SIZE == ELEM_NUM); @@ -209,27 +214,27 @@ FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block, acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec; }); } -}; // namespace +}; // namespace // Paged attention v1 namespace { template struct paged_attention_v1_impl { - static void - call(scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size] - const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads, + static void call( + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, // head_size/x, block_size, x] - const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads, + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, // head_size, block_size] - const int num_kv_heads, const float scale, - const int - *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int *__restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float *__restrict__ alibi_slopes, // [num_heads] - const int q_stride, const int kv_block_stride, const int kv_head_stride, - const int num_seqs, const int num_heads) { + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, + // max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const int num_seqs, const int num_heads) { constexpr int x = 16 / sizeof(scalar_t); const int num_queries_per_kv = num_heads / num_kv_heads; @@ -243,32 +248,31 @@ struct paged_attention_v1_impl { size_t logits_bytes = parallel_work_item_num * max_seq_len_padded * sizeof(float); - float *logits = (float *)std::aligned_alloc( - 64, logits_bytes); // Cacheline alignment for each context token. - // [parallel_work_item_num, max_seq_len_padded] + float* logits = (float*)std::aligned_alloc( + 64, logits_bytes); // Cacheline alignment for each context token. 
+ // [parallel_work_item_num, max_seq_len_padded] #pragma omp parallel for collapse(2) schedule(dynamic, 1) for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { for (int head_idx = 0; head_idx < num_heads; ++head_idx) { int seq_len = seq_lens[seq_idx]; - const int *seq_block_table = + const int* seq_block_table = block_tables + max_num_blocks_per_seq * seq_idx; const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t *__restrict__ q_vec_ptr = + const scalar_t* __restrict__ q_vec_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - const int last_block_token_num = - seq_len - (block_num - 1) * BLOCK_SIZE; - float *__restrict__ thread_block_logits = + const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; + float* __restrict__ thread_block_logits = logits + omp_get_thread_num() * max_seq_len_padded; // Compute logits for (int block_idx = 0; block_idx < block_num; ++block_idx) { const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t *__restrict__ k_block_cache_ptr = + const scalar_t* __restrict__ k_block_cache_ptr = k_cache + physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride; - float *__restrict__ head_block_logits = + float* __restrict__ head_block_logits = thread_block_logits + block_idx * BLOCK_SIZE; reduceQKBlockKernel::call( @@ -282,8 +286,7 @@ struct paged_attention_v1_impl { block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0, seq_len); } else { - reduceSoftmax(thread_block_logits, seq_len, - block_num * BLOCK_SIZE); + reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE); } // Compute value @@ -293,14 +296,14 @@ struct paged_attention_v1_impl { for (int head_part_idx = 0; head_part_idx < head_partition_num; ++head_part_idx) { vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t *__restrict__ out_ptr = + scalar_t* __restrict__ out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_part_idx * head_elem_num_per_partition; for (int block_idx = 0; block_idx < block_num; ++block_idx) { const int64_t physical_block_idx = seq_block_table[block_idx]; - const float *__restrict__ prob_vec_ptr = + const float* __restrict__ prob_vec_ptr = thread_block_logits + block_idx * BLOCK_SIZE; - const scalar_t *__restrict__ v_block_cache_ptr = + const scalar_t* __restrict__ v_block_cache_ptr = v_cache + physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; @@ -311,7 +314,7 @@ struct paged_attention_v1_impl { if (block_idx != block_num - 1) { const int64_t next_physical_block_idx = seq_block_table[block_idx + 1]; - const scalar_t *__restrict__ next_v_block_cache_ptr = + const scalar_t* __restrict__ next_v_block_cache_ptr = v_cache + next_physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; @@ -340,16 +343,16 @@ struct paged_attention_v1_impl { #define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ paged_attention_v1_impl::call( \ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \ num_heads); template void paged_attention_v1_impl_launcher( - torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache, - torch::Tensor &value_cache, int 
num_kv_heads, float scale, - torch::Tensor &block_tables, torch::Tensor &seq_lens, - int max_seq_len, const c10::optional &alibi_slopes) { + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -359,67 +362,66 @@ void paged_attention_v1_impl_launcher( int kv_head_stride = key_cache.stride(1); // NOTE: alibi_slopes is optional. - const float *alibi_slopes_ptr = + const float* alibi_slopes_ptr = alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) + ? reinterpret_cast(alibi_slopes.value().data_ptr()) : nullptr; - T *out_ptr = reinterpret_cast(out.data_ptr()); - T *query_ptr = reinterpret_cast(query.data_ptr()); - T *key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T *value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int *block_tables_ptr = block_tables.data_ptr(); - int *seq_lens_ptr = seq_lens.data_ptr(); + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { - case 64: - LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 256: - LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; + case 64: + LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); + break; + case 80: + LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); + break; + case 96: + LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); + break; + case 112: + LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); + break; + case 128: + LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); + break; + case 256: + LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; } } -#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v1_impl_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ +#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ + paged_attention_v1_impl_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ seq_lens, max_seq_len, alibi_slopes); -#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V1_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ + switch (block_size) { \ + case 16: \ + CALL_V1_KERNEL_LAUNCHER(T, 16); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } -} // namespace +} // namespace -void paged_attention_v1(torch::Tensor &out, torch::Tensor &query, - torch::Tensor &key_cache, torch::Tensor &value_cache, +void paged_attention_v1(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& key_cache, 
torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor &block_tables, - torch::Tensor &seq_lens, int block_size, - int max_seq_len, - const c10::optional &alibi_slopes, - const std::string &kv_cache_dtype, float kv_scale) { + torch::Tensor& block_tables, torch::Tensor& seq_lens, + int block_size, int max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale) { TORCH_CHECK(kv_scale == 1.0f); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", [&] { @@ -434,23 +436,24 @@ namespace { template struct paged_attention_v2_impl { static void call( - scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size] - float *__restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float - *__restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t *__restrict__ tmp_out, // [num_seqs, num_heads, - // max_num_partitions, head_size] - const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads, - // head_size/x, block_size, x] - const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads, - // head_size, block_size] + scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] + float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] const int num_kv_heads, const float scale, - const int - *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int *__restrict__ seq_lens, // [num_seqs] + const int* __restrict__ block_tables, // [num_seqs, + // max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] const int max_num_blocks_per_seq, - const float *__restrict__ alibi_slopes, // [num_heads] + const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, const int num_seqs, const int num_heads, const int max_num_partitions) { constexpr int x = 16 / sizeof(scalar_t); @@ -468,8 +471,7 @@ struct paged_attention_v2_impl { const int seq_len = seq_lens[seq_idx]; const int start_token_idx = partition_idx * PARTITION_SIZE; - if (start_token_idx >= seq_len) - continue; + if (start_token_idx >= seq_len) continue; const int partition_num = (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; @@ -477,15 +479,14 @@ struct paged_attention_v2_impl { const int token_num = (std::min(seq_len, start_token_idx + PARTITION_SIZE) - start_token_idx); - const int block_num = - (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; const int last_block_token_num = token_num - (block_num - 1) * BLOCK_SIZE; - const int *seq_block_table = block_tables + + const int* seq_block_table = block_tables + max_num_blocks_per_seq * seq_idx + start_token_idx / BLOCK_SIZE; const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t *__restrict__ q_vec_ptr = + const scalar_t* __restrict__ q_vec_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; float logits[PARTITION_SIZE] __attribute__((aligned(64))) = 
{0}; @@ -493,10 +494,10 @@ struct paged_attention_v2_impl { // Compute logits for (int block_idx = 0; block_idx < block_num; ++block_idx) { const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t *__restrict__ k_block_cache_ptr = + const scalar_t* __restrict__ k_block_cache_ptr = k_cache + physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride; - float *__restrict__ head_block_logits = + float* __restrict__ head_block_logits = logits + block_idx * BLOCK_SIZE; reduceQKBlockKernel::call( @@ -510,13 +511,13 @@ struct paged_attention_v2_impl { logits, token_num, block_num * BLOCK_SIZE, alibi_slopes[head_idx], start_token_idx, seq_len); } else { - max_and_sum = reduceSoftmax(logits, token_num, - block_num * BLOCK_SIZE); + max_and_sum = + reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE); } - auto &&[max_logit, exp_sum] = max_and_sum; + auto&& [max_logit, exp_sum] = max_and_sum; - scalar_t *__restrict__ output_buffer = nullptr; + scalar_t* __restrict__ output_buffer = nullptr; if (!no_reduce) { auto idx = seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions + partition_idx; @@ -538,13 +539,13 @@ struct paged_attention_v2_impl { for (int head_part_idx = 0; head_part_idx < head_partition_num; ++head_part_idx) { vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t *__restrict__ out_ptr = + scalar_t* __restrict__ out_ptr = output_buffer + head_part_idx * head_elem_num_per_partition; for (int block_idx = 0; block_idx < block_num; ++block_idx) { const int64_t physical_block_idx = seq_block_table[block_idx]; - const float *__restrict__ prob_vec_ptr = + const float* __restrict__ prob_vec_ptr = logits + block_idx * BLOCK_SIZE; - const scalar_t *__restrict__ v_block_cache_ptr = + const scalar_t* __restrict__ v_block_cache_ptr = v_cache + physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; @@ -555,7 +556,7 @@ struct paged_attention_v2_impl { if (block_idx != block_num - 1) { const int64_t next_physical_block_idx = seq_block_table[block_idx + 1]; - const scalar_t *__restrict__ next_v_block_cache_ptr = + const scalar_t* __restrict__ next_v_block_cache_ptr = v_cache + next_physical_block_idx * kv_block_stride + kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; @@ -587,8 +588,7 @@ struct paged_attention_v2_impl { const int partition_num = (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - if (partition_num == 1) - continue; + if (partition_num == 1) continue; reducePartitonSoftmax( max_logits + seq_idx * num_heads * max_num_partitions + @@ -603,11 +603,11 @@ struct paged_attention_v2_impl { using v_load_vec_type = typename KernelVecType::v_load_vec_type; static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE); constexpr int head_elem_num_per_group = - 16; // Note: didn't align with the cacheline size, due to some HEAD_SIZE - // didn't align with 64 bytes + 16; // Note: didn't align with the cacheline size, due to some + // HEAD_SIZE didn't align with 64 bytes static_assert(HEAD_SIZE % head_elem_num_per_group == 0); constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group; - const float *__restrict__ rescale_factors = exp_sums; + const float* __restrict__ rescale_factors = exp_sums; #pragma omp parallel for collapse(3) schedule(static, 1) for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { for (int head_idx = 0; head_idx < num_heads; ++head_idx) { @@ -616,17 +616,16 @@ struct paged_attention_v2_impl { 
const int partition_num = (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - if (partition_num == 1) - continue; + if (partition_num == 1) continue; - const float *__restrict__ seq_head_rescale_factors = + const float* __restrict__ seq_head_rescale_factors = rescale_factors + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions; - const scalar_t *__restrict__ seq_head_tmp_out = + const scalar_t* __restrict__ seq_head_tmp_out = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + head_idx * max_num_partitions * HEAD_SIZE + group_idx * head_elem_num_per_group; - scalar_t *__restrict__ seq_head_output = + scalar_t* __restrict__ seq_head_output = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + group_idx * head_elem_num_per_group; @@ -645,21 +644,21 @@ struct paged_attention_v2_impl { } }; -#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ - paged_attention_v2_impl::call( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ - key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, num_seqs, num_heads, \ +#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ + paged_attention_v2_impl::call( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ + key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, num_seqs, num_heads, \ max_num_partitions); template void paged_attention_v2_impl_launcher( - torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits, - torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache, - torch::Tensor &value_cache, int num_kv_heads, float scale, - torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size, - int max_seq_len, const c10::optional &alibi_slopes) { + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const c10::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -670,72 +669,72 @@ void paged_attention_v2_impl_launcher( int max_num_partitions = exp_sums.size(-1); // NOTE: alibi_slopes is optional. - const float *alibi_slopes_ptr = + const float* alibi_slopes_ptr = alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) + ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) : nullptr; - T *out_ptr = reinterpret_cast(out.data_ptr()); - float *exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float *max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T *tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T *query_ptr = reinterpret_cast(query.data_ptr()); - T *key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T *value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int *block_tables_ptr = block_tables.data_ptr(); - int *seq_lens_ptr = seq_lens.data_ptr(); + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { - case 64: - LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 256: - LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; + case 64: + LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); + break; + case 80: + LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); + break; + case 96: + LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); + break; + case 112: + LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); + break; + case 128: + LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); + break; + case 256: + LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; } } -#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v2_impl_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, block_size, \ - max_seq_len, alibi_slopes); - -#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V2_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ + paged_attention_v2_impl_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \ + alibi_slopes); + +#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ + switch (block_size) { \ + case 16: \ + CALL_V2_KERNEL_LAUNCHER(T, 16); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } -} // namespace - -void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums, - torch::Tensor &max_logits, torch::Tensor &tmp_out, - torch::Tensor &query, torch::Tensor &key_cache, - torch::Tensor &value_cache, int num_kv_heads, - float scale, torch::Tensor &block_tables, - torch::Tensor &seq_lens, int block_size, +} // namespace + +void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums, + torch::Tensor& max_logits, torch::Tensor& tmp_out, + torch::Tensor& query, 
torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, + float scale, torch::Tensor& block_tables, + torch::Tensor& seq_lens, int block_size, int max_seq_len, - const c10::optional &alibi_slopes, - const std::string &kv_cache_dtype, float kv_scale) { + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale) { TORCH_CHECK(kv_scale == 1.0f); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", [&] { diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 26e81685d623e..2890ba6e2bb32 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -5,25 +5,26 @@ namespace { template -void copy_blocks_cpu_impl( - std::vector &key_caches, - std::vector &value_caches, - const torch::Tensor& mapping_pairs, - const int element_num_per_block, const int layer_num) { +void copy_blocks_cpu_impl(std::vector& key_caches, + std::vector& value_caches, + const torch::Tensor& mapping_pairs, + const int element_num_per_block, + const int layer_num) { const size_t pair_num = mapping_pairs.size(0); const size_t block_bytes = sizeof(scalar_t) * element_num_per_block; #pragma omp parallel for collapse(2) for (int layer = 0; layer < layer_num; ++layer) { for (size_t pair = 0; pair < pair_num; ++pair) { - int64_t source_offset = element_num_per_block * mapping_pairs[pair][0].item(); + int64_t source_offset = + element_num_per_block * mapping_pairs[pair][0].item(); int64_t target_offset = element_num_per_block * mapping_pairs[pair][1].item(); - scalar_t *key_cache_ptr = key_caches[layer].data_ptr(); - scalar_t *source_ptr = key_cache_ptr + source_offset; - scalar_t *target_ptr = key_cache_ptr + target_offset; + scalar_t* key_cache_ptr = key_caches[layer].data_ptr(); + scalar_t* source_ptr = key_cache_ptr + source_offset; + scalar_t* target_ptr = key_cache_ptr + target_offset; std::memcpy(target_ptr, source_ptr, block_bytes); - scalar_t *value_cache_ptr = value_caches[layer].data_ptr(); + scalar_t* value_cache_ptr = value_caches[layer].data_ptr(); source_ptr = value_cache_ptr + source_offset; target_ptr = value_cache_ptr + target_offset; std::memcpy(target_ptr, source_ptr, block_bytes); @@ -33,9 +34,9 @@ void copy_blocks_cpu_impl( template void reshape_and_cache_cpu_impl( - const scalar_t *__restrict__ key, const scalar_t *__restrict__ value, - scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache, - const int64_t *__restrict__ slot_mapping, const int num_tokens, + const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, + scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache, + const int64_t* __restrict__ slot_mapping, const int num_tokens, const int key_stride, const int value_stride, const int num_heads, const int head_size, const int block_size, const int x) { const int block_elem_num = num_heads * head_size * block_size; @@ -48,14 +49,14 @@ void reshape_and_cache_cpu_impl( int src_key_head_idx = token_idx * key_stride + head_idx * head_size; int src_value_head_idx = token_idx * value_stride + head_idx * head_size; - const scalar_t *src_key_head_ptr = key + src_key_head_idx; - const scalar_t *src_value_head_ptr = value + src_value_head_idx; + const scalar_t* src_key_head_ptr = key + src_key_head_idx; + const scalar_t* src_value_head_ptr = value + src_value_head_idx; const int64_t block_index = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - scalar_t *target_key_head_ptr = key_cache + + scalar_t* target_key_head_ptr = key_cache + block_elem_num * block_index + head_idx * 
block_size * head_size; - scalar_t *target_value_head_ptr = value_cache + + scalar_t* target_value_head_ptr = value_cache + block_elem_num * block_index + head_idx * block_size * head_size; @@ -79,10 +80,10 @@ void reshape_and_cache_cpu_impl( } } } -}; // namespace +}; // namespace -void copy_blocks(std::vector &key_caches, - std::vector &value_caches, +void copy_blocks(std::vector& key_caches, + std::vector& value_caches, const torch::Tensor& block_mapping) { unsigned num_layers = key_caches.size(); TORCH_CHECK(num_layers == value_caches.size()); @@ -100,10 +101,10 @@ void copy_blocks(std::vector &key_caches, }); } -void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, - torch::Tensor &key_cache, torch::Tensor &value_cache, - torch::Tensor &slot_mapping, - const std::string &kv_cache_dtype, float kv_scale) { +void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, + torch::Tensor& key_cache, torch::Tensor& value_cache, + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype, float kv_scale) { TORCH_CHECK(kv_scale == 1.0f); int num_tokens = key.size(0); @@ -127,7 +128,7 @@ void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, }); } -void swap_blocks(torch::Tensor &src, torch::Tensor &dst, - const torch::Tensor&block_mapping) { +void swap_blocks(torch::Tensor& src, torch::Tensor& dst, + const torch::Tensor& block_mapping) { TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") } diff --git a/csrc/cpu/layernorm.cpp b/csrc/cpu/layernorm.cpp index 467f0dc84982c..65d3ddcec5709 100644 --- a/csrc/cpu/layernorm.cpp +++ b/csrc/cpu/layernorm.cpp @@ -2,10 +2,10 @@ namespace { template -void rms_norm_impl(scalar_t *__restrict__ out, - const scalar_t *__restrict__ input, - const scalar_t *__restrict__ weight, const float epsilon, - const int num_tokens, const int hidden_size) { +void rms_norm_impl(scalar_t* __restrict__ out, + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ weight, const float epsilon, + const int num_tokens, const int hidden_size) { using scalar_vec_t = vec_op::vec_t; constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); @@ -41,11 +41,11 @@ void rms_norm_impl(scalar_t *__restrict__ out, } template -void fused_add_rms_norm_impl(scalar_t *__restrict__ input, - scalar_t *__restrict__ residual, - const scalar_t *__restrict__ weight, - const float epsilon, const int num_tokens, - const int hidden_size) { +void fused_add_rms_norm_impl(scalar_t* __restrict__ input, + scalar_t* __restrict__ residual, + const scalar_t* __restrict__ weight, + const float epsilon, const int num_tokens, + const int hidden_size) { using scalar_vec_t = vec_op::vec_t; constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); @@ -85,24 +85,24 @@ void fused_add_rms_norm_impl(scalar_t *__restrict__ input, } } } -} // namespace +} // namespace -void rms_norm(torch::Tensor &out, torch::Tensor &input, - torch::Tensor &weight, float epsilon) { +void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + float epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] { CPU_KERNEL_GUARD_IN(rms_norm_impl) rms_norm_impl(out.data_ptr(), input.data_ptr(), - weight.data_ptr(), epsilon, num_tokens, - hidden_size); + weight.data_ptr(), epsilon, num_tokens, + hidden_size); CPU_KERNEL_GUARD_OUT(rms_norm_impl) }); } -void 
fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual, - torch::Tensor &weight, float epsilon) { +void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, + torch::Tensor& weight, float epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index 5dc1bde45ac5f..73bf77e46f538 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -4,16 +4,16 @@ namespace { template void rotary_embedding_impl( - const int64_t - *__restrict__ positions, // [batch_size, seq_len] or [num_tokens] - scalar_t - *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or - /// [num_tokens, num_heads, head_size] - scalar_t - *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or - // [num_tokens, num_kv_heads, head_size] - const scalar_t - *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, + /// head_size] or [num_tokens, num_heads, + /// head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] const int rot_dim, const int64_t query_stride, const int64_t key_stride, const int num_heads, const int num_kv_heads, const int head_size, const int num_tokens) { @@ -26,7 +26,7 @@ void rotary_embedding_impl( #pragma omp parallel for for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { int64_t pos = positions[token_idx]; - const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; for (int i = 0; i < num_heads; ++i) { const int head_idx = i; @@ -94,16 +94,16 @@ void rotary_embedding_impl( template void rotary_embedding_gptj_impl( - const int64_t - *__restrict__ positions, // [batch_size, seq_len] or [num_tokens] - scalar_t - *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or - /// [num_tokens, num_heads, head_size] - scalar_t - *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or - // [num_tokens, num_kv_heads, head_size] - const scalar_t - *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, + /// head_size] or [num_tokens, num_heads, + /// head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] const int rot_dim, const int64_t query_stride, const int64_t key_stride, const int num_heads, const int num_kv_heads, const int head_size, const int num_tokens) { @@ -113,13 +113,13 @@ void rotary_embedding_gptj_impl( for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { for (int i = 0; i < num_heads; ++i) { int64_t pos = positions[token_idx]; - const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; - const scalar_t *cos_cache_ptr = cache_ptr; - const scalar_t *sin_cache_ptr = cache_ptr + embed_dim; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cos_cache_ptr = cache_ptr; + const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; const int 
head_idx = i; const int64_t token_head = token_idx * query_stride + head_idx * head_size; - scalar_t *head_query = token_head + query; + scalar_t* head_query = token_head + query; for (int j = 0; j < embed_dim; j += 1) { const int rot_offset = j; const int x_index = 2 * rot_offset; @@ -141,12 +141,12 @@ void rotary_embedding_gptj_impl( for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { for (int i = 0; i < num_kv_heads; ++i) { int64_t pos = positions[token_idx]; - const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; - const scalar_t *cos_cache_ptr = cache_ptr; - const scalar_t *sin_cache_ptr = cache_ptr + embed_dim; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cos_cache_ptr = cache_ptr; + const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; const int head_idx = i; const int64_t token_head = token_idx * key_stride + head_idx * head_size; - scalar_t *head_key = key + token_head; + scalar_t* head_key = key + token_head; for (int j = 0; j < embed_dim; j += 1) { const int rot_offset = j; const int x_index = 2 * rot_offset; @@ -164,11 +164,11 @@ void rotary_embedding_gptj_impl( } } } -}; // namespace +}; // namespace -void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, - torch::Tensor &key, int head_size, - torch::Tensor &cos_sin_cache, bool is_neox) { +void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + torch::Tensor& key, int head_size, + torch::Tensor& cos_sin_cache, bool is_neox) { int num_tokens = query.numel() / query.size(-1); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; diff --git a/csrc/cpu/pybind.cpp b/csrc/cpu/pybind.cpp index bba044087f37c..63082393c8102 100644 --- a/csrc/cpu/pybind.cpp +++ b/csrc/cpu/pybind.cpp @@ -8,66 +8,37 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); // Attention ops - ops.def( - "paged_attention_v1", - &paged_attention_v1, - "Compute the attention between an input query and the cached keys/values using PagedAttention."); - ops.def( - "paged_attention_v2", - &paged_attention_v2, - "PagedAttention V2."); + ops.def("paged_attention_v1", &paged_attention_v1, + "Compute the attention between an input query and the cached " + "keys/values using PagedAttention."); + ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2."); // Activation ops - ops.def( - "silu_and_mul", - &silu_and_mul, - "Activation function used in SwiGLU."); - ops.def( - "gelu_and_mul", - &gelu_and_mul, - "Activation function used in GeGLU with `none` approximation."); - ops.def( - "gelu_tanh_and_mul", - &gelu_tanh_and_mul, - "Activation function used in GeGLU with `tanh` approximation."); - ops.def( - "gelu_new", - &gelu_new, - "GELU implementation used in GPT-2."); - ops.def( - "gelu_fast", - &gelu_fast, - "Approximate GELU implementation."); + ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); + ops.def("gelu_and_mul", &gelu_and_mul, + "Activation function used in GeGLU with `none` approximation."); + ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); + ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2."); + ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation."); // Layernorm - ops.def( - "rms_norm", - &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); + ops.def("rms_norm", &rms_norm, + "Apply Root Mean Square (RMS) Normalization to 
the input tensor."); - ops.def( - "fused_add_rms_norm", - &fused_add_rms_norm, - "In-place fused Add and RMS Normalization"); + ops.def("fused_add_rms_norm", &fused_add_rms_norm, + "In-place fused Add and RMS Normalization"); // Rotary embedding - ops.def( - "rotary_embedding", - &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def("rotary_embedding", &rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def( - "swap_blocks", - &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def( - "copy_blocks", - ©_blocks, - "Copy the cache blocks from src to dst"); - cache_ops.def( - "reshape_and_cache", - &reshape_and_cache, - "Reshape the key and value tensors and cache them"); + cache_ops.def("swap_blocks", &swap_blocks, + "Swap in (out) the cache blocks from src to dst"); + cache_ops.def("copy_blocks", ©_blocks, + "Copy the cache blocks from src to dst"); + cache_ops.def("reshape_and_cache", &reshape_and_cache, + "Reshape the key and value tensors and cache them"); } diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index 1ebb2e74a82fc..5909e5eaf5e60 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,7 +1,7 @@ #pragma once #ifdef USE_ROCM -#include + #include #endif #ifndef USE_ROCM @@ -17,7 +17,8 @@ #endif #ifndef USE_ROCM - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask) #else #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) #endif @@ -29,7 +30,8 @@ #endif #ifndef USE_ROCM - #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down_sync(uint32_t(-1), var, lane_delta) + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ + __shfl_down_sync(uint32_t(-1), var, lane_delta) #else #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) #endif @@ -41,4 +43,3 @@ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) #endif - diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index 1483484faeb4a..2ba49b339e148 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -2,9 +2,6 @@ #include -int get_device_attribute( - int attribute, - int device_id); +int get_device_attribute(int attribute, int device_id); -int get_max_shared_memory_per_block_device_attribute( - int device_id); +int get_max_shared_memory_per_block_device_attribute(int device_id); diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index 1a443ef3620cc..7d8e2e19720fa 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -2,34 +2,28 @@ #include #include #endif -int get_device_attribute( - int attribute, - int device_id) -{ - int device, value; - if (device_id < 0) { - cudaGetDevice(&device); - } - else { - device = device_id; - } - cudaDeviceGetAttribute(&value, static_cast(attribute), device); - return value; +int get_device_attribute(int attribute, int device_id) { + int device, value; + if (device_id < 0) { + cudaGetDevice(&device); + } else { + device = device_id; + } + cudaDeviceGetAttribute(&value, static_cast(attribute), + device); + return value; } - -int get_max_shared_memory_per_block_device_attribute( - int device_id) -{ -int attribute; -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 
-// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 +int get_max_shared_memory_per_block_device_attribute(int device_id) { + int attribute; + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html + // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 #ifdef USE_ROCM - attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; + attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; #else - attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; + attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; #endif - return get_device_attribute(attribute, device_id); + return get_device_attribute(attribute, device_id); } diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 3906dcfc80dbf..0b1d95848525a 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -7,11 +7,11 @@ // fake pointer type using fptr_t = uint64_t; -static_assert(sizeof(void *) == sizeof(fptr_t)); +static_assert(sizeof(void*) == sizeof(fptr_t)); -fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, - const std::vector &handles, - const std::vector &offsets, int rank, +fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, + const std::vector& handles, + const std::vector& offsets, int rank, bool full_nvlink) { int world_size = offsets.size(); if (world_size > 8) @@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); } return (fptr_t) new vllm::CustomAllreduce( - reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), + reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); } @@ -49,13 +49,13 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, * 5. A[None].expand(2, -1, -1, -1): Not OK * 6. 
A[:, 1:, 1:]: Not OK */ -bool _is_weak_contiguous(torch::Tensor &t) { +bool _is_weak_contiguous(torch::Tensor& t) { return t.is_contiguous() || (t.storage().nbytes() - t.storage_offset() * t.element_size() == t.numel() * t.element_size()); } -bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, +bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size, bool full_nvlink) { auto inp_size = inp.numel() * inp.element_size(); // custom allreduce requires input byte size to be multiples of 16 @@ -67,28 +67,27 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, return false; } -void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, +void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, cudaStream_t stream) { - auto fa = reinterpret_cast(_fa); + auto fa = reinterpret_cast(_fa); TORCH_CHECK(_is_weak_contiguous(out)); switch (out.scalar_type()) { case at::ScalarType::Float: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), - reinterpret_cast(out.data_ptr()), + fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), out.numel()); break; } case at::ScalarType::Half: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), - reinterpret_cast(out.data_ptr()), - out.numel()); + fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), out.numel()); break; } #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) case at::ScalarType::BFloat16: { fa->allreduce( - stream, reinterpret_cast(inp.data_ptr()), - reinterpret_cast(out.data_ptr()), out.numel()); + stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), out.numel()); break; } #endif @@ -98,7 +97,7 @@ void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, } } -void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out) { +void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) { const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); auto stream = c10::cuda::getCurrentCUDAStream().stream(); TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); @@ -106,8 +105,8 @@ void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out) { _all_reduce(_fa, inp, out, stream); } -void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, - torch::Tensor &out) { +void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, + torch::Tensor& out) { const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); auto stream = c10::cuda::getCurrentCUDAStream().stream(); @@ -122,27 +121,27 @@ void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, } void dispose(fptr_t _fa) { - auto fa = reinterpret_cast(_fa); + auto fa = reinterpret_cast(_fa); delete fa; } int meta_size() { return sizeof(vllm::Signal); } -void register_buffer(fptr_t _fa, torch::Tensor &t, - const std::vector &handles, - const std::vector &offsets) { - auto fa = reinterpret_cast(_fa); +void register_buffer(fptr_t _fa, torch::Tensor& t, + const std::vector& handles, + const std::vector& offsets) { + auto fa = reinterpret_cast(_fa); fa->register_buffer(handles, offsets, t.data_ptr()); } std::pair, std::vector> get_graph_buffer_ipc_meta( fptr_t _fa) { - auto fa = reinterpret_cast(_fa); + auto fa = reinterpret_cast(_fa); return fa->get_graph_buffer_ipc_meta(); } -void register_graph_buffers(fptr_t _fa, const std::vector &handles, - const std::vector> &offsets) { - auto fa = reinterpret_cast(_fa); +void 
register_graph_buffers(fptr_t _fa, const std::vector& handles, + const std::vector>& offsets) { + auto fa = reinterpret_cast(_fa); fa->register_graph_buffers(handles, offsets); } diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 750e68d42f6c6..1ed49b8aa9cae 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -31,9 +31,9 @@ struct Signal { alignas(128) uint32_t end[kMaxBlocks][8]; }; -struct __align__(16) RankData { const void *__restrict__ ptrs[8]; }; +struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; -struct __align__(16) RankSignals { volatile Signal *signals[8]; }; +struct __align__(16) RankSignals { volatile Signal* signals[8]; }; // like std::array, but aligned template @@ -68,11 +68,11 @@ DINLINE half downcast_s(float val) { // scalar add functions // for some reason when compiling with Pytorch, the + operator for half and // bfloat is disabled so we call the intrinsics directly -DINLINE half &assign_add(half &a, half b) { +DINLINE half& assign_add(half& a, half b) { a = __hadd(a, b); return a; } -DINLINE float &assign_add(float &a, float b) { return a += b; } +DINLINE float& assign_add(float& a, float b) { return a += b; } #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } @@ -80,14 +80,14 @@ template <> DINLINE nv_bfloat16 downcast_s(float val) { return __float2bfloat16(val); } -DINLINE nv_bfloat16 &assign_add(nv_bfloat16 &a, nv_bfloat16 b) { +DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) { a = __hadd(a, b); return a; } #endif template -DINLINE array_t &packed_assign_add(array_t &a, array_t b) { +DINLINE array_t& packed_assign_add(array_t& a, array_t b) { #pragma unroll for (int i = 0; i < N; i++) { assign_add(a.data[i], b.data[i]); @@ -128,7 +128,7 @@ DINLINE O downcast(array_t val) { // prior memory accesses. Note: volatile writes will not be reordered against // other volatile writes. template -DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg, +DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg, int rank) { if (threadIdx.x < ngpus) { // reset flag for next time @@ -137,8 +137,7 @@ DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg, // Latency = 1 p2p write sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1; // wait until we got true from all ranks - while (!self_sg->start[blockIdx.x][threadIdx.x]) - ; + while (!self_sg->start[blockIdx.x][threadIdx.x]); } __syncthreads(); } @@ -147,13 +146,13 @@ DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg, // barrier in the all reduce kernel. If it's the final synchronization barrier, // we don't need to make any visibility guarantees for prior memory accesses. template -DINLINE void end_sync(const RankSignals &sg, volatile Signal *self_sg, +DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg, int rank) { __syncthreads(); // eliminate the case that prior writes are not visible after signals become // visible. Note that I did not managed to make this happen through a lot of // testing. Might be the case that hardware provides stronger guarantee than - // the memory model. + // the memory model. 
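  // Note: the two `if constexpr (!final_sync)` guards in this function pair
  // up. An intermediate barrier still has peers reading this rank's buffer
  // writes afterwards, so it needs the system-wide fence before the end flag
  // is raised and a trailing __syncthreads() so the whole block waits on the
  // ngpus signalling threads; the kernel's final barrier needs neither.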
if constexpr (!final_sync) __threadfence_system(); if (threadIdx.x < ngpus) { // reset flag for next time @@ -162,14 +161,13 @@ DINLINE void end_sync(const RankSignals &sg, volatile Signal *self_sg, // Latency = 1 p2p write sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1; // wait until we got true from all ranks - while (!self_sg->end[blockIdx.x][threadIdx.x]) - ; + while (!self_sg->end[blockIdx.x][threadIdx.x]); } if constexpr (!final_sync) __syncthreads(); } template -DINLINE P packed_reduce(const P *ptrs[], int idx) { +DINLINE P packed_reduce(const P* ptrs[], int idx) { A tmp = upcast(ptrs[0][idx]); #pragma unroll for (int i = 1; i < ngpus; i++) { @@ -180,8 +178,8 @@ DINLINE P packed_reduce(const P *ptrs[], int idx) { template __global__ void __launch_bounds__(512, 1) - cross_device_reduce_1stage(RankData *_dp, RankSignals sg, - volatile Signal *self_sg, T *__restrict__ result, + cross_device_reduce_1stage(RankData* _dp, RankSignals sg, + volatile Signal* self_sg, T* __restrict__ result, int rank, int size) { using P = typename packed_t::P; using A = typename packed_t::A; @@ -192,21 +190,20 @@ __global__ void __launch_bounds__(512, 1) // do the actual reduction for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - ((P *)result)[idx] = - packed_reduce((const P **)&dp.ptrs[0], idx); + ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); } end_sync(sg, self_sg, rank); } template -DINLINE P *get_tmp_buf(volatile Signal *sg) { - return (P *)(((Signal *)sg) + 1); +DINLINE P* get_tmp_buf(volatile Signal* sg) { + return (P*)(((Signal*)sg) + 1); } template __global__ void __launch_bounds__(512, 1) - cross_device_reduce_2stage(RankData *_dp, RankSignals sg, - volatile Signal *self_sg, T *__restrict__ result, + cross_device_reduce_2stage(RankData* _dp, RankSignals sg, + volatile Signal* self_sg, T* __restrict__ result, int rank, int size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = gridDim.x * blockDim.x; @@ -216,12 +213,12 @@ __global__ void __launch_bounds__(512, 1) int start = rank * part; int end = rank == ngpus - 1 ? size : start + part; int largest_part = part + size % ngpus; - const P *ptrs[ngpus]; - P *tmps[ngpus]; + const P* ptrs[ngpus]; + P* tmps[ngpus]; #pragma unroll for (int i = 0; i < ngpus; i++) { int target = (rank + i) % ngpus; - ptrs[i] = (const P *)_dp->ptrs[target]; + ptrs[i] = (const P*)_dp->ptrs[target]; tmps[i] = get_tmp_buf
(sg.signals[target]); } auto tmp_out = tmps[0]; @@ -243,7 +240,7 @@ __global__ void __launch_bounds__(512, 1) int gather_from_rank = ((rank + i) % ngpus); if (gather_from_rank == ngpus - 1 || idx < part) { int dst_idx = gather_from_rank * part + idx; - ((P *)result)[dst_idx] = tmps[i][idx]; + ((P*)result)[dst_idx] = tmps[i][idx]; } } } @@ -261,14 +258,14 @@ class CustomAllreduce { // below are device pointers RankSignals sg_; - std::unordered_map buffers_; - Signal *self_sg_; + std::unordered_map buffers_; + Signal* self_sg_; // stores the registered device pointers from all ranks RankData *d_rank_data_base_, *d_rank_data_end_; - std::vector graph_unreg_buffers_; + std::vector graph_unreg_buffers_; // a map from IPC handles to opened IPC pointers - std::map ipc_handles_; + std::map ipc_handles_; /** * meta is a pointer to device metadata and temporary buffer for allreduce. @@ -279,22 +276,22 @@ class CustomAllreduce { * note: this class does not own any device memory. Any required buffers * are passed in from the constructor */ - CustomAllreduce(Signal *meta, void *rank_data, size_t rank_data_sz, - const cudaIpcMemHandle_t *handles, - const std::vector &offsets, int rank, + CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz, + const cudaIpcMemHandle_t* handles, + const std::vector& offsets, int rank, bool full_nvlink = true) : rank_(rank), world_size_(offsets.size()), full_nvlink_(full_nvlink), self_sg_(meta), - d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { for (int i = 0; i < world_size_; i++) { - Signal *rank_sg; + Signal* rank_sg; if (i != rank_) { - char *handle = open_ipc_handle(&handles[i]); + char* handle = open_ipc_handle(&handles[i]); handle += offsets[i]; - rank_sg = (Signal *)handle; + rank_sg = (Signal*)handle; } else { rank_sg = self_sg_; } @@ -302,13 +299,13 @@ class CustomAllreduce { } } - char *open_ipc_handle(const void *ipc_handle) { + char* open_ipc_handle(const void* ipc_handle) { auto [it, new_handle] = - ipc_handles_.insert({*((IPC_KEY *)ipc_handle), nullptr}); + ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr}); if (new_handle) { - char *ipc_ptr; - CUDACHECK(cudaIpcOpenMemHandle((void **)&ipc_ptr, - *((const cudaIpcMemHandle_t *)ipc_handle), + char* ipc_ptr; + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr, + *((const cudaIpcMemHandle_t*)ipc_handle), cudaIpcMemLazyEnablePeerAccess)); it->second = ipc_ptr; } @@ -323,7 +320,7 @@ class CustomAllreduce { std::vector offsets(num_buffers); for (int i = 0; i < num_buffers; i++) { auto ptr = graph_unreg_buffers_[i]; - void *base_ptr; + void* base_ptr; // note: must share the base address of each allocation, or we get wrong // address if (cuPointerGetAttribute(&base_ptr, @@ -331,8 +328,8 @@ class CustomAllreduce { (CUdeviceptr)ptr) != CUDA_SUCCESS) throw std::runtime_error("failed to get pointer attr"); CUDACHECK(cudaIpcGetMemHandle( - (cudaIpcMemHandle_t *)&handles[i * handle_sz], base_ptr)); - offsets[i] = ((char *)ptr) - ((char *)base_ptr); + (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr)); + offsets[i] = ((char*)ptr) - ((char*)base_ptr); } return std::make_pair(handles, offsets); } @@ -344,13 +341,13 @@ class CustomAllreduce { std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); } - void register_buffer(const std::vector &handles, - const std::vector &offsets, void *self) { + void register_buffer(const std::vector& handles, + const std::vector& offsets, void* 
self) { check_rank_data_capacity(); RankData data; for (int i = 0; i < world_size_; i++) { if (i != rank_) { - char *handle = open_ipc_handle(handles[i].data()); + char* handle = open_ipc_handle(handles[i].data()); handle += offsets[i]; data.ptrs[i] = handle; } else { @@ -371,17 +368,17 @@ class CustomAllreduce { // got a different address. IPC handles have internal reference counting // mechanism so overhead should be small. void register_graph_buffers( - const std::vector &handles, - const std::vector> &offsets) { + const std::vector& handles, + const std::vector>& offsets) { auto num_buffers = graph_unreg_buffers_.size(); check_rank_data_capacity(num_buffers); std::vector rank_data(num_buffers); for (int i = 0; i < num_buffers; i++) { auto self_ptr = graph_unreg_buffers_[i]; - auto &rd = rank_data[i]; + auto& rd = rank_data[i]; for (int j = 0; j < world_size_; j++) { if (j != rank_) { - char *handle = + char* handle = open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); handle += offsets[j][i]; rd.ptrs[j] = handle; @@ -405,7 +402,7 @@ class CustomAllreduce { * will cause contention on NVLink bus. */ template - void allreduce(cudaStream_t stream, T *input, T *output, int size, + void allreduce(cudaStream_t stream, T* input, T* output, int size, int threads = 512, int block_limit = 36) { auto d = packed_t::P::size; if (size % d != 0) @@ -418,7 +415,7 @@ class CustomAllreduce { std::to_string(kMaxBlocks) + ". Got " + std::to_string(block_limit)); - RankData *ptrs; + RankData* ptrs; cudaStreamCaptureStatus status; CUDACHECK(cudaStreamIsCapturing(stream, &status)); if (status == cudaStreamCaptureStatusActive) { diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index c34a50389c21c..f7868233076cd 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -48,7 +48,7 @@ __global__ void dummy_kernel() { } template -__global__ void set_data(T *data, int size, int myRank) { +__global__ void set_data(T* data, int size, int myRank) { for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { data[idx] = myRank * 0.11f; @@ -56,8 +56,8 @@ __global__ void set_data(T *data, int size, int myRank) { } template -__global__ void convert_data(const T *data1, const T *data2, double *fdata1, - double *fdata2, int size) { +__global__ void convert_data(const T* data1, const T* data2, double* fdata1, + double* fdata2, int size) { for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { fdata1[idx] = data1[idx]; @@ -65,7 +65,7 @@ __global__ void convert_data(const T *data1, const T *data2, double *fdata1, } } -__global__ void init_rand(curandState_t *state, int size, int nRanks) { +__global__ void init_rand(curandState_t* state, int size, int nRanks) { for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { for (int i = 0; i < nRanks; i++) { @@ -75,7 +75,7 @@ __global__ void init_rand(curandState_t *state, int size, int nRanks) { } template -__global__ void gen_data(curandState_t *state, T *data, double *ground_truth, +__global__ void gen_data(curandState_t* state, T* data, double* ground_truth, int myRank, int nRanks, int size) { for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { @@ -91,9 +91,9 @@ __global__ void gen_data(curandState_t *state, T *data, double *ground_truth, } template -void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, +void run(int 
myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, int data_size, bool performance_test) { - T *result; + T* result; cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); CUDACHECK(cudaMalloc(&result, data_size * sizeof(T))); @@ -101,8 +101,8 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, cudaIpcMemHandle_t self_data_handle; cudaIpcMemHandle_t data_handles[8]; - vllm::Signal *buffer; - T *self_data_copy; + vllm::Signal* buffer; + T* self_data_copy; /** * Allocate IPC buffer * @@ -125,22 +125,22 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t), MPI_BYTE, MPI_COMM_WORLD)); - void *rank_data; + void* rank_data; size_t rank_data_sz = 16 * 1024 * 1024; CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); std::vector offsets(nRanks, 0); vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, offsets, myRank); - auto *self_data = - reinterpret_cast(reinterpret_cast(buffer) + - sizeof(vllm::Signal) + data_size * sizeof(T)); + auto* self_data = + reinterpret_cast(reinterpret_cast(buffer) + + sizeof(vllm::Signal) + data_size * sizeof(T)); // hack buffer registration { std::vector handles; handles.reserve(nRanks); for (int i = 0; i < nRanks; i++) { - char *begin = (char *)&data_handles[i]; - char *end = (char *)&data_handles[i + 1]; + char* begin = (char*)&data_handles[i]; + char* end = (char*)&data_handles[i + 1]; handles.emplace_back(begin, end); } std::vector offsets(nRanks, @@ -148,9 +148,9 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, fa.register_buffer(handles, offsets, self_data); } - double *ground_truth; + double* ground_truth; CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double))); - curandState_t *states; + curandState_t* states; CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size)); init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks); gen_data<<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank, @@ -287,7 +287,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, CUDACHECK(cudaStreamDestroy(stream)); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { int nRanks, myRank; MPICHECK(MPI_Init(&argc, &argv)); MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); @@ -296,7 +296,7 @@ int main(int argc, char **argv) { ncclUniqueId id; ncclComm_t comm; if (myRank == 0) ncclGetUniqueId(&id); - MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, + MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 91abd9e85b4bb..3ecea03242f06 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -6,32 +6,30 @@ #include -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) -#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) - -#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ +#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, \ + VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) -#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index e56b4d2204005..70a2b3b0a07b1 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -11,26 +11,24 @@ #include #include - using __nv_bfloat16 = __hip_bfloat16; - using __nv_bfloat162 = __hip_bfloat162; +using __nv_bfloat16 = __hip_bfloat16; +using __nv_bfloat162 = __hip_bfloat162; #endif namespace vllm { // TODO(woosuk): Further optimize this kernel. 
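For reference, the kernel below computes, per token row, out[i] = x[i] * rsqrt(mean(x^2) + epsilon) * weight[i], with the sum of squares reduced across the block. A scalar host-side sketch of that computation (illustrative names, not the kernel itself):

// Scalar reference of RMS normalization for one or more token rows: scale
// each element by rsqrt(mean(x^2) + epsilon) and multiply by the weight.
#include <cmath>
#include <cstdio>
#include <vector>

void rms_norm_ref(float* out, const float* input, const float* weight,
                  float epsilon, int num_tokens, int hidden_size) {
  for (int t = 0; t < num_tokens; ++t) {
    const float* x = input + t * hidden_size;
    float sum_sq = 0.f;
    for (int i = 0; i < hidden_size; ++i) sum_sq += x[i] * x[i];
    float inv_rms = 1.f / std::sqrt(sum_sq / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i)
      out[t * hidden_size + i] = x[i] * inv_rms * weight[i];
  }
}

int main() {
  std::vector<float> in{1.f, 2.f, 3.f, 4.f}, w{1.f, 1.f, 1.f, 1.f}, out(4);
  rms_norm_ref(out.data(), in.data(), w.data(), 1e-6f, /*num_tokens=*/1,
               /*hidden_size=*/4);
  for (float v : out) std::printf("%f\n", v);
  return 0;
}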
-template +template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, - const int num_tokens, - const int hidden_size) { + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float) input[blockIdx.x * hidden_size + idx]; + const float x = (float)input[blockIdx.x * hidden_size + idx]; variance += x * x; } variance = blockReduceSum(variance); @@ -40,12 +38,12 @@ __global__ void rms_norm_kernel( __syncthreads(); for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) input[blockIdx.x * hidden_size + idx]; - out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; + float x = (float)input[blockIdx.x * hidden_size + idx]; + out[blockIdx.x * hidden_size + idx] = + ((scalar_t)(x * s_variance)) * weight[idx]; } } - /* Converter structs for the conversion from torch types to HIP/CUDA types, and the associated type conversions within HIP/CUDA. These helpers need to be implemented for now because the relevant type conversion @@ -54,51 +52,68 @@ __global__ void rms_norm_kernel( Each struct should have the member static constexpr bool `exists`: If false, the optimized kernel is not used for the corresponding torch type. - If true, the struct should be fully defined as shown in the examples below. + If true, the struct should be fully defined as shown in the examples below. 
*/ -template -struct _typeConvert { static constexpr bool exists = false; }; +template +struct _typeConvert { + static constexpr bool exists = false; +}; #if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) // CUDA < 12.0 runs into issues with packed type conversion -template<> +template <> struct _typeConvert { static constexpr bool exists = true; using hip_type = __half; using packed_hip_type = __half2; __device__ static inline float convert(hip_type x) { return __half2float(x); } - __device__ static inline float2 convert(packed_hip_type x) { return __half22float2(x); } - __device__ static inline hip_type convert(float x) { return __float2half_rn(x); } - __device__ static inline packed_hip_type convert(float2 x) { return __float22half2_rn(x); } + __device__ static inline float2 convert(packed_hip_type x) { + return __half22float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2half_rn(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22half2_rn(x); + } }; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // CUDA_ARCH < 800 does not have BF16 support // TODO: Add in ROCm support once public headers handle bf16 maturely -template<> +template <> struct _typeConvert { static constexpr bool exists = true; using hip_type = __nv_bfloat16; using packed_hip_type = __nv_bfloat162; - __device__ static inline float convert(hip_type x) { return __bfloat162float(x); } - __device__ static inline float2 convert(packed_hip_type x) { return __bfloat1622float2(x); } - __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); } - __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); } + __device__ static inline float convert(hip_type x) { + return __bfloat162float(x); + } + __device__ static inline float2 convert(packed_hip_type x) { + return __bfloat1622float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2bfloat16(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22bfloat162_rn(x); + } }; -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) + #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= + // 12000)) /* Vector POD struct to generate vectorized and packed FP16/BF16 ops for appropriate specializations of fused_add_rms_norm_kernel. Only functions that are necessary in that kernel are implemented. Alignment to 16 bytes is required to use 128-bit global memory ops. 
*/ -template +template struct alignas(16) _f16Vec { - /* Not theoretically necessary that width is a power of 2 but should - almost always be the case for optimization purposes */ + /* Not theoretically necessary that width is a power of 2 but should + almost always be the case for optimization purposes */ static_assert(width > 0 && (width & (width - 1)) == 0, "Width is not a positive power of 2!"); using Converter = _typeConvert; @@ -108,51 +123,49 @@ struct alignas(16) _f16Vec { __device__ _f16Vec& operator+=(const _f16Vec& other) { if constexpr (width % 2 == 0) { - #pragma unroll +#pragma unroll for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i+1]}; - temp += T2{other.data[i], other.data[i+1]}; + T2 temp{data[i], data[i + 1]}; + temp += T2{other.data[i], other.data[i + 1]}; data[i] = temp.x; - data[i+1] = temp.y; + data[i + 1] = temp.y; } } else { - #pragma unroll - for (int i = 0; i < width; ++i) - data[i] += other.data[i]; +#pragma unroll + for (int i = 0; i < width; ++i) data[i] += other.data[i]; } return *this; } __device__ _f16Vec& operator*=(const _f16Vec& other) { if constexpr (width % 2 == 0) { - #pragma unroll +#pragma unroll for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i+1]}; - temp *= T2{other.data[i], other.data[i+1]}; + T2 temp{data[i], data[i + 1]}; + temp *= T2{other.data[i], other.data[i + 1]}; data[i] = temp.x; - data[i+1] = temp.y; + data[i + 1] = temp.y; } } else { - #pragma unroll - for (int i = 0; i < width; ++i) - data[i] *= other.data[i]; +#pragma unroll + for (int i = 0; i < width; ++i) data[i] *= other.data[i]; } return *this; } __device__ _f16Vec& operator*=(const float scale) { if constexpr (width % 2 == 0) { - #pragma unroll +#pragma unroll for (int i = 0; i < width; i += 2) { - float2 temp_f = Converter::convert(T2{data[i], data[i+1]}); + float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); temp_f.x *= scale; temp_f.y *= scale; T2 temp = Converter::convert(temp_f); data[i] = temp.x; - data[i+1] = temp.y; + data[i + 1] = temp.y; } } else { - #pragma unroll +#pragma unroll for (int i = 0; i < width; ++i) { float temp = Converter::convert(data[i]) * scale; data[i] = Converter::convert(temp); @@ -164,13 +177,13 @@ struct alignas(16) _f16Vec { __device__ float sum_squares() const { float result = 0.0f; if constexpr (width % 2 == 0) { - #pragma unroll +#pragma unroll for (int i = 0; i < width; i += 2) { - float2 z = Converter::convert(T2{data[i], data[i+1]}); + float2 z = Converter::convert(T2{data[i], data[i + 1]}); result += z.x * z.x + z.y * z.y; } } else { - #pragma unroll +#pragma unroll for (int i = 0; i < width; ++i) { float x = Converter::convert(data[i]); result += x * x; @@ -184,15 +197,13 @@ struct alignas(16) _f16Vec { Additional optimizations we can make in this case are packed and vectorized operations, which help with the memory latency bottleneck. 
*/ -template -__global__ std::enable_if_t< - (width > 0) && _typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] - scalar_t* __restrict__ residual, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, - const int num_tokens, - const int hidden_size) { +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_kernel( + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { // Sanity checks on our vector struct and type-punned pointer arithmetic static_assert(std::is_pod_v<_f16Vec>); static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); @@ -203,9 +214,12 @@ __global__ std::enable_if_t< /* These and the argument pointers are all declared `restrict` as they are not aliased in practice. Argument pointers should not be dereferenced in this kernel as that would be undefined behavior */ - auto* __restrict__ input_v = reinterpret_cast<_f16Vec*>(input); - auto* __restrict__ residual_v = reinterpret_cast<_f16Vec*>(residual); - auto* __restrict__ weight_v = reinterpret_cast*>(weight); + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { int id = blockIdx.x * vec_hidden_size + idx; @@ -215,10 +229,11 @@ __global__ std::enable_if_t< residual_v[id] = temp; } /* Keep the following if-else block in sync with the - calculation of max_block_size in fused_add_rms_norm */ + calculation of max_block_size in fused_add_rms_norm */ if (num_tokens < 256) { variance = blockReduceSum(variance); - } else variance = blockReduceSum(variance); + } else + variance = blockReduceSum(variance); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); } @@ -233,52 +248,50 @@ __global__ std::enable_if_t< } } - /* Generic fused_add_rms_norm_kernel The width field is not used here but necessary for other specializations. 
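Stated in scalar form, the fused operation below updates the residual in place with input + residual and then overwrites input with the RMS-normalized sum scaled by the weight. A minimal host-side sketch of that in-place write pattern, with illustrative names:

// Scalar reference of fused add + RMS norm:
//   residual <- input + residual
//   input    <- rmsnorm(residual) * weight
// Both updates happen in place, matching the kernel's write pattern.
#include <cmath>
#include <vector>

void fused_add_rms_norm_ref(float* input, float* residual, const float* weight,
                            float epsilon, int num_tokens, int hidden_size) {
  for (int t = 0; t < num_tokens; ++t) {
    float* x = input + t * hidden_size;
    float* r = residual + t * hidden_size;
    float sum_sq = 0.f;
    for (int i = 0; i < hidden_size; ++i) {
      r[i] += x[i];  // residual accumulates the sum
      sum_sq += r[i] * r[i];
    }
    float inv_rms = 1.f / std::sqrt(sum_sq / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i)
      x[i] = r[i] * inv_rms * weight[i];  // input holds the normalized output
  }
}

int main() {
  std::vector<float> in{1.f, 2.f, 3.f, 4.f}, res{0.5f, 0.5f, 0.5f, 0.5f},
      w(4, 1.f);
  fused_add_rms_norm_ref(in.data(), res.data(), w.data(), 1e-6f, 1, 4);
  return 0;
}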
*/ -template -__global__ std::enable_if_t< - (width == 0) || !_typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] - scalar_t* __restrict__ residual, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, - const int num_tokens, - const int hidden_size) { +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_kernel( + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { scalar_t z = input[blockIdx.x * hidden_size + idx]; z += residual[blockIdx.x * hidden_size + idx]; - float x = (float) z; + float x = (float)z; variance += x * x; residual[blockIdx.x * hidden_size + idx] = z; } /* Keep the following if-else block in sync with the - calculation of max_block_size in fused_add_rms_norm */ + calculation of max_block_size in fused_add_rms_norm */ if (num_tokens < 256) { variance = blockReduceSum(variance); - } else variance = blockReduceSum(variance); + } else + variance = blockReduceSum(variance); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); } __syncthreads(); for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) residual[blockIdx.x * hidden_size + idx]; - input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; + float x = (float)residual[blockIdx.x * hidden_size + idx]; + input[blockIdx.x * hidden_size + idx] = + ((scalar_t)(x * s_variance)) * weight[idx]; } } -} // namespace vllm +} // namespace vllm -void rms_norm( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor& input, // [..., hidden_size] - torch::Tensor& weight, // [hidden_size] - float epsilon) { +void rms_norm(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + float epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; @@ -286,40 +299,27 @@ void rms_norm( dim3 block(std::min(hidden_size, 1024)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "rms_norm_kernel", - [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), - input.data_ptr(), - weight.data_ptr(), - epsilon, - num_tokens, - hidden_size); - }); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_kernel<<>>( + out.data_ptr(), input.data_ptr(), + weight.data_ptr(), epsilon, num_tokens, hidden_size); + }); } -#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), \ - "fused_add_rms_norm_kernel", \ - [&] { \ - vllm::fused_add_rms_norm_kernel \ - <<>>( \ - input.data_ptr(), \ - residual.data_ptr(), \ - weight.data_ptr(), \ - epsilon, \ - num_tokens, \ - hidden_size); \ - }); - -void fused_add_rms_norm( - torch::Tensor& input, // [..., hidden_size] - torch::Tensor& residual, // [..., hidden_size] - torch::Tensor& weight, // [hidden_size] - float epsilon) { +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + 
vllm::fused_add_rms_norm_kernel \ + <<>>(input.data_ptr(), \ + residual.data_ptr(), \ + weight.data_ptr(), epsilon, \ + num_tokens, hidden_size); \ + }); + +void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + float epsilon) { int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; @@ -342,8 +342,8 @@ void fused_add_rms_norm( auto inp_ptr = reinterpret_cast(input.data_ptr()); auto res_ptr = reinterpret_cast(residual.data_ptr()); auto wt_ptr = reinterpret_cast(weight.data_ptr()); - bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 \ - && wt_ptr % 16 == 0; + bool ptrs_are_aligned = + inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; if (ptrs_are_aligned && hidden_size % 8 == 0) { LAUNCH_FUSED_ADD_RMS_NORM(8); } else { diff --git a/csrc/moe/moe_ops.cpp b/csrc/moe/moe_ops.cpp index 35c328499a22d..4122f7630d7c7 100644 --- a/csrc/moe/moe_ops.cpp +++ b/csrc/moe/moe_ops.cpp @@ -3,5 +3,6 @@ #include PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); + m.def("topk_softmax", &topk_softmax, + "Apply topk softmax to the gating outputs."); } diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index a01be3e426d72..93e7844ac1993 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -2,8 +2,6 @@ #include -void topk_softmax( - torch::Tensor& topk_weights, - torch::Tensor& topk_indices, - torch::Tensor& token_expert_indices, - torch::Tensor& gating_output); +void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, + torch::Tensor& token_expert_indices, + torch::Tensor& gating_output); diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index e01b23685ef4e..edc441d121029 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -7,119 +7,128 @@ #include "cuda_compat.h" #include "dispatch_utils.h" -#define CEILDIV(x,y) (((x) + (y) - 1) / (y)) +#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) namespace vllm { namespace { -__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) { - // don't worry about overflow because num_experts is relatively small - return row * total_col + col; -} +__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, + int32_t col) { + // don't worry about overflow because num_experts is relatively small + return row * total_col + col; } +} // namespace template -__global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, - int32_t *sorted_token_ids, - int32_t *expert_ids, - int32_t *total_tokens_post_pad, - int32_t num_experts, - int32_t block_size, - size_t numel) { - const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; - - extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) - int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1) - - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; - } - - /** - * In the first step we compute token_cnts[thread_index + 1][expert_index], - * which counts how many tokens in the token shard of thread_index are assigned - * to expert expert_index. 
- */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)]; - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size; - } - *total_tokens_post_pad = cumsum[num_experts]; - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding blocks - * and stores the corresponding expert_id for each block. - */ - for (int i = cumsum[threadIdx.x];i < cumsum[threadIdx.x + 1];i += block_size) { - expert_ids[i / block_size] = threadIdx.x; +__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, + int32_t* sorted_token_ids, + int32_t* expert_ids, + int32_t* total_tokens_post_pad, + int32_t num_experts, + int32_t block_size, size_t numel) { + const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + + extern __shared__ int32_t shared_mem[]; + + int32_t* tokens_cnts = + shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) + int32_t* cumsum = + shared_mem + (num_experts + 1) * + num_experts; // 1d tensor with shape (num_experts + 1) + + for (int i = 0; i < num_experts; ++i) { + tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; + } + + /** + * In the first step we compute token_cnts[thread_index + 1][expert_index], + * which counts how many tokens in the token shard of thread_index are + * assigned to expert expert_index. + */ + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; + } + + __syncthreads(); + + // For each expert we accumulate the token counts from the different threads. + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[index(num_experts, i, threadIdx.x)] += + tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + } + + __syncthreads(); + + // We accumulate the token counts of all experts in thread 0. + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + cumsum[i] = cumsum[i - 1] + + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], + block_size) * + block_size; } - - /** - * Each thread processes a token shard, calculating the index of each token after - * sorting by expert number. Given the example topk_ids = [0,1,2,1,2,3,0,3,4] and - * block_size = 4, then the output would be [0, 6, *, *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], - * where * represents a padding value(preset in python). - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and tokens_cnts[threadIdx.x][expert_id] - * stores the indices of the tokens processed by the expert with expert_id within - * the current thread's token shard. 
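To make the worked example above concrete: tokens are bucketed by expert, each bucket is padded up to a multiple of block_size, and expert_ids records which expert owns each block. The host-side sketch below reproduces the documented output for topk_ids = [0,1,2,1,2,3,0,3,4] and block_size = 4; the padding value (preset from Python in the real path) is shown here as -1 purely for illustration.

// Host-side reference of the block alignment for the worked example above.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> topk_ids{0, 1, 2, 1, 2, 3, 0, 3, 4};
  int num_experts = 5, block_size = 4, pad = -1;  // -1 stands in for padding

  // Count tokens per expert and compute padded cumulative offsets.
  std::vector<int> cnt(num_experts, 0);
  for (int e : topk_ids) ++cnt[e];
  std::vector<int> cumsum(num_experts + 1, 0);
  for (int e = 0; e < num_experts; ++e)
    cumsum[e + 1] =
        cumsum[e] + (cnt[e] + block_size - 1) / block_size * block_size;

  // Record the owning expert of each block, then scatter token indices into
  // their expert's padded region in order of appearance.
  std::vector<int> sorted_token_ids(cumsum[num_experts], pad);
  std::vector<int> expert_ids(cumsum[num_experts] / block_size);
  std::vector<int> written(num_experts, 0);
  for (int e = 0; e < num_experts; ++e)
    for (int b = cumsum[e]; b < cumsum[e + 1]; b += block_size)
      expert_ids[b / block_size] = e;
  for (int i = 0; i < (int)topk_ids.size(); ++i) {
    int e = topk_ids[i];
    sorted_token_ids[cumsum[e] + written[e]++] = i;
  }

  // Prints: 0 6 -1 -1 1 3 -1 -1 2 4 -1 -1 5 7 -1 -1 8 -1 -1 -1
  for (int v : sorted_token_ids) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}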
- */ - int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; - } -} + *total_tokens_post_pad = cumsum[num_experts]; + } + + __syncthreads(); + + /** + * For each expert, each thread processes the tokens of the corresponding + * blocks and stores the corresponding expert_id for each block. + */ + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + + /** + * Each thread processes a token shard, calculating the index of each token + * after sorting by expert number. Given the example topk_ids = + * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *, + * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a + * padding value(preset in python). + */ + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int32_t expert_id = topk_ids[i]; + /** The cumsum[expert_id] stores the starting index of the tokens that the + * expert with expert_id needs to process, and + * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens + * processed by the expert with expert_id within the current thread's token + * shard. + */ + int32_t rank_post_pad = + tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + + cumsum[expert_id]; + sorted_token_ids[rank_post_pad] = i; + ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; + } } - -void moe_align_block_size( - torch::Tensor topk_ids, - int num_experts, - int block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad) { - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_INTEGRAL_TYPES( - topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors - const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); +} // namespace vllm + +void moe_align_block_size(torch::Tensor topk_ids, int num_experts, + int block_size, torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + // calc needed amount of shared mem for `tokens_cnts` and `cumsum` + // tensors + const int32_t shared_mem = + ((num_experts + 1) * num_experts + (num_experts + 1)) * + sizeof(int32_t); // set dynamic shared mem auto kernel = vllm::moe_align_block_size_kernel; - AT_CUDA_CHECK( - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize((void *)kernel, shared_mem)); + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem)); kernel<<<1, num_experts, shared_mem, stream>>>( - topk_ids.data_ptr(), - sorted_token_ids.data_ptr(), - experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), - num_experts, - block_size, + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, topk_ids.numel()); - }); + }); } diff --git a/csrc/ops.h b/csrc/ops.h index 8c2c2ae6e1f5a..f5e0e423bb65d 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -2,224 +2,136 @@ #include -void paged_attention_v1( - torch::Tensor& out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int 
num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& seq_lens, - int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, - float kv_scale); - -void paged_attention_v2( - torch::Tensor& out, - torch::Tensor& exp_sums, - torch::Tensor& max_logits, - torch::Tensor& tmp_out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& seq_lens, - int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, - float kv_scale); - -void rms_norm( - torch::Tensor& out, - torch::Tensor& input, - torch::Tensor& weight, - float epsilon); - -void fused_add_rms_norm( - torch::Tensor& input, - torch::Tensor& residual, - torch::Tensor& weight, - float epsilon); - -void rotary_embedding( - torch::Tensor& positions, - torch::Tensor& query, - torch::Tensor& key, - int head_size, - torch::Tensor& cos_sin_cache, - bool is_neox); - -void batched_rotary_embedding( - torch::Tensor& positions, - torch::Tensor& query, - torch::Tensor& key, - int head_size, - torch::Tensor& cos_sin_cache, - bool is_neox, - int rot_dim, - torch::Tensor& cos_sin_cache_offsets); - -void silu_and_mul( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_and_mul( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_tanh_and_mul( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_new( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_fast( - torch::Tensor& out, - torch::Tensor& input); +void paged_attention_v1(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& key_cache, torch::Tensor& value_cache, + int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, + int block_size, int max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale); + +void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums, + torch::Tensor& max_logits, torch::Tensor& tmp_out, + torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, + float scale, torch::Tensor& block_tables, + torch::Tensor& seq_lens, int block_size, + int max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale); + +void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + float epsilon); + +void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, + torch::Tensor& weight, float epsilon); + +void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + torch::Tensor& key, int head_size, + torch::Tensor& cos_sin_cache, bool is_neox); + +void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + torch::Tensor& key, int head_size, + torch::Tensor& cos_sin_cache, bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets); + +void silu_and_mul(torch::Tensor& out, torch::Tensor& input); + +void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); + +void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); + +void gelu_new(torch::Tensor& out, torch::Tensor& input); + +void gelu_fast(torch::Tensor& out, torch::Tensor& input); #ifndef USE_ROCM -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -); - -torch::Tensor 
aqlm_dequant( - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes -); - -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters); - -torch::Tensor awq_dequantize( - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters, - int thx, - int thy); - -torch::Tensor marlin_gemm( - torch::Tensor& a, - torch::Tensor& b_q_weight, - torch::Tensor& b_scales, - torch::Tensor& workspace, - int64_t size_m, - int64_t size_n, - int64_t size_k); - -torch::Tensor gptq_marlin_24_gemm( - torch::Tensor &a, - torch::Tensor &b_q_weight, - torch::Tensor &b_meta, - torch::Tensor &b_scales, - torch::Tensor &workspace, - int64_t num_bits, - int64_t size_m, - int64_t size_n, - int64_t size_k); - -torch::Tensor gptq_marlin_gemm( - torch::Tensor &a, - torch::Tensor &b_q_weight, - torch::Tensor &b_scales, - torch::Tensor &g_idx, - torch::Tensor &perm, - torch::Tensor &workspace, - int64_t num_bits, - int64_t size_m, - int64_t size_n, - int64_t size_k, - bool is_k_full); - -torch::Tensor gptq_marlin_repack( - torch::Tensor &b_q_weight, - torch::Tensor &perm, - int64_t size_k, - int64_t size_n, - int64_t num_bits); - -int cutlass_scaled_mm_dq( - torch::Tensor& out, - torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales); +torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, + const std::optional& bias); + +torch::Tensor aqlm_dequant(const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes); + +torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, + torch::Tensor _scaling_factors, torch::Tensor _zeros, + int split_k_iters); + +torch::Tensor awq_dequantize(torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, int split_k_iters, int thx, + int thy); + +torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t size_m, int64_t size_n, int64_t size_k); + +torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_meta, + torch::Tensor& b_scales, + torch::Tensor& workspace, int64_t num_bits, + int64_t size_m, int64_t size_n, + int64_t size_k); + +torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& g_idx, + torch::Tensor& perm, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full); + +torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, + int64_t size_k, int64_t size_n, + int64_t num_bits); + +int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales); #endif -void squeezellm_gemm( - torch::Tensor vec, - torch::Tensor mat, - torch::Tensor mul, - torch::Tensor lookup_table); - -torch::Tensor gptq_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama, - int bit); - -void gptq_shuffle( - torch::Tensor q_weight, - torch::Tensor q_perm, - int bit); - -void 
static_scaled_fp8_quant( - torch::Tensor& out, - torch::Tensor& input, - torch::Tensor& scale); - -void dynamic_scaled_fp8_quant( - torch::Tensor& out, - torch::Tensor& input, - torch::Tensor& scale); - -void moe_align_block_size( - torch::Tensor topk_ids, - int num_experts, - int block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); +void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, + torch::Tensor lookup_table); + +torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, + bool use_exllama, int bit); + +void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit); + +void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& scale); + +void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& scale); + +void moe_align_block_size(torch::Tensor topk_ids, int num_experts, + int block_size, torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); #ifndef USE_ROCM using fptr_t = uint64_t; -fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, - const std::vector &handles, - const std::vector &offsets, int rank, - bool full_nvlink); -bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, +fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, + const std::vector& handles, + const std::vector& offsets, int rank, + bool full_nvlink); +bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size, bool full_nvlink); -void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); -void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, - torch::Tensor &out); +void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out); +void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, + torch::Tensor& out); void dispose(fptr_t _fa); int meta_size(); -void register_buffer(fptr_t _fa, torch::Tensor &t, - const std::vector &handles, - const std::vector &offsets); -std::pair, std::vector> get_graph_buffer_ipc_meta(fptr_t _fa); -void register_graph_buffers(fptr_t _fa, const std::vector &handles, - const std::vector> &offsets); +void register_buffer(fptr_t _fa, torch::Tensor& t, + const std::vector& handles, + const std::vector& offsets); +std::pair, std::vector> get_graph_buffer_ipc_meta( + fptr_t _fa); +void register_graph_buffers(fptr_t _fa, const std::vector& handles, + const std::vector>& offsets); #endif diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index d80cb6973fad6..69d6dae1c26bc 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -7,14 +7,10 @@ namespace vllm { -template +template inline __device__ void apply_token_rotary_embedding( - scalar_t* __restrict__ arr, - const scalar_t* __restrict__ cos_ptr, - const scalar_t* __restrict__ sin_ptr, - int rot_offset, - int embed_dim) -{ + scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, + const scalar_t* __restrict__ sin_ptr, int rot_offset, int embed_dim) { int x_index, y_index; scalar_t cos, sin; if (IS_NEOX) { @@ -37,19 +33,17 @@ inline __device__ void apply_token_rotary_embedding( arr[y_index] = y * cos + x * sin; } -template +template inline __device__ void apply_rotary_embedding( - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, 
num_heads, head_size] - scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* cache_ptr, - const int head_size, - const int num_heads, - const int num_kv_heads, - const int rot_dim, - const int token_idx, - const int64_t query_stride, - const int64_t key_stride) -{ + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* cache_ptr, const int head_size, const int num_heads, + const int num_kv_heads, const int rot_dim, const int token_idx, + const int64_t query_stride, const int64_t key_stride) { const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -59,8 +53,8 @@ inline __device__ void apply_rotary_embedding( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * query_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_token_rotary_embedding(query + token_head, cos_ptr, - sin_ptr, rot_offset, embed_dim); + apply_token_rotary_embedding( + query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } const int nk = num_kv_heads * embed_dim; @@ -68,62 +62,74 @@ inline __device__ void apply_rotary_embedding( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * key_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_token_rotary_embedding(key + token_head, cos_ptr, - sin_ptr, rot_offset, embed_dim); + apply_token_rotary_embedding( + key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } } -template +template __global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] - scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, - const int num_heads, - const int num_kv_heads, - const int head_size) { + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size) { // Each thread block is responsible for one token. 
const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); + apply_rotary_embedding( + query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, + token_idx, query_stride, key_stride); } -template +template __global__ void batched_rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] - scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, - const int num_heads, - const int num_kv_heads, - const int head_size) { + const int64_t* __restrict__ positions, // [batch_size, seq_len] or + // [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, + // head_size] or [num_tokens, num_heads, + // head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, + // head_size] or [num_tokens, num_kv_heads, + // head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // + // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] + // or [num_tokens] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size) { // Each thread block is responsible for one token. 
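The batched variant below differs from the kernel above only in how the cache row is selected: each token adds its cos_sin_cache_offsets entry to its position before indexing, so several LoRAs with separately packed cos/sin tables can share one launch. A small host-side sketch of that indexing, with illustrative names and sizes:

// Host-side sketch of per-token cache row selection in the batched path:
// row = offsets[token] + positions[token]; each row holds rot_dim values laid
// out as [cos(0..rot_dim/2), sin(0..rot_dim/2)].
#include <cstdint>
#include <vector>

const float* cache_row_for_token(const float* cos_sin_cache,
                                 const int64_t* positions,
                                 const int64_t* cos_sin_cache_offsets,
                                 int token_idx, int rot_dim) {
  int64_t row = cos_sin_cache_offsets[token_idx] + positions[token_idx];
  return cos_sin_cache + row * rot_dim;
}

int main() {
  int rot_dim = 4;
  // Two LoRAs, each with a cache of 8 positions packed back to back.
  std::vector<float> cos_sin_cache(2 * 8 * rot_dim, 0.f);
  std::vector<int64_t> positions{3, 5};  // per-token positions
  std::vector<int64_t> offsets{0, 8};    // LoRA 0: rows 0..7, LoRA 1: rows 8..15
  const float* row0 = cache_row_for_token(cos_sin_cache.data(),
                                          positions.data(), offsets.data(),
                                          /*token_idx=*/0, rot_dim);
  const float* row1 = cache_row_for_token(cos_sin_cache.data(),
                                          positions.data(), offsets.data(),
                                          /*token_idx=*/1, rot_dim);
  (void)row0;
  (void)row1;
  return 0;
}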
const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + const scalar_t* cache_ptr = + cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; - apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); + apply_rotary_embedding( + query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, + token_idx, query_stride, key_stride); } -} // namespace vllm +} // namespace vllm void rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] - torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] - int head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox) { + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or + // [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or + // [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox) { int64_t num_tokens = query.numel() / query.size(-1); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; @@ -135,36 +141,21 @@ void rotary_embedding( dim3 block(std::min(num_heads * rot_dim / 2, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - query.scalar_type(), - "rotary_embedding", - [&] { - if (is_neox) { - vllm::rotary_embedding_kernel<<>>( - positions.data_ptr(), - query.data_ptr(), - key.data_ptr(), - cos_sin_cache.data_ptr(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } else { - vllm::rotary_embedding_kernel<<>>( - positions.data_ptr(), - query.data_ptr(), - key.data_ptr(), - cos_sin_cache.data_ptr(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } - }); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { + if (is_neox) { + vllm::rotary_embedding_kernel<<>>( + positions.data_ptr(), query.data_ptr(), + key.data_ptr(), cos_sin_cache.data_ptr(), rot_dim, + query_stride, key_stride, num_heads, num_kv_heads, head_size); + } else { + vllm::rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.data_ptr(), cos_sin_cache.data_ptr(), + rot_dim, query_stride, key_stride, num_heads, num_kv_heads, + head_size); + } + }); } /* @@ -172,14 +163,15 @@ Batched version of rotary embedding, pack multiple LoRAs together and process in batched manner. 
*/ void batched_rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] - torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] - int head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox, - int rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or + // [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or + // [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, int rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] ) { int64_t num_tokens = cos_sin_cache_offsets.size(0); int num_heads = query.size(-1) / head_size; @@ -191,36 +183,21 @@ void batched_rotary_embedding( dim3 block(std::min(num_heads * rot_dim / 2, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - query.scalar_type(), - "rotary_embedding", - [&] { - if (is_neox) { - vllm::batched_rotary_embedding_kernel<<>>( - positions.data_ptr(), - query.data_ptr(), - key.data_ptr(), - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } else { - vllm::batched_rotary_embedding_kernel<<>>( - positions.data_ptr(), - query.data_ptr(), - key.data_ptr(), - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } - }); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.data_ptr(), cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, + key_stride, num_heads, num_kv_heads, head_size); + } else { + vllm::batched_rotary_embedding_kernel + <<>>( + positions.data_ptr(), query.data_ptr(), + key.data_ptr(), cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, + key_stride, num_heads, num_kv_heads, head_size); + } + }); } diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index f5b4865506568..cba07f0ae9f2a 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -8,116 +8,87 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); // Attention ops - ops.def( - "paged_attention_v1", - &paged_attention_v1, - "Compute the attention between an input query and the cached keys/values using PagedAttention."); - ops.def( - "paged_attention_v2", - &paged_attention_v2, - "PagedAttention V2."); + ops.def("paged_attention_v1", &paged_attention_v1, + "Compute the attention between an input query and the cached " + "keys/values using PagedAttention."); + ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2."); // Activation ops - ops.def( - "silu_and_mul", - &silu_and_mul, - "Activation function used in SwiGLU."); - ops.def( - "gelu_and_mul", - &gelu_and_mul, - "Activation function used in GeGLU with `none` approximation."); - ops.def( - "gelu_tanh_and_mul", - 
&gelu_tanh_and_mul, - "Activation function used in GeGLU with `tanh` approximation."); - ops.def( - "gelu_new", - &gelu_new, - "GELU implementation used in GPT-2."); - ops.def( - "gelu_fast", - &gelu_fast, - "Approximate GELU implementation."); + ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); + ops.def("gelu_and_mul", &gelu_and_mul, + "Activation function used in GeGLU with `none` approximation."); + ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); + ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2."); + ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation."); // Layernorm - ops.def( - "rms_norm", - &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); + ops.def("rms_norm", &rms_norm, + "Apply Root Mean Square (RMS) Normalization to the input tensor."); - ops.def( - "fused_add_rms_norm", - &fused_add_rms_norm, - "In-place fused Add and RMS Normalization"); + ops.def("fused_add_rms_norm", &fused_add_rms_norm, + "In-place fused Add and RMS Normalization"); // Rotary embedding - ops.def( - "rotary_embedding", - &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def("rotary_embedding", &rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - ops.def( - "batched_rotary_embedding", - &batched_rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)"); + ops.def("batched_rotary_embedding", &batched_rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key " + "(supports multiple loras)"); // Quantization ops #ifndef USE_ROCM ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM"); ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM"); ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); - ops.def("marlin_gemm", &marlin_gemm, "Marlin (Dense) Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm, "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, "gptq_marlin Optimized Quantized GEMM for GPTQ"); - ops.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin repack from GPTQ"); + ops.def("marlin_gemm", &marlin_gemm, + "Marlin (Dense) Optimized Quantized GEMM for GPTQ"); + ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm, + "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ"); + ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, + "gptq_marlin Optimized Quantized GEMM for GPTQ"); + ops.def("gptq_marlin_repack", &gptq_marlin_repack, + "gptq_marlin repack from GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); - ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq, "CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column quantization."); + ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq, + "CUTLASS w8a8 GEMM, supporting symmetric per-tensor or " + "per-row/column quantization."); #endif - + ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor"); - ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling 
factor"); - ops.def( - "moe_align_block_size", - &moe_align_block_size, - "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); + ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, + "Compute FP8 quantized tensor for given scaling factor"); + ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, + "Compute FP8 quantized tensor and scaling factor"); + ops.def("moe_align_block_size", &moe_align_block_size, + "Aligning the number of tokens to be processed by each expert such " + "that it is divisible by the block size."); // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def( - "swap_blocks", - &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def( - "copy_blocks", - ©_blocks, - "Copy the cache blocks from src to dst"); - cache_ops.def( - "reshape_and_cache", - &reshape_and_cache, - "Reshape the key and value tensors and cache them"); - cache_ops.def( - "reshape_and_cache_flash", - &reshape_and_cache_flash, - "Reshape the key and value tensors and cache them"); - cache_ops.def( - "convert_fp8", - &convert_fp8, - "Convert the key and value cache to fp8 data type"); + cache_ops.def("swap_blocks", &swap_blocks, + "Swap in (out) the cache blocks from src to dst"); + cache_ops.def("copy_blocks", ©_blocks, + "Copy the cache blocks from src to dst"); + cache_ops.def("reshape_and_cache", &reshape_and_cache, + "Reshape the key and value tensors and cache them"); + cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash, + "Reshape the key and value tensors and cache them"); + cache_ops.def("convert_fp8", &convert_fp8, + "Convert the key and value cache to fp8 data type"); // Cuda utils - pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); - cuda_utils.def( - "get_device_attribute", - &get_device_attribute, - "Gets the specified device attribute."); + pybind11::module cuda_utils = + m.def_submodule("cuda_utils", "vLLM cuda utils"); + cuda_utils.def("get_device_attribute", &get_device_attribute, + "Gets the specified device attribute."); - cuda_utils.def( - "get_max_shared_memory_per_block_device_attribute", - &get_max_shared_memory_per_block_device_attribute, - "Gets the maximum shared memory per block device attribute."); + cuda_utils.def("get_max_shared_memory_per_block_device_attribute", + &get_max_shared_memory_per_block_device_attribute, + "Gets the maximum shared memory per block device attribute."); #ifndef USE_ROCM // Custom all-reduce kernels @@ -134,5 +105,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { custom_ar.def("register_graph_buffers", ®ister_graph_buffers, "register_graph_buffers"); #endif - } diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu index 4415316e1e8cd..255844eec56d4 100644 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ b/csrc/quantization/aqlm/gemm_kernels.cu @@ -25,32 +25,28 @@ #include #include - namespace vllm { namespace aqlm { __global__ void Code1x16MatVec( - const int4* __restrict__ A, - const int4* __restrict__ B, - int4* __restrict__ C, - const int4* __restrict__ codebook, - const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. 
+ const int4* __restrict__ A, const int4* __restrict__ B, + int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, + const int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; - if (pred) - { - // advance to the correct codebook, this easy because we only multiply one column of the codebook. + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) - { - codebook += codebook_stride; - ++codebook_size; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; } } @@ -67,8 +63,7 @@ __global__ void Code1x16MatVec( // We pad shared memory to avoid bank conflicts during reads __syncthreads(); for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) - sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; } __syncthreads(); b_gl_rd += 32 * 8; @@ -76,22 +71,19 @@ __global__ void Code1x16MatVec( int b_sh_rd = 9 * (threadIdx.x % 32); if (pred && a_gl_rd < a_gl_end) { const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); - #pragma unroll +#pragma unroll for (int i = 0; i < 8; i++) { uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't - // actually help us; this brings > 2x speedup. - asm volatile ( - "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*) &codebook[enc[i]]) - ); + // We bypass the L1 cache to avoid massive amounts of memory streaming + // that doesn't actually help us; this brings > 2x speedup. + asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*)&codebook[enc[i]])); half2* a = reinterpret_cast(&dec); half2* b = reinterpret_cast(&sh_b[b_sh_rd]); half2 res2 = {}; - #pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(a[j], b[j], res2); +#pragma unroll + for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); res += __half2float(res2.x) + __half2float(res2.y); b_sh_rd++; } @@ -100,37 +92,33 @@ __global__ void Code1x16MatVec( } if (pred) { - #pragma unroll - for (int i = 16; i > 0; i /= 2) - res += __shfl_down_sync(0xffffffff, res, i); +#pragma unroll + for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); if (threadIdx.x % 32 == 0) reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); } } __global__ void Code2x8MatVec( - const int4* __restrict__ A, - const int4* __restrict__ B, - int4* __restrict__ C, - const int4* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. + const int4* __restrict__ A, const int4* __restrict__ B, + int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, + int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. 
) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; - if (pred) - { - // advance to the correct codebook, this easy because we only multiply one column of the codebook. + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) - { - codebook += codebook_stride; - ++codebook_size; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; } } @@ -148,9 +136,8 @@ __global__ void Code2x8MatVec( for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { int4 dec = codebook[i]; - #pragma unroll - for (int j = 0; j < 8; j++) - sh_code[8 * i + (j + lane) % 8] = dec; +#pragma unroll + for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; } __syncthreads(); @@ -161,8 +148,7 @@ __global__ void Code2x8MatVec( // We pad shared memory to avoid bank conflicts during reads __syncthreads(); for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) - sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; } __syncthreads(); b_gl_rd += 32 * 8; @@ -170,13 +156,15 @@ __global__ void Code2x8MatVec( int b_sh_rd = 9 * (threadIdx.x % 32); if (pred && a_gl_rd < a_gl_end) { const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); - #pragma unroll +#pragma unroll for (int i = 0; i < 8; i++) { - half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2* a0 = + reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = + reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); half2 res2 = {}; - #pragma unroll +#pragma unroll for (int j = 0; j < 4; j++) res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); res += __half2float(res2.x) + __half2float(res2.y); @@ -187,36 +175,31 @@ __global__ void Code2x8MatVec( } if (pred) { - #pragma unroll - for (int i = 16; i > 0; i /= 2) - res += __shfl_down_sync(0xffffffff, res, i); +#pragma unroll + for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); if (threadIdx.x % 32 == 0) reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); } } - __global__ void Code1x16Dequant( - const int4* __restrict__ A, - int4* __restrict__ C, - const int4* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 + const int4* __restrict__ A, int4* __restrict__ C, + const int4* __restrict__ codebook, int prob_m, int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long, sums to m. + const int codebook_stride // as int4 ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; - if (pred) - { - // advance to the correct codebook, this easy because we only multiply one column of the codebook. + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. 
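// Editorial sketch, not part of the patch: a host-side rendering of the
// codebook-advance loop that follows. codebook_a_sizes packs cumulative
// output-row counts for up to four codebooks (accumulate_sizes() later in this
// file fills unused slots with unreachable values), so a thread owning output
// row `row` skips forward one codebook_stride per partition until its row
// falls inside the current partition. Int4 and select_codebook are stand-ins
// invented for this sketch; the pointer walk over x/y/z/w mirrors the kernel.
struct Int4 { int x, y, z, w; };

inline const float* select_codebook(const float* codebook, int codebook_stride,
                                    Int4 cumulative_sizes, int row) {
  const int* cumulative = &cumulative_sizes.x;
  while (row >= *cumulative) {
    codebook += codebook_stride;  // jump to the next codebook's entries
    ++cumulative;                 // compare against the next cumulative size
  }
  return codebook;
}
// The loop terminates only because the trailing cumulative sizes are set to
// values no row index can reach, which is why accumulate_sizes() pads them.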
auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) - { - codebook += codebook_stride; - ++codebook_size; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; } } @@ -231,17 +214,15 @@ __global__ void Code1x16Dequant( while (iters--) { if (pred && a_gl_rd < a_gl_end) { const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); - #pragma unroll +#pragma unroll for (int i = 0; i < 8; i++) { int4 chunk; auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't - // actually help us; this brings > 2x speedup. - asm volatile ( - "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*) &codebook[enc[i]]) - ); + // We bypass the L1 cache to avoid massive amounts of memory streaming + // that doesn't actually help us; this brings > 2x speedup. + asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) + : "l"((void*)&codebook[enc[i]])); C[a_gl_rd * 8 + i] = chunk; } @@ -250,28 +231,25 @@ __global__ void Code1x16Dequant( } } - __global__ void Code2x8Dequant( - const int4* __restrict__ A, - int4* __restrict__ C, - const int4* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 + const int4* __restrict__ A, int4* __restrict__ C, + const int4* __restrict__ codebook, int prob_m, int prob_k, + const int4 + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long, corresponds to cols. + const int codebook_stride // as int4 ) { int a_gl_stride = prob_k / 8 / 8; int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); bool pred = a_gl_rd < prob_m; - if (pred) - { - // advance to the correct codebook, this easy because we only multiply one column of the codebook. + if (pred) { + // advance to the correct codebook, this easy because we only multiply one + // column of the codebook. 
auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) - { - codebook += codebook_stride; - ++codebook_size; + while (a_gl_rd >= *codebook_size) { + codebook += codebook_stride; + ++codebook_size; } } @@ -290,9 +268,8 @@ __global__ void Code2x8Dequant( for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { int4 dec = codebook[i]; - #pragma unroll - for (int j = 0; j < 8; j++) - sh_code[8 * i + (j + lane) % 8] = dec; +#pragma unroll + for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; } __syncthreads(); @@ -302,12 +279,14 @@ __global__ void Code2x8Dequant( while (iters--) { if (pred && a_gl_rd < a_gl_end) { const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); - #pragma unroll +#pragma unroll for (int i = 0; i < 8; i++) { int4 chunk; - half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - #pragma unroll + half2* a0 = + reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); + half2* a1 = + reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); +#pragma unroll for (int j = 0; j < 4; j++) reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); C[a_gl_rd * 8 + i] = chunk; @@ -317,22 +296,15 @@ __global__ void Code2x8Dequant( } } -inline int ceildiv(int a, int b) { - return (a + b - 1) / b; -} +inline int ceildiv(int a, int b) { return (a + b - 1) / b; } const int THREAD_M = 16; -void code1x16_matvec_cuda( - const void* __restrict__ A, - const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, - const int codebook_stride -) { +void code1x16_matvec_cuda(const void* __restrict__ A, + const void* __restrict__ B, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, + int prob_k, const int4 codebook_a_sizes, + const int codebook_stride) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); int waves = 0; @@ -345,28 +317,16 @@ void code1x16_matvec_cuda( int blocks = ceildiv(prob_m, thread_m); int threads = 32 * thread_m; cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*) A, - (const int4*) B, - (int4*) C, - (const int4*) codebook, - prob_m, - prob_k, - codebook_a_sizes, - codebook_stride - ); + Code1x16MatVec<<>>( + (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, + prob_k, codebook_a_sizes, codebook_stride); } -void code2x8_matvec_cuda( - const void* __restrict__ A, - const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, - const int codebook_stride -) { +void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, + void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, + int prob_k, const int4 codebook_a_sizes, + const int codebook_stride) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); int waves = 0; @@ -379,30 +339,20 @@ void code2x8_matvec_cuda( int blocks = ceildiv(prob_m, thread_m); int threads = 32 * thread_m; int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute( - Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared - ); + cudaFuncSetAttribute(Code2x8MatVec, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); Code2x8MatVec<<>>( - (const int4*) A, - (const int4*) B, - (int4*) C, - (const int4*) codebook, - prob_m, - 
prob_k, - codebook_a_sizes, - codebook_stride - ); + (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, + prob_k, codebook_a_sizes, codebook_stride); } void code1x16_dequant_cuda( - const void* __restrict__ A, - void* __restrict__ C, - const void* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - const int codebook_stride // as int4. + const void* __restrict__ A, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, int prob_k, + const int4 codebook_a_sizes, // cumulative sizes of A spanning each + // codebook, at most 3 long. + const int codebook_stride // as int4. ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -417,25 +367,21 @@ void code1x16_dequant_cuda( int threads = 32 * thread_m; cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); Code1x16Dequant<<>>( - (const int4*) A, - (int4*) C, - (const int4*) codebook, - prob_m, - prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. - codebook_stride // as int4. + (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long. + codebook_stride // as int4. ); } // Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, - void* __restrict__ C, - const void* __restrict__ codebook, - int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. - const int codebook_stride // as int4 +void code2x8_dequant_cuda( + const void* __restrict__ A, void* __restrict__ C, + const void* __restrict__ codebook, int prob_m, int prob_k, + const int4 + codebook_a_sizes, // cumulative sizes of A spanning each codebook, at + // most 3 long, corresponds to cols. + const int codebook_stride // as int4 ) { int sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); @@ -451,74 +397,50 @@ void code2x8_dequant_cuda( int shared = 16 * (2 * 256 * 8 + 32 * 9); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - cudaFuncSetAttribute( - Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared - ); + cudaFuncSetAttribute(Code2x8Dequant, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared); Code2x8Dequant<<>>( - (const int4*) A, - (int4*) C, - (const int4*) codebook, - prob_m, - prob_k, - codebook_a_sizes, - codebook_stride - ); + (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, + codebook_a_sizes, codebook_stride); } -int codebook_stride(const torch::Tensor& codebooks) -{ +int codebook_stride(const torch::Tensor& codebooks) { return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); } void code1x16_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. + const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, + const torch::Tensor& codebook, + const int4 codebook_a_sizes // cumulative sizes of A spanning each + // codebook, at most 3 long. 
) { const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); - code1x16_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - codebook_stride(codebook) - ); + code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), + codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, + codebook_stride(codebook)); } -torch::Tensor code1x16_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { +torch::Tensor code1x16_matmat(const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const int4 codebook_a_sizes, + const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); + auto flat_output = torch::empty( + {flat_input.size(0), out_features}, + torch::TensorOptions().dtype(input.dtype()).device(input.device())); for (int i = 0; i < flat_input.size(0); ++i) { auto input_vec = flat_input.index({i}); auto output_vec = flat_output.index({i}); - code1x16_matvec( - codes.squeeze(2), - input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); + code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, + codebook_a_sizes); } flat_output *= scales.flatten().unsqueeze(0); @@ -533,55 +455,35 @@ torch::Tensor code1x16_matmat( return output; } -void code2x8_matvec( - const torch::Tensor& A, - const torch::Tensor& B, - torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes -) { +void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, + torch::Tensor& C, const torch::Tensor& codebook, + const int4 codebook_a_sizes) { const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); int prob_m = C.size(0); int prob_k = B.size(0); - code2x8_matvec_cuda( - A.data_ptr(), - B.data_ptr(), - C.data_ptr(), - codebook.data_ptr(), - prob_m, - prob_k, - codebook_a_sizes, - 2 * codebook_stride(codebook) - ); + code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), + codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, + 2 * codebook_stride(codebook)); } -torch::Tensor code2x8_matmat( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias -) { +torch::Tensor code2x8_matmat(const torch::Tensor& input, + const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const int4 codebook_a_sizes, + const std::optional& bias) { auto input_sizes = input.sizes(); auto out_features = codes.size(0) * codebooks.size(2); auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty({flat_input.size(0), out_features}, - torch::TensorOptions() - .dtype(input.dtype()) - .device(input.device()) - ); + auto flat_output = torch::empty( + {flat_input.size(0), out_features}, + torch::TensorOptions().dtype(input.dtype()).device(input.device())); for (int i = 0; i < flat_input.size(0); ++i) { auto input_vec = flat_input.index({i}); auto output_vec = flat_output.index({i}); - code2x8_matvec( - codes.squeeze(2), - 
input_vec, - output_vec, - codebooks, - codebook_a_sizes - ); + code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, + codebook_a_sizes); } flat_output *= scales.flatten().unsqueeze(0); if (bias.has_value()) { @@ -596,64 +498,56 @@ torch::Tensor code2x8_matmat( } // Accumulate the partition sizes. -int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) -{ +int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; int i = 0; int last = 0; assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) - { + for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) { *cumulative_size = codebook_partition_sizes[i].item() + last; last = *cumulative_size; } // fill in the rest with unreachable. - for (; i < 4; ++i, ++cumulative_size) - { - *cumulative_size = last*10; + for (; i < 4; ++i, ++cumulative_size) { + *cumulative_size = last * 10; } return cumulative_sizes; } -} // namespace aqlm -} // namespace vllm - +} // namespace aqlm +} // namespace vllm -torch::Tensor aqlm_gemm( - const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, - const std::optional& bias -) -{ - int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes); +torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& scales, + const torch::Tensor& codebook_partition_sizes, + const std::optional& bias) { + int4 cumulative_sizes = + vllm::aqlm::accumulate_sizes(codebook_partition_sizes); int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); - if (nbooks == 1 && entries == (1 << 16)) - { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); + if (nbooks == 1 && entries == (1 << 16)) { + return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, + cumulative_sizes, bias); } - if (nbooks == 2 && entries == (1 << 8)) - { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); + if (nbooks == 2 && entries == (1 << 8)) { + return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, + cumulative_sizes, bias); } - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, + " entries is not currently supported.") return {}; } -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes -) -{ - int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes); +torch::Tensor aqlm_dequant(const torch::Tensor& codes, + const torch::Tensor& codebooks, + const torch::Tensor& codebook_partition_sizes) { + int4 cumulative_sizes = + vllm::aqlm::accumulate_sizes(codebook_partition_sizes); int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); int const entries = codebooks.size(1); @@ -668,45 +562,37 @@ torch::Tensor aqlm_dequant( assert(out_features = codebook_partition_sizes.sum().item()); auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device()) - ); + torch::TensorOptions() + .dtype(codebooks.dtype()) 
+ .device(codebooks.device())); + + if (nbooks == 1 && entries == (1 << 16)) { + vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), + codebooks.data_ptr(), out_features, + in_features, cumulative_sizes, + vllm::aqlm::codebook_stride(codebooks)); - if (nbooks == 1 && entries == (1 << 16)) - { - vllm::aqlm::code1x16_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) - // weights *= scales.index({"...", 0, 0}); - - return weights; + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower + // and not consistent with gemv implementation.) weights *= + // scales.index({"...", 0, 0}); + + return weights; } - if (nbooks == 2 && entries == (1 << 8)) - { - vllm::aqlm::code2x8_dequant_cuda( - codes.data_ptr(), - weights.data_ptr(), - codebooks.data_ptr(), - out_features, - in_features, - cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) - // weights *= scales.index({"...", 0, 0}); - - return weights; + if (nbooks == 2 && entries == (1 << 8)) { + vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), + codebooks.data_ptr(), out_features, + in_features, cumulative_sizes, + vllm::aqlm::codebook_stride(codebooks)); + + // if you wanted to flip to scaling the weights, (though it's 30%-ish slower + // and not consistent with gemv implementation) weights *= + // scales.index({"...", 0, 0}); + + return weights; } - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") + TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, + " entries is not currently supported.") return {}; } diff --git a/csrc/quantization/awq/dequantize.cuh b/csrc/quantization/awq/dequantize.cuh index d1d926de18d78..813ec6716cf54 100644 --- a/csrc/quantization/awq/dequantize.cuh +++ b/csrc/quantization/awq/dequantize.cuh @@ -1,11 +1,11 @@ /* Adapted from https://github.com/mit-han-lab/llm-awq -Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +Modified from NVIDIA FasterTransformer: +https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h @article{lin2023awq, - title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, - author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, - journal={arXiv}, - year={2023} + title={AWQ: Activation-aware Weight Quantization for LLM Compression and +Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, +Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} } */ @@ -14,74 +14,88 @@ Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransfor namespace vllm { namespace awq { -__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) -{ +__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 assert(false); #else - uint4 result; + uint4 result; - uint32_t* h = 
reinterpret_cast(&result); - uint32_t const i4s = reinterpret_cast(source); + uint32_t* h = reinterpret_cast(&result); + uint32_t const i4s = reinterpret_cast(source); - // First, we extract the i4s and construct an intermediate fp16 number. - static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; - static constexpr uint32_t BOTTOM_MASK = 0x000f000f; - static constexpr uint32_t TOP_MASK = 0x00f000f0; - static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t BOTTOM_MASK = 0x000f000f; + static constexpr uint32_t TOP_MASK = 0x00f000f0; + static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; - // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing - // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. - // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and - // elt_67 to fp16 without having to shift them to the bottom bits before hand. + // Note that the entire sequence only requires 1 shift instruction. This is + // thanks to the register packing format and the fact that we force our + // integers to be unsigned, and account for this in the fp16 subtractions. In + // addition, I exploit the fact that sub and fma have the same throughput in + // order to convert elt_23 and elt_67 to fp16 without having to shift them to + // the bottom bits before hand. - // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue - // immediately before required. - const uint32_t top_i4s = i4s >> 8; - // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[0]) - : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[1]) - : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[2]) - : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[3]) - : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW + // dependency if we issue immediately before required. 
+ const uint32_t top_i4s = i4s >> 8; + // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[0]) + : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[1]) + : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[2]) + : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); + // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[3]) + : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), + "n"(immLut)); - // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the - // half2 ctor. In this case, I chose performance reliability over code readability. + // I use inline PTX below because I am not sure if the compiler will emit + // float2half instructions if I use the half2 ctor. In this case, I chose + // performance reliability over code readability. - // This is the half2 {1032, 1032} represented as an integer. - // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; - // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] - static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; - // This is the half2 {1 / 16, 1 / 16} represented as an integer. - static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; - // This is the half2 {-72, -72} represented as an integer. - // static constexpr uint32_t NEG_72 = 0xd480d480; - // Haotian: Let's use {-64, -64}. - static constexpr uint32_t NEG_64 = 0xd400d400; + // This is the half2 {1032, 1032} represented as an integer. + // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; + // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] + static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; + // This is the half2 {1 / 16, 1 / 16} represented as an integer. + static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; + // This is the half2 {-72, -72} represented as an integer. + // static constexpr uint32_t NEG_72 = 0xd480d480; + // Haotian: Let's use {-64, -64}. + static constexpr uint32_t NEG_64 = 0xd400d400; - // Finally, we construct the output numbers. - // Convert elt_01 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_23 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - // Convert elt_45 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_67 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + // Finally, we construct the output numbers. 
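// Editorial sketch, not part of the patch: a scalar reference for the
// arithmetic that the lop3/sub/fma sequence in this function performs. ORing a
// 4-bit value into the mantissa of the fp16 constant 1024.0 (0x6400) yields
// 1024 + n for a low nibble and 1024 + 16*n for a high nibble, so subtracting
// 1024 (FP16_TOP_MAGIC_NUM) or multiplying by 1/16 (ONE_SIXTEENTH) and adding
// -64 (NEG_64) recovers n. The interleaved output ordering produced by the
// BOTTOM_MASK/TOP_MASK/shift pattern is not reproduced here; the function name
// is invented for the sketch.
#include <cstdint>

inline void dequant_byte_reference(uint8_t byte, float& low, float& high) {
  uint32_t lo_nibble = byte & 0xF;
  uint32_t hi_nibble = (byte >> 4) & 0xF;
  // (0x6400 | n) interpreted as fp16 is 1024 + n; subtract the magic number.
  low = static_cast<float>(1024 + lo_nibble) - 1024.0f;
  // (0x6400 | (n << 4)) interpreted as fp16 is 1024 + 16*n; fma with 1/16 and -64.
  high = static_cast<float>(1024 + 16 * hi_nibble) * (1.0f / 16.0f) - 64.0f;
}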
+ // Convert elt_01 + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(h[0]) + : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_23 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(h[1]) + : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + // Convert elt_45 + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(h[2]) + : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_67 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(h[3]) + : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - return result; + return result; #endif } -} // namespace awq -} // namespace vllm +} // namespace awq +} // namespace vllm diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu index 5aefb0bd16aef..bb8e5bbb23d7f 100644 --- a/csrc/quantization/awq/gemm_kernels.cu +++ b/csrc/quantization/awq/gemm_kernels.cu @@ -1,14 +1,12 @@ /* Adapted from https://github.com/mit-han-lab/llm-awq @article{lin2023awq, - title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, - author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, - journal={arXiv}, - year={2023} + title={AWQ: Activation-aware Weight Quantization for LLM Compression and +Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, +Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} } */ - #include #include @@ -20,26 +18,20 @@ namespace vllm { namespace awq { // Pack two half values. -static inline __device__ __host__ unsigned -__pack_half2(const half x, const half y) { - unsigned v0 = *((unsigned short *)&x); - unsigned v1 = *((unsigned short *)&y); +static inline __device__ __host__ unsigned __pack_half2(const half x, + const half y) { + unsigned v0 = *((unsigned short*)&x); + unsigned v1 = *((unsigned short*)&y); return (v1 << 16) | v0; } -template -__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( - int G, - int split_k_iters, - half* __restrict__ A, - int* __restrict__ B, - half* __restrict__ scaling_factors, - int* __restrict__ zeros, - int M, - int IC, - int OC, - half* __restrict__ C) -{ +template +__global__ void __launch_bounds__(64) + gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters, + half* __restrict__ A, int* __restrict__ B, + half* __restrict__ scaling_factors, + int* __restrict__ zeros, int M, int IC, + int OC, half* __restrict__ C) { // Only support matrix n = 64 or 128 assert(N == 64 || N == 128); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 @@ -70,43 +62,46 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( static constexpr int row_stride = 2 * 32 * 8 / N; bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < N; // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id + bool ld_A_flag = + (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id // bool wb_C_flag = (threadIdx.x / 4) < M; - half* A_ptr = A - + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC - + (((int)threadIdx.x) % (32 / 8)) * 8; - - int* B_ptr = B - + ((int)threadIdx.y) * (OC / 8) * (256 / N) - + (((int)threadIdx.x) / (N / 8)) * (OC / 8) - + (((int)blockIdx_y) % j_factors1) * (N / 8) - + (((int)threadIdx.x) % (N / 8)) * 1; -// Why * 1 in the above line? 
- - half* A_shared_ptr = A_shared - + ((int)threadIdx.y) * row_stride_warp * (32 + 8) - + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) - + (((int)threadIdx.x) % (32 / 8) ) * 8; - - half* B_shared_ptr = B_shared - + ((int)threadIdx.y) * (row_stride / 2) * (N + 8) - + (((int)threadIdx.x) / (N / 8)) * (N + 8) - + (((int)threadIdx.x) % (N / 8)) * 8; - - int* zeros_ptr = zeros - + (((int)blockIdx_y) % j_factors1) * (N / 8) - + ((int)threadIdx.x) % (N / 8); - - half* scaling_factors_ptr = scaling_factors - + (((int)blockIdx_y) % j_factors1) * N - + (((int)threadIdx.x) % (N / 8)) * 8; - - half* C_ptr = C - + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim - + (((int)blockIdx_y) % j_factors1) * N - + ((int)threadIdx.y) * (N / 2) - + (((int)threadIdx.x) % 4) * 2; + half* A_ptr = + A + + (((int)blockIdx_y) / j_factors1 * 16 + + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * + IC + + (((int)threadIdx.x) % (32 / 8)) * 8; + + int* B_ptr = B + ((int)threadIdx.y) * (OC / 8) * (256 / N) + + (((int)threadIdx.x) / (N / 8)) * (OC / 8) + + (((int)blockIdx_y) % j_factors1) * (N / 8) + + (((int)threadIdx.x) % (N / 8)) * 1; + // Why * 1 in the above line? + + half* A_shared_ptr = A_shared + + ((int)threadIdx.y) * row_stride_warp * (32 + 8) + + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) + + (((int)threadIdx.x) % (32 / 8)) * 8; + + half* B_shared_ptr = B_shared + + ((int)threadIdx.y) * (row_stride / 2) * (N + 8) + + (((int)threadIdx.x) / (N / 8)) * (N + 8) + + (((int)threadIdx.x) % (N / 8)) * 8; + + int* zeros_ptr = zeros + (((int)blockIdx_y) % j_factors1) * (N / 8) + + ((int)threadIdx.x) % (N / 8); + + half* scaling_factors_ptr = scaling_factors + + (((int)blockIdx_y) % j_factors1) * N + + (((int)threadIdx.x) % (N / 8)) * 8; + + half* C_ptr = + C + + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim + + (((int)blockIdx_y) % j_factors1) * N + ((int)threadIdx.y) * (N / 2) + + (((int)threadIdx.x) % 4) * 2; // preload s.f. 
and zeros int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; @@ -115,57 +110,83 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; __syncthreads(); // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - if (ld_A_flag) - { + if (ld_A_flag) { *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); - } - else - { + } else { *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); } // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); + uint4 B_loaded_scale = + *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); /* - if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){ - printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); + if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && + threadIdx.y == 0){ printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, + B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, + B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); } */ // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < N / 16; ++ax0_ax1_fused_0) { - // B: 32 x 136 (128+8) float16 // each warp: 32 x 4 - // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 - // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); - // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) - uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); + // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus + // zero -> WB UINT4 + // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * + // 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) + // * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * + // 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * + // 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * + // 8))); row stride in shared memory: (NWARPS * 32 * 8 / cta_N) + uint32_t B_loaded = + *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); + // uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / + // 8)) * 8); - // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); + // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x + // % (cta_N / 8)) * 8); // - zero and * scale - // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = + // q * scale - zero * scale. + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* - if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ - printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); + if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == + 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", + B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); } */ // write back - *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = B_loaded_fp16; + *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = + B_loaded_fp16; } __syncthreads(); @@ -173,112 +194,179 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( { unsigned int addr; __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " + "addr; }\n" + : "=r"(addr) + : "l"((void*)((&(A_shared[(k_0_1 * 16)])) + + (((((int)threadIdx.x) & 15) * 40) + + ((((int)threadIdx.x) >> 4) * 8))))); __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), 
"=r"(((unsigned *)(A_shared_warp + 0))[3]) - : "r"(addr) - ); + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned*)(A_shared_warp + 0))[0]), + "=r"(((unsigned*)(A_shared_warp + 0))[1]), + "=r"(((unsigned*)(A_shared_warp + 0))[2]), + "=r"(((unsigned*)(A_shared_warp + 0))[3]) + : "r"(addr)); } for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) { { unsigned int addr; __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)threadIdx.y) * (N / 2))) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * (N + 8)) + ((((int)threadIdx.x) >> 4) * 8)))) - ); + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " + "addr; }\n" + : "=r"(addr) + : "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + + (((int)threadIdx.y) * (N / 2))) + + (ax1_0 * 16))])) + + (((((int)threadIdx.x) & 15) * (N + 8)) + + ((((int)threadIdx.x) >> 4) * 8))))); __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) - : "r"(addr) - ); + "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[2]), + "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[3]) + : "r"(addr)); } } for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 { __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); } { __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), 
"f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); } { __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); } { __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); } -#else + #else { __asm__ __volatile__( - 
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " + "%13};\n" + : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), + "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), + "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); } { __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " + "%13};\n" + : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) + : "r"(((unsigned*)(A_shared_warp + 0))[0]), + "r"(((unsigned*)(A_shared_warp + 0))[1]), + "r"(((unsigned*)(A_shared_warp + 0))[2]), + "r"(((unsigned*)(A_shared_warp + 0))[3]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), + "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), + "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); } -#endif + #endif } } } -// TODO: Shang: Hoist loop invariance. + // TODO: Shang: Hoist loop invariance. 
for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { for (int local_id = 0; local_id < 8; ++local_id) { - int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; - if (row_offset < M) - { - *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); + int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; + if (row_offset < M) { + *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); } } } #endif } -__global__ void __launch_bounds__(64) dequantize_weights( - int* __restrict__ B, - half* __restrict__ scaling_factors, - int* __restrict__ zeros, - half* __restrict__ C, - int G -) -{ +__global__ void __launch_bounds__(64) + dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors, + int* __restrict__ zeros, half* __restrict__ C, int G) { int j_factors1 = 4; int row_stride2 = 4; int split_k_iters = 1; @@ -310,14 +398,30 @@ __global__ void __launch_bounds__(64) dequantize_weights( uint32_t B_loaded = *(uint32_t*)B_ptr2; uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.x) + : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.y) + : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.z) + : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(B_loaded_fp16.w) + : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); *(uint4*)B_shared_ptr2 = B_loaded_fp16; @@ -326,58 +430,57 @@ __global__ void __launch_bounds__(64) dequantize_weights( } } -} // namespace awq -} // namespace vllm - -torch::Tensor awq_dequantize( - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters, - int thx, - int thy) -{ - int 
in_c = _kernel.size(0); - int qout_c = _kernel.size(1); - int out_c = qout_c * 8; - int G = in_c / _scaling_factors.size(0); - - int x_thread = thx; - int y_thread = thy; - - int x_blocks = 1; - int y_blocks = 1; - if (thx==0) { - x_thread = qout_c; - } - if (thy==0) { - y_thread = in_c; - } - if (thx==0 && thy==0) { - x_thread = 8; - y_thread = 8; - x_blocks = (int)(qout_c / 8); - y_blocks = (int)(in_c / 8); - } +} // namespace awq +} // namespace vllm + +torch::Tensor awq_dequantize(torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, int split_k_iters, int thx, + int thy) { + int in_c = _kernel.size(0); + int qout_c = _kernel.size(1); + int out_c = qout_c * 8; + int G = in_c / _scaling_factors.size(0); + + int x_thread = thx; + int y_thread = thy; + + int x_blocks = 1; + int y_blocks = 1; + if (thx == 0) { + x_thread = qout_c; + } + if (thy == 0) { + y_thread = in_c; + } + if (thx == 0 && thy == 0) { + x_thread = 8; + y_thread = 8; + x_blocks = (int)(qout_c / 8); + y_blocks = (int)(in_c / 8); + } - const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); - auto options = torch::TensorOptions().dtype(_scaling_factors.dtype()).device(_scaling_factors.device()); - at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); + auto options = torch::TensorOptions() + .dtype(_scaling_factors.dtype()) + .device(_scaling_factors.device()); + at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); - auto kernel = reinterpret_cast(_kernel.data_ptr()); - auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); - auto scaling_factors = reinterpret_cast(_scaling_factors.data_ptr()); - auto zeros = reinterpret_cast(_zeros.data_ptr()); + auto kernel = reinterpret_cast(_kernel.data_ptr()); + auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); + auto scaling_factors = + reinterpret_cast(_scaling_factors.data_ptr()); + auto zeros = reinterpret_cast(_zeros.data_ptr()); - dim3 num_blocks(x_blocks, y_blocks); - dim3 threads_per_block(x_thread, y_thread); + dim3 num_blocks(x_blocks, y_blocks); + dim3 threads_per_block(x_thread, y_thread); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - vllm::awq::dequantize_weights<<>>( - kernel, scaling_factors, zeros, de_kernel, G); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + vllm::awq::dequantize_weights<<>>( + kernel, scaling_factors, zeros, de_kernel, G); - return _de_kernel; + return _de_kernel; } // in_feats: M, IC [float16] @@ -386,61 +489,61 @@ torch::Tensor awq_dequantize( // zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b] // assume that batch_size < 16 for now -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters) -{ - int num_in_feats = _in_feats.size(0); - int num_in_channels = _in_feats.size(1); - const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); - - auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device()); - at::Tensor _out_feats = torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); - int num_out_feats = _out_feats.size(-2); - int num_out_channels = _out_feats.size(-1); - - auto in_feats = reinterpret_cast(_in_feats.data_ptr()); - auto kernel = reinterpret_cast(_kernel.data_ptr()); - auto out_feats = reinterpret_cast(_out_feats.data_ptr()); - auto scaling_factors = 
reinterpret_cast(_scaling_factors.data_ptr()); - auto zeros = reinterpret_cast(_zeros.data_ptr()); - int group_size = num_in_channels / _scaling_factors.size(0); - - if (num_out_channels % 64 != 0) - throw std::invalid_argument("OC is not multiple of cta_N = 64"); - if (num_out_channels % 8 != 0) - throw std::invalid_argument("OC is not multiple of pack_num = 8"); - if (group_size % 32 != 0) - throw std::invalid_argument("Group size should be a multiple of 32"); - if (num_out_channels % group_size != 0) - throw std::invalid_argument("OC is not multiple of Group size"); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (num_out_channels % 128 == 0) - { - int j_factors1 = num_out_channels / 128 / 1; - dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); - // threadIdx.x: 32 - // threadIdx.y: i_factors[2] * j_factors[2] - dim3 threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128><<>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, - num_out_channels, out_feats); - } - else if (num_out_channels % 64 == 0) - { - int j_factors1 = num_out_channels / 64 / 1; - dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); - - // threadIdx.x: 32 - // threadIdx.y: i_factors[2] * j_factors[2] - dim3 threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64><<>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, - num_out_channels, out_feats); - } - return _out_feats.sum(0); +torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, + torch::Tensor _scaling_factors, torch::Tensor _zeros, + int split_k_iters) { + int num_in_feats = _in_feats.size(0); + int num_in_channels = _in_feats.size(1); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); + + auto options = torch::TensorOptions() + .dtype(_in_feats.dtype()) + .device(_in_feats.device()); + at::Tensor _out_feats = + torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); + int num_out_feats = _out_feats.size(-2); + int num_out_channels = _out_feats.size(-1); + + auto in_feats = reinterpret_cast(_in_feats.data_ptr()); + auto kernel = reinterpret_cast(_kernel.data_ptr()); + auto out_feats = reinterpret_cast(_out_feats.data_ptr()); + auto scaling_factors = + reinterpret_cast(_scaling_factors.data_ptr()); + auto zeros = reinterpret_cast(_zeros.data_ptr()); + int group_size = num_in_channels / _scaling_factors.size(0); + + if (num_out_channels % 64 != 0) + throw std::invalid_argument("OC is not multiple of cta_N = 64"); + if (num_out_channels % 8 != 0) + throw std::invalid_argument("OC is not multiple of pack_num = 8"); + if (group_size % 32 != 0) + throw std::invalid_argument("Group size should be a multiple of 32"); + if (num_out_channels % group_size != 0) + throw std::invalid_argument("OC is not multiple of Group size"); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (num_out_channels % 128 == 0) { + int j_factors1 = num_out_channels / 128 / 1; + dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); + // threadIdx.x: 32 + // threadIdx.y: i_factors[2] * j_factors[2] + dim3 threads_per_block(32, 2); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128> + <<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, + num_in_feats, num_in_channels, num_out_channels, out_feats); + } else if (num_out_channels % 64 == 0) { + int j_factors1 = 
num_out_channels / 64 / 1; + dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * + split_k_iters); + + // threadIdx.x: 32 + // threadIdx.y: i_factors[2] * j_factors[2] + dim3 threads_per_block(32, 2); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64> + <<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, + num_in_feats, num_in_channels, num_out_channels, out_feats); + } + return _out_feats.sum(0); } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu index 3ec454f78c654..e62fe731a98d3 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu @@ -117,10 +117,10 @@ struct cutlass_2x_gemm { }; template -void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; @@ -136,9 +136,9 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, using StrideC = Stride, Int<0>>; StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto c_ptr = static_cast(out.data_ptr()); + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto c_ptr = static_cast(out.data_ptr()); auto a_scales_ptr = a_scales.data_ptr(); auto b_scales_ptr = b_scales.data_ptr(); @@ -196,10 +196,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, } // namespace -void cutlass_scaled_mm_dq_sm75(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); @@ -223,10 +223,10 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor &out, torch::Tensor const &a, } } -void cutlass_scaled_mm_dq_sm80(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); @@ -250,10 +250,10 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor &out, torch::Tensor const &a, } } -void cutlass_scaled_mm_dq_sm89(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; diff --git 
a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 37b096de23e3b..12efcac7bb919 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -120,10 +120,10 @@ struct cutlass_3x_gemm { }; template -void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; @@ -146,12 +146,12 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, using GemmKernel = typename Gemm::GemmKernel; typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, b_stride}; - auto c_ptr = static_cast(out.data_ptr()); + auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ {}, c_ptr, c_stride, c_ptr, c_stride}; @@ -183,10 +183,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor &out, torch::Tensor const &a, } } // namespace -void cutlass_scaled_mm_dq_sm90(torch::Tensor &out, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu index a4e696d4a3322..dab73ac6c831e 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu @@ -2,29 +2,29 @@ #include #include -void cutlass_scaled_mm_dq_sm75(torch::Tensor &c, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales); +void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq_sm80(torch::Tensor &c, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales); +void cutlass_scaled_mm_dq_sm80(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq_sm89(torch::Tensor &c, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales); +void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq_sm90(torch::Tensor &c, torch::Tensor const &a, - torch::Tensor const &b, - torch::Tensor const &a_scales, - torch::Tensor const &b_scales); +void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + 
torch::Tensor const& a_scales, + torch::Tensor const& b_scales); -void cutlass_scaled_mm_dq(torch::Tensor &c, torch::Tensor const &a, - torch::Tensor const &b, torch::Tensor const &a_scales, - torch::Tensor const &b_scales) { +void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { int32_t major_capability; int32_t minor_capability; cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, @@ -36,14 +36,15 @@ void cutlass_scaled_mm_dq(torch::Tensor &c, torch::Tensor const &a, // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); + b.size(1) == c.size(1)); TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(0) % 16 == 0 && + b.stride(1) % 16 == 0); // 16 Byte Alignment TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); at::cuda::OptionalCUDAGuard const device_guard(device_of(a)); diff --git a/csrc/quantization/fp8/amd/hip_float8.h b/csrc/quantization/fp8/amd/hip_float8.h index 87c7c9ce66100..f9c80fcdec576 100644 --- a/csrc/quantization/fp8/amd/hip_float8.h +++ b/csrc/quantization/fp8/amd/hip_float8.h @@ -1,167 +1,137 @@ #pragma once #ifdef __HIPCC__ -#include + #include #else -#include -#include -#include -#include + #include + #include + #include + #include #endif #include "hip_float8_impl.h" -struct alignas(1) hip_fp8 -{ - struct from_bits_t - { - }; - HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() { return from_bits_t(); } - uint8_t data; - - hip_fp8() = default; - HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default; - HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete; - explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t) - : data(v) - { - } +struct alignas(1) hip_fp8 { + struct from_bits_t {}; + HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + uint8_t data; + + hip_fp8() = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete; + explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t) + : data(v) {} #ifdef __HIP__MI300__ - // NOTE: ON-DEVICE... always optimal bias - explicit HIP_FP8_DEVICE hip_fp8(float v) - : data(hip_fp8_impl::to_fp8_from_fp32(v)) - { - } - - explicit HIP_FP8_DEVICE hip_fp8(_Float16 v) - : hip_fp8(static_cast(v)) - { - } - - // Host only implementation using s/w simulation - explicit HIP_FP8_HOST -#else // __HIP__MI300__ - // both Host and DEVICE for non-MI300 using s/w simulation - explicit HIP_FP8_HOST_DEVICE -#endif // __HIP__MI300__ - hip_fp8(float v) - { - data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/, true /*clip*/>(v); - } - - explicit HIP_FP8_HOST_DEVICE hip_fp8(double v) - : hip_fp8(static_cast(v)) - { - } + // NOTE: ON-DEVICE... 
always optimal bias + explicit HIP_FP8_DEVICE hip_fp8(float v) + : data(hip_fp8_impl::to_fp8_from_fp32(v)) {} + + explicit HIP_FP8_DEVICE hip_fp8(_Float16 v) + : hip_fp8(static_cast(v)) {} + + // Host only implementation using s/w simulation + explicit HIP_FP8_HOST +#else // __HIP__MI300__ + // both Host and DEVICE for non-MI300 using s/w simulation + explicit HIP_FP8_HOST_DEVICE +#endif // __HIP__MI300__ + hip_fp8(float v) { + data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/, + true /*clip*/>(v); + } + + explicit HIP_FP8_HOST_DEVICE hip_fp8(double v) + : hip_fp8(static_cast(v)) {} #ifdef __HIP__MI300__ - // upcast using device specific intrinsic - explicit inline HIP_FP8_DEVICE operator float() const - { - float fval; - uint32_t i32val = static_cast(data); - - // upcast - asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val)); - - return fval; - } - - explicit inline HIP_FP8_HOST operator float() const -#else // __HIP__MI300__ - explicit inline HIP_FP8_HOST_DEVICE operator float() const -#endif // __HIP__MI300__ - { - return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(data); - } + // upcast using device specific intrinsic + explicit inline HIP_FP8_DEVICE operator float() const { + float fval; + uint32_t i32val = static_cast(data); + + // upcast + asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" + : "=v"(fval) + : "v"(i32val)); + + return fval; + } + + explicit inline HIP_FP8_HOST operator float() const +#else // __HIP__MI300__ + explicit inline HIP_FP8_HOST_DEVICE operator float() const +#endif // __HIP__MI300__ + { + return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>( + data); + } }; -namespace std -{ -inline hip_fp8 sin(hip_fp8 a) -{ - return hip_fp8(sinf(float(a))); -} -inline hip_fp8 cos(hip_fp8 a) -{ - return hip_fp8(cosf(float(a))); -} -HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) -{ - return a; -} -} // namespace std +namespace std { +inline hip_fp8 sin(hip_fp8 a) { return hip_fp8(sinf(float(a))); } +inline hip_fp8 cos(hip_fp8 a) { return hip_fp8(cosf(float(a))); } +HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) { return a; } +} // namespace std // Special operator overloading -inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) -{ - return os << float(f8); +inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) { + return os << float(f8); } // all + operator overloading with mixed types -// mixed types, always converts to f32, does computation in f32, and returns float -inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) -{ - return (fa + float(b)); +// mixed types, always converts to f32, does computation in f32, and returns +// float +inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) { + return (fa + float(b)); } -inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) -{ - return (float(a) + fb); +inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) { + return (float(a) + fb); } -inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) -{ - return hip_fp8(float(a) + float(b)); +inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) { + return hip_fp8(float(a) + float(b)); } -inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) -{ - return a = hip_fp8(float(a) + float(b)); +inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) { + return a = hip_fp8(float(a) + float(b)); } // overloading multiplication, 
always returns float, -inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) -{ - return float(a) * float(b); +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) { + return float(a) * float(b); } -inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) -{ - return (a * float(b)); +inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) { + return (a * float(b)); } -inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) -{ - return (float(a) * b); +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) { + return (float(a) * b); } -inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) -{ - return ((float)a * float(b)); +inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) { + return ((float)a * float(b)); } -inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) -{ - return ((float)a * float(b)); +inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) { + return ((float)a * float(b)); } // overloading for compare -inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) -{ - return (a.data == b.data); +inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) { + return (a.data == b.data); } -inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) -{ - return (a.data != b.data); +inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) { + return (a.data != b.data); } -inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) -{ - return static_cast(a) >= static_cast(b); +inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) { + return static_cast(a) >= static_cast(b); } -inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) -{ - return static_cast(a) > static_cast(b); +inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) { + return static_cast(a) > static_cast(b); } diff --git a/csrc/quantization/fp8/amd/hip_float8_impl.h b/csrc/quantization/fp8/amd/hip_float8_impl.h index e05905b4e49e8..90251c3539534 100644 --- a/csrc/quantization/fp8/amd/hip_float8_impl.h +++ b/csrc/quantization/fp8/amd/hip_float8_impl.h @@ -1,316 +1,316 @@ #pragma once -#if defined(__HIPCC__) && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) -#define __HIP__MI300__ +#if defined(__HIPCC__) && \ + (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + #define __HIP__MI300__ #endif #ifdef __HIPCC__ -#define HIP_FP8_HOST_DEVICE __host__ __device__ -#define HIP_FP8_HOST __host__ -#define HIP_FP8_DEVICE __device__ + #define HIP_FP8_HOST_DEVICE __host__ __device__ + #define HIP_FP8_HOST __host__ + #define HIP_FP8_DEVICE __device__ #else -#define HIP_FP8_HOST_DEVICE -#define HIP_FP8_HOST -#define HIP_FP8_DEVICE + #define HIP_FP8_HOST_DEVICE + #define HIP_FP8_HOST + #define HIP_FP8_DEVICE #endif -namespace hip_fp8_impl -{ +namespace hip_fp8_impl { #ifdef __HIP__MI300__ -HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) -{ - uint8_t i8data; - union { - float fval; - uint32_t i32val; - uint8_t i8val[4]; // NOTE: not endian independent - } val; - - uint32_t ival = 0; - val.fval = v; - - if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping - val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); - } - - ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, - false); // false -> WORD0 - val.i32val = ival; - i8data = val.i8val[0]; - - return i8data; +HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) { + uint8_t i8data; + union { + float fval; + uint32_t i32val; + uint8_t i8val[4]; // NOTE: 
not endian independent + } val; + + uint32_t ival = 0; + val.fval = v; + + if ((val.i32val & 0x7F800000) != + 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + + ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, + false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + + return i8data; } -#endif // __HIP__MI300__ +#endif // __HIP__MI300__ -HIP_FP8_HOST inline int clz(uint32_t x) -{ - return __builtin_clz(x); -} +HIP_FP8_HOST inline int clz(uint32_t x) { return __builtin_clz(x); } #if defined(__HIPCC__) || defined(__CUDA_ARCH__) -HIP_FP8_DEVICE inline int clz(uint32_t x) -{ - return __clz(x); -} +HIP_FP8_DEVICE inline int clz(uint32_t x) { return __clz(x); } #endif template -HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, uint32_t rng = 0) -{ +HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, + uint32_t rng = 0) { #ifdef __HIPCC__ - constexpr bool is_half = std::is_same::value; + constexpr bool is_half = std::is_same::value; #else - constexpr bool is_half = false; + constexpr bool is_half = false; #endif - constexpr bool is_float = std::is_same::value; - static_assert(wm + we == 7, "wm+we==7"); - static_assert(is_half || is_float, "Only half and float can be cast to f8"); - - const int mfmt = (sizeof(T) == 4) ? 23 : 10; - uint32_t x; + constexpr bool is_float = std::is_same::value; + static_assert(wm + we == 7, "wm+we==7"); + static_assert(is_half || is_float, "Only half and float can be cast to f8"); + + const int mfmt = (sizeof(T) == 4) ? 23 : 10; + uint32_t x; + if (sizeof(T) == 4) { + x = reinterpret_cast(_x); + } else { + x = reinterpret_cast(_x); + } + + uint32_t head, mantissa; + int exponent, bias; + uint32_t sign; + + if (sizeof(T) == 4) { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } else { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm); + + // Deal with inf and NaNs + if (negative_zero_nan) { if (sizeof(T) == 4) { - x = reinterpret_cast(_x); + if ((x & 0x7F800000) == 0x7F800000) { + return 0x80; + } } else { - x = reinterpret_cast(_x); + // if(__hisinf(x) || __hisnan(x)) + if ((x & 0x7C00) == 0x7C00) { + return 0x80; + } } - - uint32_t head, mantissa; - int exponent, bias; - uint32_t sign; - + } else { if (sizeof(T) == 4) { - head = x & 0xFF800000; - mantissa = x & 0x7FFFFF; - exponent = (head >> 23) & 0xFF; - sign = head >> 31; - bias = 127; + if ((x & 0x7F800000) == 0x7F800000) { + return signed_inf + (mantissa != 0 ? 1 : 0); + } } else { - head = x & 0xFC00; - mantissa = x & 0x3FF; - exponent = (head >> 10) & 0x1F; - sign = head >> 15; - bias = 15; + if ((x & 0x7C00) == 0x7C00) { + return signed_inf + (mantissa != 0 ? 1 : 0); + } } - - uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm); - - // Deal with inf and NaNs - if (negative_zero_nan) { - if (sizeof(T) == 4) { - if ((x & 0x7F800000) == 0x7F800000) { - return 0x80; - } - } else { - // if(__hisinf(x) || __hisnan(x)) - if ((x & 0x7C00) == 0x7C00) { - return 0x80; - } - } - } else { - if (sizeof(T) == 4) { - if ((x & 0x7F800000) == 0x7F800000) { - return signed_inf + (mantissa != 0 ? 1 : 0); - } - } else { - if ((x & 0x7C00) == 0x7C00) { - return signed_inf + (mantissa != 0 ? 
1 : 0); - } - } - } - if (x == 0) { - return 0; - } - - // First need to check if it is normal or denorm as there is a difference of - // implicit 1 Then need to adjust the exponent to align with the F8 exponent, - // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng - // to mantissa and truncate. And for RNE, no need to add rng. Then probably - // need to check whether there is carry and adjust exponent and mantissa again - - // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent - // bits - const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); - const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal - // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) - // f8_exponent is the converted f8 exponent with bias encoding - // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, - // the difference needs to be adjusted and mantissa shifted - int act_exponent, f8_exponent, exponent_diff; - - if (exponent == 0) { // fp32/fp16 is in denormal. - /* fp32 denormal is below 2^-127 so it is usually not a concern here, we + } + if (x == 0) { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of + // implicit 1 Then need to adjust the exponent to align with the F8 exponent, + // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng + // to mantissa and truncate. And for RNE, no need to add rng. Then probably + // need to check whether there is carry and adjust exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent + // bits + const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); + const int f8_denormal_act_exponent = + 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if (exponent == 0) { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16 here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1 */ - act_exponent = exponent - bias + 1; - exponent_diff = f8_denormal_act_exponent - act_exponent; // actual exponent is exponent-bias+1 as it is denormal - } else { // fp32/fp16 is normal with implicit 1 - act_exponent = exponent - bias; - if (act_exponent <= f8_denormal_act_exponent) { - /* This is the case where fp32/fp16 is normal but it is in f8 denormal - range. For example fp8 nanoo mode, denormal exponent is -7, but if the - fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1, - Therefore it needs to be adjust to -6 and mantissa shift right by 1. 
- So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ - exponent_diff = f8_denormal_act_exponent - act_exponent; - } else { // both fp32/fp16 and f8 are in normal range - exponent_diff = 0; // exponent_diff=0 does not mean there is no difference - // for this case, - // act_exponent could be larger. Just that it does not need shift mantissa - } - mantissa += (1 << mfmt); // Add the implicit 1 into mantissa + act_exponent = exponent - bias + 1; + exponent_diff = + f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if (act_exponent <= f8_denormal_act_exponent) { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal +range. For example fp8 nanoo mode, denormal exponent is -7, but if the +fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1, +Therefore it needs to be adjust to -6 and mantissa shift right by 1. +So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } else { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no + // difference for this case, act_exponent could be + // larger. Just that it does not need shift mantissa } - - bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) == - static_cast(1 << (mfmt - wm + exponent_diff - 1)); - /* This part is a bit tricky. The judgment of whether it is a tie needs to be - done before we shift right as shift right could rip off some residual part - and make something not midpoint look like midpoint. For example, the fp16 - number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after - shift right by 4 bits, it would look like midpoint. + mantissa += (1 << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) == + static_cast(1 << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be + done before we shift right as shift right could rip off some residual part + and make something not midpoint look like midpoint. For example, the fp16 + number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after + shift right by 4 bits, it would look like midpoint. */ - if (exponent_diff > 0) { - mantissa >>= exponent_diff; - } else if (exponent_diff == -1) { - mantissa <<= -exponent_diff; + if (exponent_diff > 0) { + mantissa >>= exponent_diff; + } else if (exponent_diff == -1) { + mantissa <<= -exponent_diff; + } + bool implicit_one = mantissa & (1 << mfmt); + // if there is no implicit 1, it means the f8 is denormal and need to adjust + // to denorm exponent + f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + + f8_bias - (implicit_one ? 0 : 1); + + // Now we have the exponent and mantissa adjusted + uint32_t drop_mask = (1 << (mfmt - wm)) - 1; + bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit + // that is not truncated is 1 + mantissa += + (stoch ? rng : (midpoint ? (odd ? 
mantissa : mantissa - 1) : mantissa)) & + drop_mask; + + // Now we deal with overflow + if (f8_exponent == 0) { + if ((1 << mfmt) & mantissa) { + f8_exponent = 1; // denormal overflow to become normal, promote exponent } - bool implicit_one = mantissa & (1 << mfmt); - // if there is no implicit 1, it means the f8 is denormal and need to adjust - // to denorm exponent - f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1); - - // Now we have the exponent and mantissa adjusted - uint32_t drop_mask = (1 << (mfmt - wm)) - 1; - bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit that - // is not truncated is 1 - mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask; - - // Now we deal with overflow - if (f8_exponent == 0) { - if ((1 << mfmt) & mantissa) { - f8_exponent = 1; // denormal overflow to become normal, promote exponent - } - } else { - if ((1 << (mfmt + 1)) & mantissa) { - mantissa >>= 1; - f8_exponent++; - } + } else { + if ((1 << (mfmt + 1)) & mantissa) { + mantissa >>= 1; + f8_exponent++; } + } - mantissa >>= (mfmt - wm); - - // above range: quantize to maximum possible float of the same sign - const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); - if (f8_exponent > max_exp) { - if (clip) { - mantissa = (1 << wm) - 1; - f8_exponent = max_exp; - } else { - return signed_inf; - } - } + mantissa >>= (mfmt - wm); - if (f8_exponent == 0 && mantissa == 0) { - return negative_zero_nan ? 0 : (sign << 7); + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if (f8_exponent > max_exp) { + if (clip) { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } else { + return signed_inf; } - mantissa &= (1 << wm) - 1; - return (sign << 7) | (f8_exponent << wm) | mantissa; + } + + if (f8_exponent == 0 && mantissa == 0) { + return negative_zero_nan ? 0 : (sign << 7); + } + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; } template -inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) -{ +inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) { #ifdef __HIPCC__ - constexpr bool is_half = std::is_same::value; + constexpr bool is_half = std::is_same::value; #else - constexpr bool is_half = false; + constexpr bool is_half = false; #endif - constexpr bool is_float = std::is_same::value; - static_assert(is_half || is_float, "only half and float are supported"); + constexpr bool is_float = std::is_same::value; + static_assert(is_half || is_float, "only half and float are supported"); - constexpr int weo = is_half ? 5 : 8; - constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7); + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 
23 : 7); - T fInf, fNegInf, fNaN, fNeg0; + T fInf, fNegInf, fNaN, fNeg0; #ifdef __HIPCC__ - if (is_half) { - const uint16_t ihInf = 0x7C00; - const uint16_t ihNegInf = 0xFC00; - const uint16_t ihNaN = 0x7C01; - const uint16_t ihNeg0 = 0x8000; - fInf = reinterpret_cast(ihInf); - fNegInf = reinterpret_cast(ihNegInf); - fNaN = reinterpret_cast(ihNaN); - fNeg0 = reinterpret_cast(ihNeg0); - } else + if (is_half) { + const uint16_t ihInf = 0x7C00; + const uint16_t ihNegInf = 0xFC00; + const uint16_t ihNaN = 0x7C01; + const uint16_t ihNeg0 = 0x8000; + fInf = reinterpret_cast(ihInf); + fNegInf = reinterpret_cast(ihNegInf); + fNaN = reinterpret_cast(ihNaN); + fNeg0 = reinterpret_cast(ihNeg0); + } else #endif - if (is_float) { - const uint32_t ifInf = 0x7F800000; - const uint32_t ifNegInf = 0xFF800000; - const uint32_t ifNaN = 0x7F800001; - const uint32_t ifNeg0 = 0x80000000; - fInf = reinterpret_cast(ifInf); - fNegInf = reinterpret_cast(ifNegInf); - fNaN = reinterpret_cast(ifNaN); - fNeg0 = reinterpret_cast(ifNeg0); - } - - if (x == 0) { - return 0; - } - - uint32_t sign = x >> 7; - uint32_t mantissa = x & ((1 << wm) - 1); - int exponent = (x & 0x7F) >> wm; - if (negative_zero_nan) { - if (x == 0x80) { - return fNaN; - } - } else { - if (x == 0x80) { - return fNeg0; - } - if (exponent == ((1 << we) - 1)) { - return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; - } - } - typename std::conditional::type retval; - if (we == 5 && is_half && !negative_zero_nan) { - retval = x << 8; - return reinterpret_cast(retval); + if (is_float) { + const uint32_t ifInf = 0x7F800000; + const uint32_t ifNegInf = 0xFF800000; + const uint32_t ifNaN = 0x7F800001; + const uint32_t ifNeg0 = 0x80000000; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } + + if (x == 0) { + return 0; + } + + uint32_t sign = x >> 7; + uint32_t mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if (negative_zero_nan) { + if (x == 0x80) { + return fNaN; } - - const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0); - - // subnormal input - if (exponent == 0) { - // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above - int sh = 1 + clz(mantissa) - (32 - wm); - mantissa <<= sh; - exponent += 1 - sh; - mantissa &= ((1 << wm) - 1); + } else { + if (x == 0x80) { + return fNeg0; } - exponent += exp_low_cutoff - 1; - mantissa <<= wmo - wm; - - // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) - if (exponent <= 0) { - mantissa |= 1 << wmo; - mantissa >>= 1 - exponent; - exponent = 0; - } - - if (sizeof(T) == 2) { - retval = (sign << 15) | (exponent << 10) | mantissa; - } else { - retval = (sign << 31) | (exponent << 23) | mantissa; + if (exponent == ((1 << we) - 1)) { + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; } + } + typename std::conditional::type retval; + if (we == 5 && is_half && !negative_zero_nan) { + retval = x << 8; return reinterpret_cast(retval); + } + + const int exp_low_cutoff = + (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if (exponent == 0) { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + clz(mantissa) - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if (exponent <= 0) { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if (sizeof(T) == 2) { + retval = (sign << 15) | (exponent << 10) | mantissa; + } else { + retval = (sign << 31) | (exponent << 23) | mantissa; + } + return reinterpret_cast(retval); } -} // namespace hip_fp8_impl +} // namespace hip_fp8_impl diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index df0329f79d361..35123d7fc65d4 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -9,566 +9,567 @@ #include "../../../attention/dtype_float32.cuh" #include "../../../attention/dtype_bfloat16.cuh" -namespace vllm -{ +namespace vllm { #ifdef USE_ROCM namespace fp8 { -#ifdef ENABLE_FP8 + #ifdef ENABLE_FP8 template -__inline__ __device__ Tout vec_conversion(const Tin& x) -{ - return x; +__inline__ __device__ Tout vec_conversion(const Tin& x) { + return x; } template -__inline__ __device__ Tout scaled_vec_conversion(const Tin& x, const float scale) -{ - return x; +__inline__ __device__ Tout scaled_vec_conversion(const Tin& x, + const float scale) { + return x; } // fp8 -> half template <> -__inline__ __device__ uint16_t vec_conversion(const uint8_t& a) -{ - hip_fp8 f8{a, hip_fp8::from_bits()}; - __half_raw res; - res.data = static_cast(f8); - return res.x; +__inline__ __device__ uint16_t +vec_conversion(const uint8_t& a) { + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8); + return res.x; } // fp8x2 -> half2 template <> -__inline__ __device__ uint32_t vec_conversion(const uint16_t& a) -{ -#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) - const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - union { - __half2_raw h2r; - uint32_t ui32; - } tmp; - tmp.h2r.x.data = f2[0]; - tmp.h2r.y.data = f2[1]; - return tmp.ui32; -#else - union { - uint16_t u16[2]; - uint32_t u32; - } tmp; - - tmp.u16[0] = vec_conversion(static_cast(a)); - tmp.u16[1] = vec_conversion(static_cast(a >> 8U)); - return tmp.u32; -#endif +__inline__ __device__ uint32_t +vec_conversion(const uint16_t& a) { + #if defined(__HIP__MI300__) && \ + defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0]; + tmp.h2r.y.data = f2[1]; + return tmp.ui32; + #else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = vec_conversion(static_cast(a)); + tmp.u16[1] = vec_conversion(static_cast(a >> 8U)); + return tmp.u32; + #endif } // fp8x4 -> half2x2 template <> -__inline__ __device__ uint2 vec_conversion(const uint32_t& a) -{ - union { - uint2 u32x2; - uint32_t u32[2]; - } tmp; - tmp.u32[0] = vec_conversion((uint16_t)a); - tmp.u32[1] = vec_conversion((uint16_t)(a >> 16U)); - return tmp.u32x2; +__inline__ __device__ uint2 vec_conversion(const uint32_t& a) { + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = vec_conversion((uint16_t)a); + tmp.u32[1] = vec_conversion((uint16_t)(a >> 16U)); + return tmp.u32x2; } // fp8x8 -> half2x4 template <> -__inline__ __device__ 
uint4 vec_conversion(const uint2& a) -{ - union { - uint4 u64x2; - uint2 u64[2]; - } tmp; - tmp.u64[0] = vec_conversion(a.x); - tmp.u64[1] = vec_conversion(a.y); - return tmp.u64x2; +__inline__ __device__ uint4 vec_conversion(const uint2& a) { + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = vec_conversion(a.x); + tmp.u64[1] = vec_conversion(a.y); + return tmp.u64x2; } using __nv_bfloat16 = __hip_bfloat16; // fp8 -> __nv_bfloat16 template <> -__inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) -{ - hip_fp8 f8{a, hip_fp8::from_bits()}; - float f{f8}; - return __float2bfloat16(f); +__inline__ __device__ __nv_bfloat16 +vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) { + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f); } using __nv_bfloat162 = __hip_bfloat162; // fp8x2 -> __nv_bfloat162 template <> -__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) -{ - __nv_bfloat162 res; - res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); - res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); - return res; +__inline__ __device__ __nv_bfloat162 +vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) { + __nv_bfloat162 res; + res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); + res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); + return res; } // fp8x4 -> bf16_4_t template <> -__inline__ __device__ bf16_4_t vec_conversion(const uint32_t& a) -{ - bf16_4_t res; - res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); - res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); - return res; +__inline__ __device__ bf16_4_t +vec_conversion(const uint32_t& a) { + bf16_4_t res; + res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); + res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); + return res; } // fp8x8 -> bf16_8_t template <> -__inline__ __device__ bf16_8_t vec_conversion(const uint2& a) -{ - bf16_4_t tmp1, tmp2; - tmp1 = vec_conversion(a.x); - tmp2 = vec_conversion(a.y); - bf16_8_t res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; +__inline__ __device__ bf16_8_t vec_conversion(const uint2& a) { + bf16_4_t tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; } // fp8 -> float template <> -__inline__ __device__ float vec_conversion(const uint8_t& a) -{ - hip_fp8 fp8{a, hip_fp8::from_bits()}; - return static_cast(fp8); +__inline__ __device__ float vec_conversion(const uint8_t& a) { + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8); } // fp8x2 -> float2 template <> -__inline__ __device__ float2 vec_conversion(const uint16_t& a) -{ -#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) - float2 res; - const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - res.x = f2[0]; - res.y = f2[1]; - return res; -#else - float2 res; - res.x = vec_conversion(static_cast(a)); - res.y = vec_conversion(static_cast(a >> 8U)); - return res; -#endif +__inline__ __device__ float2 +vec_conversion(const uint16_t& a) { + #if defined(__HIP__MI300__) && \ + defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + res.x = f2[0]; + res.y = f2[1]; + return res; + #else + float2 res; + res.x = vec_conversion(static_cast(a)); + 
res.y = vec_conversion(static_cast(a >> 8U)); + return res; + #endif } // fp8x4 -> float4 template <> -__inline__ __device__ Float4_ vec_conversion(const uint32_t& a) -{ - Float4_ res; - res.x = vec_conversion((uint16_t)a); - res.y = vec_conversion((uint16_t)(a >> 16U)); - return res; +__inline__ __device__ Float4_ +vec_conversion(const uint32_t& a) { + Float4_ res; + res.x = vec_conversion((uint16_t)a); + res.y = vec_conversion((uint16_t)(a >> 16U)); + return res; } // fp8x8 -> float8 template <> -__inline__ __device__ Float8_ vec_conversion(const uint2& a) -{ - Float4_ tmp1, tmp2; - tmp1 = vec_conversion(a.x); - tmp2 = vec_conversion(a.y); - Float8_ res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; +__inline__ __device__ Float8_ vec_conversion(const uint2& a) { + Float4_ tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; } // half -> fp8 template <> -__inline__ __device__ uint8_t vec_conversion(const uint16_t& a) -{ - __half_raw tmp; - tmp.x = a; +__inline__ __device__ uint8_t +vec_conversion(const uint16_t& a) { + __half_raw tmp; + tmp.x = a; - hip_fp8 f8{static_cast(tmp.data)}; - return f8.data; + hip_fp8 f8{static_cast(tmp.data)}; + return f8.data; } // bf16 -> fp8 template <> -__inline__ __device__ uint8_t vec_conversion(const __nv_bfloat16& a) -{ - hip_fp8 res{__bfloat162float(a)}; - return res.data; +__inline__ __device__ uint8_t +vec_conversion(const __nv_bfloat16& a) { + hip_fp8 res{__bfloat162float(a)}; + return res.data; } // float -> fp8 template <> -__inline__ __device__ uint8_t vec_conversion(const float& a) -{ - hip_fp8 f8(a); - return f8.data; +__inline__ __device__ uint8_t vec_conversion(const float& a) { + hip_fp8 f8(a); + return f8.data; } // fp8x4 -> float4 template <> -__inline__ __device__ float4 vec_conversion(const uint32_t& a) -{ - Float4_ tmp = vec_conversion(a); - float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); - return res; +__inline__ __device__ float4 +vec_conversion(const uint32_t& a) { + Float4_ tmp = vec_conversion(a); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; } // float2 -> half2 template <> -__inline__ __device__ uint32_t vec_conversion(const float2& a) -{ - union { - half2 float16; - uint32_t uint32; - }; +__inline__ __device__ uint32_t +vec_conversion(const float2& a) { + union { + half2 float16; + uint32_t uint32; + }; - float16 = __float22half2_rn(a); - return uint32; + float16 = __float22half2_rn(a); + return uint32; } // Float4 -> half2x2 template <> -__inline__ __device__ uint2 vec_conversion(const Float4_& a) -{ - uint2 b; - float2 val; - val.x = a.x.x; - val.y = a.x.y; - b.x = vec_conversion(val); +__inline__ __device__ uint2 vec_conversion(const Float4_& a) { + uint2 b; + float2 val; + val.x = a.x.x; + val.y = a.x.y; + b.x = vec_conversion(val); - val.x = a.y.x; - val.y = a.y.y; - b.y = vec_conversion(val); - return b; + val.x = a.y.x; + val.y = a.y.y; + b.y = vec_conversion(val); + return b; } // Float4 -> float4 template <> -__inline__ __device__ float4 vec_conversion(const Float4_& a) -{ - float4 b; - b.x = a.x.x; - b.y = a.x.y; - b.z = a.y.x; - b.w = a.y.y; - return b; +__inline__ __device__ float4 vec_conversion(const Float4_& a) { + float4 b; + b.x = a.x.x; + b.y = a.x.y; + b.z = a.y.x; + b.w = a.y.y; + return b; } // Float8 -> half2x4 template <> -__inline__ __device__ uint4 vec_conversion(const Float8_& a) -{ - uint4 b; - b.x = 
vec_conversion(a.x); - b.y = vec_conversion(a.y); - b.z = vec_conversion(a.z); - b.w = vec_conversion(a.w); - return b; +__inline__ __device__ uint4 vec_conversion(const Float8_& a) { + uint4 b; + b.x = vec_conversion(a.x); + b.y = vec_conversion(a.y); + b.z = vec_conversion(a.z); + b.w = vec_conversion(a.w); + return b; } // float2 -> bfloat162 template <> -__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) -{ - __nv_bfloat162 b = __float22bfloat162_rn(a); - return b; +__inline__ __device__ __nv_bfloat162 +vec_conversion<__nv_bfloat162, float2>(const float2& a) { + __nv_bfloat162 b = __float22bfloat162_rn(a); + return b; } // Float4 -> bfloat162x2 template <> -__inline__ __device__ bf16_4_t vec_conversion(const Float4_& a) -{ - bf16_4_t b; - b.x = __float22bfloat162_rn(a.x); - b.y = __float22bfloat162_rn(a.y); - return b; +__inline__ __device__ bf16_4_t +vec_conversion(const Float4_& a) { + bf16_4_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + return b; } // Float8 -> bfloat162x4 template <> -__inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) -{ - bf16_8_t b; - b.x = __float22bfloat162_rn(a.x); - b.y = __float22bfloat162_rn(a.y); - b.z = __float22bfloat162_rn(a.z); - b.w = __float22bfloat162_rn(a.w); - return b; +__inline__ __device__ bf16_8_t +vec_conversion(const Float8_& a) { + bf16_8_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + b.z = __float22bfloat162_rn(a.z); + b.w = __float22bfloat162_rn(a.w); + return b; } +/* Scaled and vectorized conversions, for data exchange between high and low + precision domains -/* Scaled and vectorized conversions, for data exchange between high and low precision domains - - Convention of the scale in API, e.g: FP8_data = Quantization( High_Precision_data / scale ) - s.t. - Quantize(HP / scale) => FP8 - Dequant(FP8) * scale => HP + Convention of the scale in API, e.g: FP8_data = Quantization( + High_Precision_data / scale ) s.t. 
Quantize(HP / scale) => FP8 Dequant(FP8) * + scale => HP */ // fp8 -> half template <> -__inline__ __device__ uint16_t scaled_vec_conversion(const uint8_t& a, const float scale) -{ - hip_fp8 f8{a, hip_fp8::from_bits()}; - __half_raw res; - res.data = static_cast(f8) * scale; - return res.x; +__inline__ __device__ uint16_t +scaled_vec_conversion(const uint8_t& a, const float scale) { + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8) * scale; + return res.x; } // fp8x2 -> half2 template <> -__inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, const float scale) -{ -#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) - const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - union { - __half2_raw h2r; - uint32_t ui32; - } tmp; - tmp.h2r.x.data = f2[0] * scale; - tmp.h2r.y.data = f2[1] * scale; - return tmp.ui32; -#else - union { - uint16_t u16[2]; - uint32_t u32; - } tmp; - - tmp.u16[0] = scaled_vec_conversion(static_cast(a), scale); - tmp.u16[1] = scaled_vec_conversion(static_cast(a >> 8U), scale); - return tmp.u32; -#endif +__inline__ __device__ uint32_t scaled_vec_conversion( + const uint16_t& a, const float scale) { + #if defined(__HIP__MI300__) && \ + defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0] * scale; + tmp.h2r.y.data = f2[1] * scale; + return tmp.ui32; + #else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = + scaled_vec_conversion(static_cast(a), scale); + tmp.u16[1] = scaled_vec_conversion( + static_cast(a >> 8U), scale); + return tmp.u32; + #endif } // fp8x4 -> half2x2 template <> -__inline__ __device__ uint2 scaled_vec_conversion(const uint32_t& a, const float scale) -{ - union { - uint2 u32x2; - uint32_t u32[2]; - } tmp; - tmp.u32[0] = scaled_vec_conversion((uint16_t)a, scale); - tmp.u32[1] = scaled_vec_conversion((uint16_t)(a >> 16U), scale); - return tmp.u32x2; +__inline__ __device__ uint2 +scaled_vec_conversion(const uint32_t& a, const float scale) { + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = scaled_vec_conversion((uint16_t)a, scale); + tmp.u32[1] = + scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return tmp.u32x2; } // fp8x8 -> half2x4 template <> -__inline__ __device__ uint4 scaled_vec_conversion(const uint2& a, const float scale) -{ - union { - uint4 u64x2; - uint2 u64[2]; - } tmp; - tmp.u64[0] = scaled_vec_conversion(a.x, scale); - tmp.u64[1] = scaled_vec_conversion(a.y, scale); - return tmp.u64x2; +__inline__ __device__ uint4 +scaled_vec_conversion(const uint2& a, const float scale) { + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = scaled_vec_conversion(a.x, scale); + tmp.u64[1] = scaled_vec_conversion(a.y, scale); + return tmp.u64x2; } using __nv_bfloat16 = __hip_bfloat16; // fp8 -> __nv_bfloat16 template <> -__inline__ __device__ __nv_bfloat16 scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, const float scale) -{ - hip_fp8 f8{a, hip_fp8::from_bits()}; - float f{f8}; - return __float2bfloat16(f * scale); +__inline__ __device__ __nv_bfloat16 +scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, + const float scale) { + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f * scale); } using __nv_bfloat162 = __hip_bfloat162; // fp8x2 -> __nv_bfloat162 template <> -__inline__ __device__ __nv_bfloat162 
scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a, const float scale) -{ - __nv_bfloat162 res; - res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale); - res.y = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U), scale); - return res; +__inline__ __device__ __nv_bfloat162 +scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a, + const float scale) { + __nv_bfloat162 res; + res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale); + res.y = + scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U), scale); + return res; } // fp8x4 -> bf16_4_t template <> -__inline__ __device__ bf16_4_t scaled_vec_conversion(const uint32_t& a, const float scale) -{ - bf16_4_t res; - res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale); - res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U), scale); - return res; +__inline__ __device__ bf16_4_t scaled_vec_conversion( + const uint32_t& a, const float scale) { + bf16_4_t res; + res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale); + res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U), + scale); + return res; } // fp8x8 -> bf16_8_t template <> -__inline__ __device__ bf16_8_t scaled_vec_conversion(const uint2& a, const float scale) -{ - bf16_4_t tmp1, tmp2; - tmp1 = scaled_vec_conversion(a.x, scale); - tmp2 = scaled_vec_conversion(a.y, scale); - bf16_8_t res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; +__inline__ __device__ bf16_8_t +scaled_vec_conversion(const uint2& a, const float scale) { + bf16_4_t tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; } // fp8 -> float template <> -__inline__ __device__ float scaled_vec_conversion(const uint8_t& a, const float scale) -{ - hip_fp8 fp8{a, hip_fp8::from_bits()}; - return static_cast(fp8) * scale; +__inline__ __device__ float scaled_vec_conversion( + const uint8_t& a, const float scale) { + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8) * scale; } // fp8x2 -> float2 template <> -__inline__ __device__ float2 scaled_vec_conversion(const uint16_t& a, const float scale) -{ -#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) - float2 res; - const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - res.x = f2[0] * scale; - res.y = f2[1] * scale; - return res; -#else - float2 res; - res.x = scaled_vec_conversion(static_cast(a), scale); - res.y = scaled_vec_conversion(static_cast(a >> 8U), scale); - return res; -#endif +__inline__ __device__ float2 +scaled_vec_conversion(const uint16_t& a, const float scale) { + #if defined(__HIP__MI300__) && \ + defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + res.x = f2[0] * scale; + res.y = f2[1] * scale; + return res; + #else + float2 res; + res.x = scaled_vec_conversion(static_cast(a), scale); + res.y = scaled_vec_conversion(static_cast(a >> 8U), + scale); + return res; + #endif } // fp8x4 -> float4 template <> -__inline__ __device__ Float4_ scaled_vec_conversion(const uint32_t& a, const float scale) -{ - Float4_ res; - res.x = scaled_vec_conversion((uint16_t)a, scale); - res.y = scaled_vec_conversion((uint16_t)(a >> 16U), scale); - return res; +__inline__ __device__ Float4_ +scaled_vec_conversion(const 
uint32_t& a, const float scale) { + Float4_ res; + res.x = scaled_vec_conversion((uint16_t)a, scale); + res.y = scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return res; } // fp8x8 -> float8 template <> -__inline__ __device__ Float8_ scaled_vec_conversion(const uint2& a, const float scale) -{ - Float4_ tmp1, tmp2; - tmp1 = scaled_vec_conversion(a.x, scale); - tmp2 = scaled_vec_conversion(a.y, scale); - Float8_ res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; +__inline__ __device__ Float8_ +scaled_vec_conversion(const uint2& a, const float scale) { + Float4_ tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; } - /* Quantize(HP / scale) => FP8 */ // TODO(Hai): vectorized to add // half -> fp8 template <> -__inline__ __device__ uint8_t scaled_vec_conversion(const uint16_t& a, const float scale) -{ - __half_raw tmp; - tmp.x = a; +__inline__ __device__ uint8_t +scaled_vec_conversion(const uint16_t& a, const float scale) { + __half_raw tmp; + tmp.x = a; - hip_fp8 f8{static_cast(tmp.data)/scale}; - return f8.data; + hip_fp8 f8{static_cast(tmp.data) / scale}; + return f8.data; } // bf16 -> fp8 template <> -__inline__ __device__ uint8_t scaled_vec_conversion(const __nv_bfloat16& a, const float scale) -{ - hip_fp8 res{__bfloat162float(a)/scale}; - return res.data; +__inline__ __device__ uint8_t scaled_vec_conversion( + const __nv_bfloat16& a, const float scale) { + hip_fp8 res{__bfloat162float(a) / scale}; + return res.data; } // float -> fp8 template <> -__inline__ __device__ uint8_t scaled_vec_conversion(const float& a, const float scale) -{ - hip_fp8 f8(a/scale); - return f8.data; +__inline__ __device__ uint8_t +scaled_vec_conversion(const float& a, const float scale) { + hip_fp8 f8(a / scale); + return f8.data; } // fp8x4 -> float4 template <> -__inline__ __device__ float4 scaled_vec_conversion(const uint32_t& a, const float scale) -{ - Float4_ tmp = scaled_vec_conversion(a, scale); - float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); - return res; +__inline__ __device__ float4 +scaled_vec_conversion(const uint32_t& a, const float scale) { + Float4_ tmp = scaled_vec_conversion(a, scale); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; } -#endif // ENABLE_FP8 + #endif // ENABLE_FP8 template -__inline__ __device__ Tout convert(const Tin &x) { -#ifdef ENABLE_FP8 +__inline__ __device__ Tout convert(const Tin& x) { + #ifdef ENABLE_FP8 if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E4M3) { return vec_conversion(x); } -#endif + #endif assert(false); } template -__inline__ __device__ Tout scaled_convert(const Tin &x, const float scale) { -#ifdef ENABLE_FP8 +__inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) { + #ifdef ENABLE_FP8 if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E4M3) { return scaled_vec_conversion(x, scale); } -#endif + #endif assert(false); } -// The following macro is used to dispatch the conversion function based on the -// data type of the key and value cache. The FN is a macro that calls a function -// with template. 
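As that comment says, DISPATCH_BY_KV_CACHE_DTYPE (defined just below) expands a caller-supplied FN macro with the (source type, cache type, Fp8KVCacheDataType) triple selected from the tensor dtype and the kv-cache dtype string. The following is only a sketch of the intended call pattern: the kernel, its arguments, and the host variables (grid, block, stream, src, dst, scale, n) are hypothetical stand-ins; only the macro shape matches the definition that follows.

    // Sketch only: assumes this header plus the usual torch/CUDA includes are
    // in scope. "example_cache_kernel" and all host variables are hypothetical.
    template <typename scalar_t, typename cache_t,
              vllm::Fp8KVCacheDataType kv_dt>
    __global__ void example_cache_kernel(const scalar_t* __restrict__ src,
                                         cache_t* __restrict__ dst,
                                         const float* __restrict__ scale,
                                         int64_t n) {
      int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
      if (i >= n) return;
      if constexpr (kv_dt == vllm::Fp8KVCacheDataType::kAuto) {
        dst[i] = src[i];  // "auto": cache type == source type, plain copy
      } else {
        // scaled_convert is the templated entry point defined above.
        dst[i] = vllm::fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[i],
                                                                     *scale);
      }
    }

    // FN must be a macro taking (SRC_T, CACHE_T, KV_DTYPE); the dispatcher
    // picks that triple from the runtime dtypes.
    #define CALL_EXAMPLE_KERNEL(SRC_T, CACHE_T, KV_DT)            \
      example_cache_kernel<SRC_T, CACHE_T, KV_DT>                 \
          <<<grid, block, 0, stream>>>(                           \
              static_cast<SRC_T*>(src.data_ptr()),                \
              static_cast<CACHE_T*>(dst.data_ptr()),              \
              scale.data_ptr<float>(), n);

    // Host side, e.g.:
    //   DISPATCH_BY_KV_CACHE_DTYPE(src.scalar_type(), kv_cache_dtype,
    //                              CALL_EXAMPLE_KERNEL);

The TORCH_CHECK fallthroughs in the macro mean an unsupported (dtype, kv-cache string) pair fails loudly at runtime instead of silently doing nothing.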
-#define DISPATCH_BY_KV_CACHE_DTYPE(SRC_DTYPE, KV_DTYPE, FN) \ - if (KV_DTYPE == "auto") { \ - if (SRC_DTYPE == at::ScalarType::Float) { \ - FN(float, float, vllm::Fp8KVCacheDataType::kAuto); \ - } else if (SRC_DTYPE == at::ScalarType::Half) { \ - FN(uint16_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); \ - } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ - FN(__nv_bfloat16, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); \ - } else { \ - TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \ - } \ - } else { \ - if (KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") { \ + // The following macro is used to dispatch the conversion function based on + // the data type of the key and value cache. The FN is a macro that calls a + // function with template. + #define DISPATCH_BY_KV_CACHE_DTYPE(SRC_DTYPE, KV_DTYPE, FN) \ + if (KV_DTYPE == "auto") { \ if (SRC_DTYPE == at::ScalarType::Float) { \ - FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(float, float, vllm::Fp8KVCacheDataType::kAuto); \ } else if (SRC_DTYPE == at::ScalarType::Half) { \ - FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(uint16_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); \ } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ - FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(__nv_bfloat16, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); \ } else { \ TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \ } \ } else { \ - TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ - } \ - } + if (KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ + } else { \ + TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ + } \ + } -} // fp8 -#endif // USE_ROCM -} // namespace vllm +} // namespace fp8 +#endif // USE_ROCM +} // namespace vllm diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 6d9ef4c183bb7..55be3305a9b8c 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -10,17 +10,20 @@ namespace vllm { __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { - float old; - old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : - __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + float old; + old = (value >= 0) + ? 
__int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int*)addr, __float_as_uint(value))); - return old; + return old; } #define FP8_E4M3_MAX std::numeric_limits::max() -template -__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(const scalar_t val, const float scale) { +template +__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion( + const scalar_t val, const float scale) { float x = static_cast(val) / scale; float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); return static_cast(r); @@ -32,11 +35,10 @@ __device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(const scalar // So to get the right answer, *scale needs to be initialized to // a value <= 0.0 and we need to wait for all thread blocks to // finish before consuming *scale. -template -__global__ void segmented_max_reduction( - float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { +template +__global__ void segmented_max_reduction(float* __restrict__ scale, + const scalar_t* __restrict__ input, + int64_t num_elems) { __shared__ float cache[1024]; int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -56,7 +58,7 @@ __global__ void segmented_max_reduction( int ib = blockDim.x / 2; while (ib != 0) { if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; + cache[threadIdx.x] = cache[threadIdx.x + ib]; } __syncthreads(); ib /= 2; @@ -64,16 +66,16 @@ __global__ void segmented_max_reduction( // Finally, since cache[0] contains the maximum for this thread block, // atomically write the max to the target location if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / std::numeric_limits::max()); + atomicMaxFloat(scale, + cache[0] / std::numeric_limits::max()); } } -template -__global__ void scaled_fp8_quant_kernel( - c10::Float8_e4m3fn* __restrict__ out, - const scalar_t* __restrict__ input, - const float* __restrict__ scale, - int64_t num_elems) { +template +__global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out, + const scalar_t* __restrict__ input, + const float* __restrict__ scale, + int64_t num_elems) { int i = blockDim.x * blockIdx.x + threadIdx.x; while (i < num_elems) { out[i] = scaled_fp8_conversion(input[i], *scale); @@ -81,12 +83,11 @@ __global__ void scaled_fp8_quant_kernel( } } -} // namespace vllm +} // namespace vllm -void static_scaled_fp8_quant( - torch::Tensor& out, // [..., d] - torch::Tensor& input, // [..., d] - torch::Tensor& scale) // [1] +void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., d] + torch::Tensor& scale) // [1] { int64_t num_tokens = input.numel() / input.size(-1); int64_t num_elems = input.numel(); @@ -95,21 +96,16 @@ void static_scaled_fp8_quant( const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "scaled_fp8_quant_kernel", - [&] { - vllm::scaled_fp8_quant_kernel<<>>( - out.data_ptr(), - input.data_ptr(), - scale.data_ptr(), - num_elems); + input.scalar_type(), "scaled_fp8_quant_kernel", [&] { + vllm::scaled_fp8_quant_kernel<<>>( + out.data_ptr(), input.data_ptr(), + scale.data_ptr(), num_elems); }); } -void dynamic_scaled_fp8_quant( - torch::Tensor& out, // [..., d] - torch::Tensor& input, // [..., d] - torch::Tensor& scale) // [1] +void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] + torch::Tensor& input, // 
[..., d] + torch::Tensor& scale) // [1] { int64_t num_tokens = input.numel() / input.size(-1); int64_t num_elems = input.numel(); @@ -118,17 +114,11 @@ void dynamic_scaled_fp8_quant( const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "scaled_fp8_quant_kernel", - [&] { - vllm::segmented_max_reduction<<>>( - scale.data_ptr(), - input.data_ptr(), - num_elems); - vllm::scaled_fp8_quant_kernel<<>>( - out.data_ptr(), - input.data_ptr(), - scale.data_ptr(), - num_elems); + input.scalar_type(), "scaled_fp8_quant_kernel", [&] { + vllm::segmented_max_reduction<<>>( + scale.data_ptr(), input.data_ptr(), num_elems); + vllm::scaled_fp8_quant_kernel<<>>( + out.data_ptr(), input.data_ptr(), + scale.data_ptr(), num_elems); }); } diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh index 4eeacf7a6f9d9..cde26dbda18cf 100644 --- a/csrc/quantization/fp8/nvidia/quant_utils.cuh +++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh @@ -10,9 +10,9 @@ namespace vllm { #ifndef USE_ROCM namespace fp8 { -#ifdef ENABLE_FP8 + #ifdef ENABLE_FP8 -#if 0 // Disable the following code to reduce the binary size. + #if 0 // Disable the following code to reduce the binary size. template __inline__ __device__ Tout vec_conversion(const Tin &x, const __nv_fp8_interpretation_t fp8_type) { @@ -177,13 +177,13 @@ __inline__ __device__ uint8_t vec_conversion( template <> __inline__ __device__ uint8_t vec_conversion( const __nv_bfloat16 &a, const __nv_fp8_interpretation_t fp8_type) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); -#else + #else __nv_fp8_storage_t res = __nv_cvt_bfloat16raw_to_fp8( __nv_bfloat16_raw(a), __NV_SATFINITE, fp8_type); return (uint8_t)res; -#endif + #endif } // float -> fp8 @@ -276,7 +276,7 @@ __inline__ __device__ bf16_8_t vec_conversion( from_float(b, a); return b; } -#endif + #endif /* Scaled and vectorized conversions, for data exchange between high and low precision domains Convention of the scale in API, e.g: FP8_data = @@ -286,14 +286,14 @@ __inline__ __device__ bf16_8_t vec_conversion( template __inline__ __device__ Tout scaled_vec_conversion( - const Tin &x, const float scale, const __nv_fp8_interpretation_t fp8_type) { + const Tin& x, const float scale, const __nv_fp8_interpretation_t fp8_type) { return x; } // fp8 -> half template <> __inline__ __device__ uint16_t scaled_vec_conversion( - const uint8_t &a, const float scale, + const uint8_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { __half_raw tmp = __nv_cvt_fp8_to_halfraw(a, fp8_type); return float_to_half(half_to_float(tmp.x) * scale); @@ -302,7 +302,7 @@ __inline__ __device__ uint16_t scaled_vec_conversion( // fp8x2 -> half2 template <> __inline__ __device__ uint32_t scaled_vec_conversion( - const uint16_t &a, const float scale, + const uint16_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { union { uint16_t u16[2]; @@ -317,7 +317,7 @@ __inline__ __device__ uint32_t scaled_vec_conversion( // fp8x4 -> half2x2 template <> __inline__ __device__ uint2 scaled_vec_conversion( - const uint32_t &a, const float scale, + const uint32_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { union { uint2 u32x2; @@ -333,7 +333,7 @@ __inline__ __device__ uint2 scaled_vec_conversion( // fp8x8 -> half2x4 template <> __inline__ __device__ uint4 
-scaled_vec_conversion(const uint2 &a, const float scale, +scaled_vec_conversion(const uint2& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { union { uint4 u64x2; @@ -348,7 +348,7 @@ scaled_vec_conversion(const uint2 &a, const float scale, template <> __inline__ __device__ __nv_bfloat16 scaled_vec_conversion<__nv_bfloat16, uint8_t>( - const uint8_t &a, const float scale, + const uint8_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { // Note there is no direct convert function from fp8 to bf16. // fp8 -> half @@ -362,7 +362,7 @@ scaled_vec_conversion<__nv_bfloat16, uint8_t>( template <> __inline__ __device__ __nv_bfloat162 scaled_vec_conversion<__nv_bfloat162, uint16_t>( - const uint16_t &a, const float scale, + const uint16_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { __nv_bfloat162 res; res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale, @@ -375,7 +375,7 @@ scaled_vec_conversion<__nv_bfloat162, uint16_t>( // fp8x4 -> bf16_4_t template <> __inline__ __device__ bf16_4_t scaled_vec_conversion( - const uint32_t &a, const float scale, + const uint32_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { bf16_4_t res; res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale, @@ -388,7 +388,7 @@ __inline__ __device__ bf16_4_t scaled_vec_conversion( // fp8x8 -> bf16_8_t template <> __inline__ __device__ bf16_8_t scaled_vec_conversion( - const uint2 &a, const float scale, + const uint2& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { bf16_4_t tmp1, tmp2; tmp1 = scaled_vec_conversion(a.x, scale, fp8_type); @@ -404,9 +404,8 @@ __inline__ __device__ bf16_8_t scaled_vec_conversion( // fp8 -> float template <> __inline__ __device__ float scaled_vec_conversion( - const uint8_t &a, const float scale, + const uint8_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { - // fp8 -> half __half_raw res = __nv_cvt_fp8_to_halfraw(a, fp8_type); uint16_t tmp = res.x; @@ -418,7 +417,7 @@ __inline__ __device__ float scaled_vec_conversion( // fp8x2 -> float2 template <> __inline__ __device__ float2 scaled_vec_conversion( - const uint16_t &a, const float scale, + const uint16_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { // fp8x2 -> half2 uint32_t tmp = scaled_vec_conversion(a, scale, fp8_type); @@ -429,7 +428,7 @@ __inline__ __device__ float2 scaled_vec_conversion( // fp8x4 -> float4 template <> __inline__ __device__ Float4_ scaled_vec_conversion( - const uint32_t &a, const float scale, + const uint32_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { Float4_ res; res.x = scaled_vec_conversion((uint16_t)a, scale, fp8_type); @@ -441,7 +440,7 @@ __inline__ __device__ Float4_ scaled_vec_conversion( // fp8x8 -> float8 template <> __inline__ __device__ Float8_ scaled_vec_conversion( - const uint2 &a, const float scale, + const uint2& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { Float4_ tmp1, tmp2; tmp1 = scaled_vec_conversion(a.x, scale, fp8_type); @@ -457,7 +456,7 @@ __inline__ __device__ Float8_ scaled_vec_conversion( // half -> fp8 template <> __inline__ __device__ uint8_t scaled_vec_conversion( - const uint16_t &a, const float scale, + const uint16_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { __nv_fp8_storage_t res = __nv_cvt_float_to_fp8(half_to_float(a) / scale, __NV_SATFINITE, fp8_type); @@ -467,21 +466,21 @@ __inline__ __device__ uint8_t scaled_vec_conversion( // bf16 -> fp8 
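All of the scaled converters in this hunk, including the bf16 -> fp8 path that continues right after this note, follow the convention stated at the top of the scaled-conversion section: FP8 = Quantize(HP / scale) and HP is recovered as Dequant(FP8) * scale. For dynamic quantization, common.cu above derives the per-tensor scale as max|x| / FP8_E4M3_MAX. Below is a minimal, self-contained host-side sketch of that round trip; the clamp used here (fake_fp8_quant, with a hard-coded 448 range) is only a stand-in for the hardware converters, which also round to the FP8 grid.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for the E4M3 range used by FP8_E4M3_MAX.
    constexpr float kFp8E4M3Max = 448.0f;

    // Quantize(HP / scale) -> FP8. Saturation only; the real converters also
    // round to the nearest representable E4M3 value.
    float fake_fp8_quant(float x, float scale) {
      float v = x / scale;
      return std::max(-kFp8E4M3Max, std::min(v, kFp8E4M3Max));
    }

    // Dequant(FP8) * scale -> HP.
    float fake_fp8_dequant(float q, float scale) { return q * scale; }

    int main() {
      std::vector<float> x = {0.01f, -3.5f, 7.25f, -1200.0f};

      // Dynamic per-tensor scale, mirroring segmented_max_reduction in
      // common.cu: scale = max|x| / FP8_E4M3_MAX.
      float amax = 0.0f;
      for (float v : x) amax = std::max(amax, std::fabs(v));
      float scale = amax / kFp8E4M3Max;

      for (float v : x) {
        float q = fake_fp8_quant(v, scale);
        float back = fake_fp8_dequant(q, scale);
        std::printf("x=%9.3f  q=%9.3f  dequant=%9.3f\n", v, q, back);
      }
      return 0;
    }

Dividing by the scale before quantization and multiplying after dequantization is what lets a single float carry the tensor's dynamic range into the 8-bit format.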
template <> __inline__ __device__ uint8_t scaled_vec_conversion( - const __nv_bfloat16 &a, const float scale, + const __nv_bfloat16& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); -#else + #else __nv_fp8_storage_t res = __nv_cvt_float_to_fp8(__bfloat162float(a) / scale, __NV_SATFINITE, fp8_type); return (uint8_t)res; -#endif + #endif } // float -> fp8 template <> __inline__ __device__ uint8_t scaled_vec_conversion( - const float &a, const float scale, + const float& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { __nv_fp8_storage_t res = __nv_cvt_float_to_fp8(a / scale, __NV_SATFINITE, fp8_type); @@ -491,78 +490,81 @@ __inline__ __device__ uint8_t scaled_vec_conversion( // fp8x4 -> float4 template <> __inline__ __device__ float4 scaled_vec_conversion( - const uint32_t &a, const float scale, + const uint32_t& a, const float scale, const __nv_fp8_interpretation_t fp8_type) { Float4_ tmp = scaled_vec_conversion(a, scale, fp8_type); float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); return res; } -#endif // ENABLE_FP8 + #endif // ENABLE_FP8 template -__inline__ __device__ Tout convert(const Tin &x) { -#if 0 // Disable the following code to reduce the binary size. +__inline__ __device__ Tout convert(const Tin& x) { + #if 0 // Disable the following code to reduce the binary size. if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E4M3) { return vec_conversion(x, __NV_E4M3); } else if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E5M2) { return vec_conversion(x, __NV_E5M2); } -#endif + #endif assert(false); } template -__inline__ __device__ Tout scaled_convert(const Tin &x, const float scale) { -#ifdef ENABLE_FP8 +__inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) { + #ifdef ENABLE_FP8 if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E4M3) { return scaled_vec_conversion(x, scale, __NV_E4M3); } else if constexpr (kv_dt == Fp8KVCacheDataType::kFp8E5M2) { return scaled_vec_conversion(x, scale, __NV_E5M2); } -#endif + #endif assert(false); } -// The following macro is used to dispatch the conversion function based on the -// data type of the key and value cache. The FN is a macro that calls a function -// with template. -#define DISPATCH_BY_KV_CACHE_DTYPE(SRC_DTYPE, KV_DTYPE, FN) \ - if (KV_DTYPE == "auto") { \ - if (SRC_DTYPE == at::ScalarType::Float) { \ - FN(float, float, vllm::Fp8KVCacheDataType::kAuto); \ - } else if (SRC_DTYPE == at::ScalarType::Half) { \ - FN(uint16_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); \ - } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ - FN(__nv_bfloat16, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); \ - } else { \ - TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \ - } \ - } else { \ - if (KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") { \ + // The following macro is used to dispatch the conversion function based on + // the data type of the key and value cache. The FN is a macro that calls a + // function with template. 
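The macro below handles the string-keyed, runtime half of this dispatch; the compile-time half is the kv_dt non-type template parameter that convert and scaled_convert above branch on with if constexpr, so each kernel instantiation carries exactly one conversion path. A toy, self-contained sketch of that pattern follows; the enum, function name, and clamp constants are illustrative stand-ins, not the real converters.

    #include <algorithm>
    #include <cstdio>

    // Illustrative stand-in for Fp8KVCacheDataType.
    enum class KvDt { kAuto, kE4M3, kE5M2 };

    // Untaken if-constexpr branches are discarded at compile time, so an
    // instantiation for kAuto never references any FP8 conversion at all.
    template <KvDt kv_dt>
    float toy_scaled_convert(float x, float scale) {
      if constexpr (kv_dt == KvDt::kAuto) {
        return x;  // pass-through, no quantization
      } else if constexpr (kv_dt == KvDt::kE4M3) {
        // stand-in for the __NV_E4M3 intrinsic: saturate to the E4M3 range
        return std::max(-448.0f, std::min(x / scale, 448.0f));
      } else {
        // stand-in for __NV_E5M2: wider range, fewer mantissa bits
        return std::max(-57344.0f, std::min(x / scale, 57344.0f));
      }
    }

    int main() {
      std::printf("auto: %g  e4m3: %g  e5m2: %g\n",
                  toy_scaled_convert<KvDt::kAuto>(3.0f, 0.5f),
                  toy_scaled_convert<KvDt::kE4M3>(3.0f, 0.5f),
                  toy_scaled_convert<KvDt::kE5M2>(3.0f, 0.5f));
      return 0;
    }

In the real templates above, falling through every branch hits assert(false), which is why callers are expected to reach them only through the dispatch macro below.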
+ #define DISPATCH_BY_KV_CACHE_DTYPE(SRC_DTYPE, KV_DTYPE, FN) \ + if (KV_DTYPE == "auto") { \ if (SRC_DTYPE == at::ScalarType::Float) { \ - FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(float, float, vllm::Fp8KVCacheDataType::kAuto); \ } else if (SRC_DTYPE == at::ScalarType::Half) { \ - FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(uint16_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); \ } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ - FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + FN(__nv_bfloat16, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); \ } else { \ TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \ } \ - } else if (KV_DTYPE == "fp8_e5m2") { \ - if (SRC_DTYPE == at::ScalarType::Float) { \ - FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ - } else if (SRC_DTYPE == at::ScalarType::Half) { \ - FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ - } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ - FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ + } else { \ + if (KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ + } else if (KV_DTYPE == "fp8_e5m2") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E5M2); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ } else { \ - TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \ + TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ } \ - } else { \ - TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ - } \ - } + } -} // namespace fp8 -#endif // not USE_ROCM -} // namespace vllm +} // namespace fp8 +#endif // not USE_ROCM +} // namespace vllm diff --git a/csrc/quantization/gptq/compat.cuh b/csrc/quantization/gptq/compat.cuh index 4da0bc6e2df38..1b3fb3d39103f 100644 --- a/csrc/quantization/gptq/compat.cuh +++ b/csrc/quantization/gptq/compat.cuh @@ -9,54 +9,54 @@ namespace vllm { namespace gptq { // atomicAdd for half types, to support CC < 7.x -__device__ __forceinline__ void atomicAdd_half(half* address, half val) -{ - unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; +__device__ __forceinline__ void atomicAdd_half(half* address, half val) { + unsigned int* address_as_ui = + (unsigned int*)((char*)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; - do - { - assumed = old; - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = __hadd(hsum, val); - hsum = __half_raw(tmpres); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } - while (assumed != old); + do { + assumed = old; + __half_raw hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + half tmpres = __hadd(hsum, val); + hsum = __half_raw(tmpres); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) + : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); } // atomicAdd for half2 types -__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) -{ - unsigned int* address_as_ui = (unsigned int*)address; - unsigned int old = *address_as_ui; - unsigned int assumed; - do - { - assumed = old; - half2 old_val = *((half2*)&old); - half2 new_val = __hadd2(old_val, val); - old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); - } - while (assumed != old); +__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) { + unsigned int* address_as_ui = (unsigned int*)address; + unsigned int old = *address_as_ui; + unsigned int assumed; + do { + assumed = old; + half2 old_val = *((half2*)&old); + half2 new_val = __hadd2(old_val, val); + old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); + } while (assumed != old); } // #if defined(__CUDA_ARCH__) || defined(USE_ROCM) -#if __CUDA_ARCH__ < 700 || defined(USE_ROCM) + #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) -__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } +__device__ __forceinline__ void atomicAdd(half* address, half val) { + atomicAdd_half(address, val); +} -#if __CUDA_ARCH__ < 600 || defined(USE_ROCM) -__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } -#endif + #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) +__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { + atomicAdd_half2(address, val); +} + #endif -#endif + #endif #endif } // namespace gptq diff --git a/csrc/quantization/gptq/matrix_view.cuh b/csrc/quantization/gptq/matrix_view.cuh index eda3436eb5375..2b6719fbdc1bc 100644 --- a/csrc/quantization/gptq/matrix_view.cuh +++ b/csrc/quantization/gptq/matrix_view.cuh @@ -1,5 +1,6 @@ /* -Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turboderp/exllama +Adapted from https://github.com/turboderp/exllamav2 and +https://github.com/turboderp/exllama */ #ifndef _matrix_view_cuh @@ -13,260 +14,280 @@ Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turbo namespace vllm { namespace gptq { -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } - - __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const - { - half2* ptr = (half2*) item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = 
__low2half(i01); - items[1] = __high2half(i01); - items[2] = __low2half(i23); - items[3] = __high2half(i23); - } - __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2float(__low2half(i01)); - items[1] = __half2float(__high2half(i01)); - items[2] = __half2float(__low2half(i23)); - items[3] = __half2float(__high2half(i23)); - } - - __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2half2(__low2half(i01)); - items[1] = __half2half2(__high2half(i01)); - items[2] = __half2half2(__low2half(i23)); - items[3] = __half2half2(__high2half(i23)); - } +class MatrixView_half { + public: + const half* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_half(const half* data, const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ half item(int row, int column) const { + return data[row * width + column]; + } + __device__ __forceinline__ half2 item_half2(int row, int column) const { + return ((half2*)data)[(row * width + column) / 2]; + } + __device__ __forceinline__ half2 item_half2half2(int row, int column) const { + return __half2half2(data[row * width + column]); + } + __device__ __forceinline__ const half* item_ptr(int row, int column) const { + return &data[row * width + column]; + } + + __device__ __forceinline__ void item4(half (&items)[4], int row, + int column) const { + half2* ptr = (half2*)item_ptr(row, column); + half2 i01 = ptr[0]; + half2 i23 = ptr[1]; + items[0] = __low2half(i01); + items[1] = __high2half(i01); + items[2] = __low2half(i23); + items[3] = __high2half(i23); + } + __device__ __forceinline__ void item4_f(float (&items)[4], int row, + int column) const { + half2* ptr = (half2*)item_ptr(row, column); + half2 i01 = ptr[0]; + half2 i23 = ptr[1]; + items[0] = __half2float(__low2half(i01)); + items[1] = __half2float(__high2half(i01)); + items[2] = __half2float(__low2half(i23)); + items[3] = __half2float(__high2half(i23)); + } + + __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, + int column) const { + half2* ptr = (half2*)item_ptr(row, column); + half2 i01 = ptr[0]; + half2 i23 = ptr[1]; + items[0] = __half2half2(__low2half(i01)); + items[1] = __half2half2(__high2half(i01)); + items[2] = __half2half2(__low2half(i23)); + items[3] = __half2half2(__high2half(i23)); + } }; -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } - - __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3) - { 
- half2 v01 = __halves2half2(v0, v1); - half2 v23 = __halves2half2(v2, v3); - half2* ptr = (half2*) item_ptr(row, column); - ptr[0] = v01; - ptr[1] = v23; - } +class MatrixView_half_rw { + public: + half* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ half item(int row, int column) const { + return data[row * width + column]; + } + __device__ __forceinline__ half2 item_half2(int row, int column) const { + return ((half2*)data)[(row * width + column) / 2]; + } + __device__ __forceinline__ half* item_ptr(int row, int column) { + return &data[row * width + column]; + } + __device__ __forceinline__ void set(int row, int column, half value) { + data[row * width + column] = value; + } + __device__ __forceinline__ void set_half2(int row, int column, half2 value) { + ((half2*)data)[(row * width + column) / 2] = value; + } + + __device__ __forceinline__ void set4(int row, int column, half v0, half v1, + half v2, half v3) { + half2 v01 = __halves2half2(v0, v1); + half2 v23 = __halves2half2(v2, v3); + half2* ptr = (half2*)item_ptr(row, column); + ptr[0] = v01; + ptr[1] = v23; + } }; -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - items[2] = (d >> 8) & 0x0f; - items[3] = (d >> 12) & 0x0f; - } +class MatrixView_q4_row { + public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, + const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ int item(int row, int column) const { + int shift = (column & 0x07) * 4; + return (data[row * width / 8 + column / 8] >> shift) & 0x0f; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, + int column) const { + int shift = (column & 0x07) * 4; + uint32_t d = data[row * width / 8 + column / 8] >> shift; + items[0] = d & 0x0f; + items[1] = (d >> 4) & 0x0f; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, + int column) const { + int shift = (column & 0x07) * 4; + uint32_t d = data[row * width / 8 + column / 8] >> shift; + items[0] = d & 0x0f; + items[1] = (d >> 4) & 0x0f; + items[2] = (d >> 8) & 0x0f; + items[3] = (d >> 12) & 0x0f; + } }; -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - 
int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } +class MatrixView_q4_column { + public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, + const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ int item(int row, int column) const { + int shift = (row & 0x07) * 4; + return (data[row / 8 * width + column] >> shift) & 0x0f; + } + + __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { + return data[row / 8 * width + column]; + } + __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, + int column) { + return &data[row / 8 * width + column]; + } }; -class MatrixView_q2_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x0f) * 2; - return (data[row * width / 16 + column / 16] >> shift) & 0x03; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x0f) * 2; - uint32_t d = data[row * width / 16 + column / 16] >> shift; - items[0] = d & 0x03; - items[1] = (d >> 2) & 0x03; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x0f) * 2; - uint32_t d = data[row * width / 16 + column / 16] >> shift; - items[0] = d & 0x03; - items[1] = (d >> 2) & 0x03; - items[2] = (d >> 4) & 0x03; - items[3] = (d >> 6) & 0x03; - } +class MatrixView_q2_row { + public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, + const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ int item(int row, int column) const { + int shift = (column & 0x0f) * 2; + return (data[row * width / 16 + column / 16] >> shift) & 0x03; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, + int column) const { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row * width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, + int column) const { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row * width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + items[2] = (d >> 4) & 0x03; + items[3] = (d >> 6) & 0x03; + } }; -class MatrixView_q3_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int z_w = column * 3 / 32; - int z_mod = column & 0x1f; - - if (z_mod == 10) { - return (data[row * width * 3 / 32 + z_w] >> 30) | ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4); - } else if (z_mod == 21) { - return (data[row * width 
* 3 / 32 + z_w] >> 31) | ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6); - } else if (z_mod < 10) { - return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07; - } else if (z_mod < 21) { - return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07; - } else { - return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07; - } +class MatrixView_q3_row { + public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, + const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ int item(int row, int column) const { + int z_w = column * 3 / 32; + int z_mod = column & 0x1f; + + if (z_mod == 10) { + return (data[row * width * 3 / 32 + z_w] >> 30) | + ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4); + } else if (z_mod == 21) { + return (data[row * width * 3 / 32 + z_w] >> 31) | + ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6); + } else if (z_mod < 10) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07; + } else if (z_mod < 21) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07; + } else { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07; } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x1f); - uint32_t d; - if (shift <= 4) { - d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3); - } else if (shift == 8) { - d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8); - } else if (shift <= 16) { - d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32); - } else if (shift == 20) { - d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4); - } else { - d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64); - } - items[0] = d & 0x07; - items[1] = (d >> 3) & 0x07; - items[2] = (d >> 6) & 0x07; - items[3] = (d >> 9) & 0x07; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, + int column) const { + int shift = (column & 0x1f); + uint32_t d; + if (shift <= 4) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3); + } else if (shift == 8) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) | + ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8); + } else if (shift <= 16) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32); + } else if (shift == 20) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) | + ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4); + } else { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64); } + items[0] = d & 0x07; + items[1] = (d >> 3) & 0x07; + items[2] = (d >> 6) & 0x07; + items[3] = (d >> 9) & 0x07; + } }; -class MatrixView_q8_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x03) * 8; - return (data[row * width / 4 + column / 4] >> shift) & 0xff; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x03) * 8; - uint32_t d = data[row * width 
/ 4 + column / 4] >> shift; - items[0] = d & 0xff; - items[1] = (d >> 8) & 0xff; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x03) * 2; - uint32_t d = data[row * width / 4 + column / 4] >> shift; - items[0] = d & 0xff; - items[1] = (d >> 8) & 0xff; - items[2] = (d >> 16) & 0xff; - items[3] = (d >> 24) & 0xff; - } +class MatrixView_q8_row { + public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, + const int height, + const int width) + : data(data), height(height), width(width) {} + + __device__ __forceinline__ int item(int row, int column) const { + int shift = (column & 0x03) * 8; + return (data[row * width / 4 + column / 4] >> shift) & 0xff; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, + int column) const { + int shift = (column & 0x03) * 8; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, + int column) const { + int shift = (column & 0x03) * 2; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + items[2] = (d >> 16) & 0xff; + items[3] = (d >> 24) & 0xff; + } }; } // namespace gptq diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index cc56649917a8a..480c4986c3821 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -1,5 +1,6 @@ /* -Adapted from https://github.com/turboderp/exllamav2 and https://github.com/qwopqwop200/GPTQ-for-LLaMa +Adapted from https://github.com/turboderp/exllamav2 and +https://github.com/qwopqwop200/GPTQ-for-LLaMa */ #include @@ -32,2044 +33,1824 @@ namespace gptq { #define DIVIDE(x, size) (((x) + (size) - 1) / (size)) #if defined(USE_ROCM) -#include -__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, - hipblasOperation_t transA, - hipblasOperation_t transB, - int m, - int n, - int k, - const half* alpha, - const half* AP, - int lda, - const half* BP, - int ldb, - const half* beta, - half* CP, - int ldc) { - return hipblasHgemm(handle, transA, transB, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(AP), lda, - reinterpret_cast(BP), ldb, - reinterpret_cast(beta), - reinterpret_cast(CP), ldc); + #include +__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm( + hipblasHandle_t handle, hipblasOperation_t transA, + hipblasOperation_t transB, int m, int n, int k, const half* alpha, + const half* AP, int lda, const half* BP, int ldb, const half* beta, + half* CP, int ldc) { + return hipblasHgemm(handle, transA, transB, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(AP), lda, + reinterpret_cast(BP), ldb, + reinterpret_cast(beta), + reinterpret_cast(CP), ldc); } -#define hipblasHgemm __compat_hipblasHgemm + #define hipblasHgemm __compat_hipblasHgemm -// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. -#define rocblas_operation_none HIPBLAS_OP_N -#define rocblas_hgemm __compat_hipblasHgemm + // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
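Returning briefly to matrix_view.cuh above before the q_gemm.cu kernels continue: the MatrixView_q*_row accessors index packed sub-byte weights directly out of uint32_t words (eight 4-bit values per word for q4, sixteen 2-bit values for q2, with q3 values straddling word boundaries at columns 10 and 21 of each 32-column group). Below is a small host-side sketch of the 4-bit case, mirroring MatrixView_q4_row::item and item4 under the same little-end-first packing assumption; the function names here are made up for illustration.

    #include <cstdint>
    #include <cstdio>

    // Host-side mirror of MatrixView_q4_row: a row of `width` 4-bit values is
    // stored as width/8 uint32_t words, lowest column in the lowest nibble.
    int q4_item(const uint32_t* row, int column) {
      int shift = (column & 0x07) * 4;  // nibble offset inside the word
      return (row[column / 8] >> shift) & 0x0f;
    }

    // item4 reads four consecutive values from one word access; all four
    // nibbles must come from the same shifted word, so callers pass suitably
    // aligned columns.
    void q4_item4(const uint32_t* row, int column, int (&items)[4]) {
      int shift = (column & 0x07) * 4;
      uint32_t d = row[column / 8] >> shift;
      items[0] = d & 0x0f;
      items[1] = (d >> 4) & 0x0f;
      items[2] = (d >> 8) & 0x0f;
      items[3] = (d >> 12) & 0x0f;
    }

    int main() {
      // Two packed words = one row of 16 4-bit weights: 0, 1, 2, ..., 15.
      uint32_t row[2] = {0x76543210u, 0xfedcba98u};
      int items[4];
      q4_item4(row, 8, items);
      std::printf("item(3)=%d  item4(8..11)=%d %d %d %d\n",
                  q4_item(row, 3), items[0], items[1], items[2], items[3]);
      // prints: item(3)=3  item4(8..11)=8 9 10 11
      return 0;
    }

The q2 and q8 variants follow the same shape with 2- and 8-bit fields; only q3 needs the special cases, because individual 3-bit fields cross the 32-bit word boundaries at those two column positions.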
+ #define rocblas_operation_none HIPBLAS_OP_N + #define rocblas_hgemm __compat_hipblasHgemm #endif -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hadd2(result, g_result); +__forceinline__ __device__ half2 dot22_8(half2 (&dq)[4], const half* a_ptr, + const half2 g_result) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hadd2(result, g_result); } -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __half2float(__low2half(result)) + __half2float(__high2half(result)); +__forceinline__ __device__ float dot22_8_f(half2 (&dq)[4], const half* a_ptr) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __half2float(__low2half(result)) + __half2float(__high2half(result)); } -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +__forceinline__ __device__ half2 dot22_8(half2 (&dq)[4], const half* a_ptr, + const half2 g_result, + const half qs_h) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); } -__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +__forceinline__ __device__ half2 dot22_16(half2 (&dq)[8], const half* a_ptr, + const half2 g_result, + const half qs_h) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); } -__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +__forceinline__ __device__ half2 dot22_32(half2 (&dq)[16], const half* a_ptr, + const half2 g_result, + const half qs_h) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); } -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = 
(const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); +__forceinline__ __device__ float dot22_8_f(half2 (&dq)[4], const half* a_ptr, + const float g_result, + const float qs_f) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = + __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); } -__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); +__forceinline__ __device__ float dot22_16_f(half2 (&dq)[8], const half* a_ptr, + const float g_result, + const float qs_f) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = + __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); } -__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); +__forceinline__ __device__ float dot22_32_f(half2 (&dq)[16], const half* a_ptr, + const float g_result, + const float qs_f) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = + __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); } -__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h) -{ - // Use FP32 accumulator to avoid potential overflow since unscaled weights are in the range -128..127 - - float result = {}; - #pragma unroll - for (int i = 0; i < 4; i++) - { - half2 w01 = dq[i]; - float w0 = __low2float(w01); - float w1 = __high2float(w01); - float x0 = __half2float(*a_ptr++); - float x1 = __half2float(*a_ptr++); - result = fma(w0, x0, result); - result = fma(w1, x1, result); - } - float qs = __half2float(qs_h); - result *= qs; - half result_h = __float2half_rn(result); - return __hadd(result_h, g_result); +__forceinline__ __device__ half dot22_8_h(half2 (&dq)[4], const half* a_ptr, + const half g_result, + const half qs_h) { + // Use FP32 accumulator to avoid potential overflow since unscaled weights are + // in the range -128..127 + + float result = {}; +#pragma unroll + for (int i = 0; i < 4; i++) { + half2 w01 = dq[i]; + float w0 = __low2float(w01); + float w1 = __high2float(w01); + float x0 = __half2float(*a_ptr++); + float x1 = __half2float(*a_ptr++); + result = fma(w0, x0, result); + result = fma(w1, x1, result); + } + float qs = __half2float(qs_h); + result *= qs; + half result_h 
= __float2half_rn(result); + return __hadd(result_h, g_result); } -__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - half result_h = __hadd(__low2half(result), __high2half(result)); - return __hfma(result_h, qs_h, g_result); +__forceinline__ __device__ half dot22_16_h(half2 (&dq)[8], const half* a_ptr, + const half g_result, + const half qs_h) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = __hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); } -__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - half result_h = __hadd(__low2half(result), __high2half(result)); - return __hfma(result_h, qs_h, g_result); +__forceinline__ __device__ half dot22_32_h(half2 (&dq)[16], const half* a_ptr, + const half g_result, + const half qs_h) { + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; +#pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = __hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); } - -typedef void (*fp_gemm_half_q_half_gptq_kernel) -( - const half*, - const uint32_t*, - const uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int* -); - +typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*, + const uint32_t*, const half*, + half*, const int, const int, + const int, const int, + const int*); template -__global__ void gemm_half_q_half_gptq_4bit_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, +__global__ void gemm_half_q_half_gptq_4bit_kernel( + const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int* __restrict__ b_q_perm -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } + const half* __restrict__ b_gptq_scales, half* __restrict__ c, + const int size_m, const int size_n, const int 
size_k, const int groups, + const int* __restrict__ b_q_perm) { + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) { + for (int m = 0; m < m_count; ++m) { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) + a0 = a_ptr[b_q_perm[offset_k + t]]; + else + a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; } + } - // Zero output - if (n >= size_n) return; + // Zero output + if (n >= size_n) return; - if (blockIdx.z == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + if (blockIdx.z == 0) { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 4); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + float scales[4]; + half2 z1z16[4][2]; + half2 y1y16[4][2]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_f(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + + // Column result + float block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_f(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); } - __syncthreads(); - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - int zeros[4]; - float scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - // Column result - float block_c[m_count][4] = {}; - - // Dequantize and multiply - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup 
+= groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][4]; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]); - block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]); - block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]); - block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]); - } - - b_ptr += size_n; - a_ptr += 8; - } - - k += 32; +#pragma unroll + for (int j = 0; j < 4; j++) { + const int4* b_ptr4 = (int4*)b_ptr; + int4 load_int4 = *b_ptr4; + + half2 dq[4][4]; + dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, + false); + dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, + false); + dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, + false); + dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, + false); + +#pragma unroll + for (int m = 0; m < m_count; m++) { + block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], + block_c[m][0]); + block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], + block_c[m][1]); + block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], + block_c[m][2]); + block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], + block_c[m][3]); + } + + b_ptr += size_n; + a_ptr += 8; } - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } + k += 32; + } + + for (int m = 0; m < m_count; m++) { + half2* out = (half2*)c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), + __float2half_rn(block_c[m][1])); + half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), + __float2half_rn(block_c[m][3])); + atomicAdd(out, result01); + atomicAdd(out + 1, result23); + } } template -__global__ void gemm_half_q_half_gptq_2bit_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, +__global__ void gemm_half_q_half_gptq_2bit_kernel( + const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int* __restrict__ b_q_perm -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, 
size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } + const half* __restrict__ b_gptq_scales, half* __restrict__ c, + const int size_m, const int size_n, const int size_k, const int groups, + const int* __restrict__ b_q_perm) { + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) { + for (int m = 0; m < m_count; ++m) { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) + a0 = a_ptr[b_q_perm[offset_k + t]]; + else + a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; } + } - // Zero output - if (n >= size_n) return; + // Zero output + if (n >= size_n) return; - if (blockIdx.z == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + if (blockIdx.z == 0) { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 2); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); } - __syncthreads(); - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - int qk = offset_k / (32 / 2); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - int zeros[4]; - half scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - // Column result - half block_c[m_count][4] = {}; - - // Dequantize and multiply - int k = offset_k; - while 
(k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][8]; - dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); - dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); - dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); - dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); - block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); - block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); - block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); - } - - b_ptr += size_n; - a_ptr += 16; - } - - k += 16; +#pragma unroll + for (int j = 0; j < 1; j++) { + const int4* b_ptr4 = (int4*)b_ptr; + int4 load_int4 = *b_ptr4; + + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); + +#pragma unroll + for (int m = 0; m < m_count; m++) { + block_c[m][0] = + dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = + dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = + dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = + dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + + b_ptr += size_n; + a_ptr += 16; } - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); - half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } + k += 16; + } + + for (int m = 0; m < m_count; m++) { + half2* out = (half2*)c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out, result01); + atomicAdd(out + 1, result23); + } } template -__global__ void gemm_half_q_half_gptq_3bit_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, +__global__ void gemm_half_q_half_gptq_3bit_kernel( + const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int* __restrict__ b_q_perm -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - __shared__ half 
block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } + const half* __restrict__ b_gptq_scales, half* __restrict__ c, + const int size_m, const int size_n, const int size_k, const int groups, + const int* __restrict__ b_q_perm) { + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) { + for (int m = 0; m < m_count; ++m) { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) + a0 = a_ptr[b_q_perm[offset_k + t]]; + else + a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; } + } - // Zero output - if (n >= size_n) return; + // Zero output + if (n >= size_n) return; - if (blockIdx.z == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + if (blockIdx.z == 0) { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / 32 * 3; + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); } - __syncthreads(); - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - int qk = offset_k / 32 * 3; - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - int zeros[4]; - half scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - // Column result - half block_c[m_count][4] = {}; - - // Dequantize and multiply - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_3bit_32(load_int4[0].x, load_int4[1].x, 
load_int4[2].x, dq[0], size_n, zeros[0] + 1); - dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); - dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); - dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); - block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); - block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); - block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); - } - a_ptr += 32; - } - - k += 32; +#pragma unroll + for (int j = 0; j < 1; j++) { + int4 load_int4[3]; + load_int4[0] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[1] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[2] = *((int4*)b_ptr); + b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], + size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], + size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], + size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], + size_n, zeros[3] + 1); + +#pragma unroll + for (int m = 0; m < m_count; m++) { + block_c[m][0] = + dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = + dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = + dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = + dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 32; } - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); - half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } + k += 32; + } + + for (int m = 0; m < m_count; m++) { + half2* out = (half2*)c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out, result01); + atomicAdd(out + 1, result23); + } } template -__global__ void gemm_half_q_half_gptq_8bit_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, +__global__ void gemm_half_q_half_gptq_8bit_kernel( + const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int* __restrict__ b_q_perm -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - __shared__ half 
block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } - } - - // Zero output - if (n >= size_n) return; - - if (blockIdx.z == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + const half* __restrict__ b_gptq_scales, half* __restrict__ c, + const int size_m, const int size_n, const int size_k, const int groups, + const int* __restrict__ b_q_perm) { + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) { + for (int m = 0; m < m_count; ++m) { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) + a0 = a_ptr[b_q_perm[offset_k + t]]; + else + a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; } + } - __syncthreads(); - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - int qk = offset_k / (32 / 8); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - int zeros[4]; - half scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - // Column result - half block_c[m_count][4] = {}; - - // Dequantize and multiply - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4(scales, group, n); - } + // Zero output + if (n >= size_n) return; - #pragma unroll - for (int j = 0; j < 4; j++) - { - int4 load_int4[2]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); - dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); - dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); - dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); - block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); - block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); - block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); - } - a_ptr += 8; - } - k += 32; + if (blockIdx.z == 0) { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int 
nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 8); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); } - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); - half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); +#pragma unroll + for (int j = 0; j < 4; j++) { + int4 load_int4[2]; + load_int4[0] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[1] = *((int4*)b_ptr); + b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, + zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, + zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, + zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, + zeros[3] + 1); + + for (int m = 0; m < m_count; m++) { + block_c[m][0] = + dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = + dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = + dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = + dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 8; } + k += 32; + } + + for (int m = 0; m < m_count; m++) { + half2* out = (half2*)c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out, result01); + atomicAdd(out + 1, result23); + } } fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel( - bool first_block, const int m_count, const int bit) -{ - #define SELECT_KERNEL(M_COUNT) \ - if (m_count == M_COUNT) { \ - if (bit == 2) return gemm_half_q_half_gptq_2bit_kernel; \ - if (bit == 3) return gemm_half_q_half_gptq_3bit_kernel; \ - if (bit == 4) return gemm_half_q_half_gptq_4bit_kernel; \ - if (bit == 8) return gemm_half_q_half_gptq_8bit_kernel; \ - } - #if BLOCK_M_SIZE_MAX >= 1 - SELECT_KERNEL(1); - #endif - #if BLOCK_M_SIZE_MAX >= 2 - SELECT_KERNEL(2); - #endif - #if BLOCK_M_SIZE_MAX >= 3 - SELECT_KERNEL(3); - #endif - #if BLOCK_M_SIZE_MAX >= 4 - SELECT_KERNEL(4); - #endif - #if BLOCK_M_SIZE_MAX >= 5 - SELECT_KERNEL(5); - #endif - #if BLOCK_M_SIZE_MAX >= 6 - SELECT_KERNEL(6); - #endif - #if BLOCK_M_SIZE_MAX >= 7 - SELECT_KERNEL(7); - #endif - #if BLOCK_M_SIZE_MAX >= 8 - SELECT_KERNEL(8); - #endif - return NULL; + bool first_block, const int m_count, const int bit) { +#define SELECT_KERNEL(M_COUNT) \ + if (m_count == M_COUNT) { \ + if (bit == 2) return gemm_half_q_half_gptq_2bit_kernel; \ + if (bit == 3) return gemm_half_q_half_gptq_3bit_kernel; \ + if (bit == 4) return gemm_half_q_half_gptq_4bit_kernel; \ + if (bit == 8) return gemm_half_q_half_gptq_8bit_kernel; \ + } +#if BLOCK_M_SIZE_MAX >= 1 + SELECT_KERNEL(1); +#endif +#if BLOCK_M_SIZE_MAX >= 2 + SELECT_KERNEL(2); +#endif +#if BLOCK_M_SIZE_MAX >= 3 + 
SELECT_KERNEL(3); +#endif +#if BLOCK_M_SIZE_MAX >= 4 + SELECT_KERNEL(4); +#endif +#if BLOCK_M_SIZE_MAX >= 5 + SELECT_KERNEL(5); +#endif +#if BLOCK_M_SIZE_MAX >= 6 + SELECT_KERNEL(6); +#endif +#if BLOCK_M_SIZE_MAX >= 7 + SELECT_KERNEL(7); +#endif +#if BLOCK_M_SIZE_MAX >= 8 + SELECT_KERNEL(8); +#endif + return NULL; } +void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, const int* b_q_perm, + half* c, int size_m, int size_n, int size_k, + int m_count, int groups, int bit) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); + gridDim.y = DIVIDE(size_m, m_count); + gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + + fp_gemm_half_q_half_gptq_kernel kernel = + pick_gemm_half_q_half_gptq_kernel(true, m_count, bit); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>>(a, b_q_weight, b_gptq_qzeros, + b_gptq_scales, c, size_m, size_n, + size_k, groups, b_q_perm); +} -void gemm_half_q_half_cuda_part -( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - int groups, - int bit -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); +__global__ void reconstruct_exllama_8bit_kernel( + const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, const int size_k, const int size_n, + const int groups, half* __restrict__ b) { + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count, bit); + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - a, - b_q_weight, - b_gptq_qzeros, - b_gptq_scales, - c, - size_m, - size_n, - size_k, - groups, - b_q_perm - ); -} + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; -__global__ void reconstruct_exllama_8bit_kernel -( - const uint32_t* __restrict__ b_q_weight, - const int* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - const int size_k, - const int size_n, - const int groups, - half* __restrict__ b -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } + if (b_q_perm) { + if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; + } - // Column - int n = offset_n + t * 4; - if (n >= size_n) return; + // Column + int n = 
offset_n + t * 4; + if (n >= size_n) return; - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; - // b offset - int qk = offset_k / (32 / 8); + // b offset + int qk = offset_k / (32 / 8); - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - // Initial zeros/scale - int zeros[4]; - half2 scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); - __syncthreads(); + __syncthreads(); - int k = offset_k; - int lk = 0; + int k = offset_k; + int lk = 0; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - } + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } - for (int p = 0; p < 4; p++) - { - int4 load_int4[2]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); - dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); - dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); - dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); - - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } + for (int p = 0; p < 4; p++) { + int4 load_int4[2]; + load_int4[0] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[1] = *((int4*)b_ptr); + b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, + zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, + zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, + zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, + zeros[3] + 1); + + // half* dqh = (half*)dq; + if (b_q_perm) { + for (int j = 0; j < 4; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), + __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), + __high2half(dq[2][j]), __high2half(dq[3][j])); } - k += 32; + } else { + for (int j = 0; j < 4; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + 
b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), + __low2half(dq[1][j]), __low2half(dq[2][j]), + __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), + __high2half(dq[1][j]), __high2half(dq[2][j]), + __high2half(dq[3][j])); + } + } } + k += 32; + } } -__global__ void reconstruct_exllama_4bit_kernel -( - const uint32_t* __restrict__ b_q_weight, - const int* __restrict__ b_q_perm, +__global__ void reconstruct_exllama_4bit_kernel( + const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - const int size_k, - const int size_n, - const int groups, - half* __restrict__ b -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; + const half* __restrict__ b_gptq_scales, const int size_k, const int size_n, + const int groups, half* __restrict__ b) { + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) { + if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / (32 / 4); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + half2 z1z16[4][2]; + half2 y1y16[4][2]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); } - // Column - int n = offset_n + t * 4; - if (n >= size_n) return; - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // b offset - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - // Initial zeros/scale - int zeros[4]; - half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - 
b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - __syncthreads(); - - int k = offset_k; - int lk = 0; - - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + for (int p = 0; p < 4; p++) { + half2 dq[4][4]; + const int4* b_ptr4 = (int4*)b_ptr; + int4 load_int4 = *b_ptr4; + + dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, + false); + dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, + false); + dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, + false); + dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, + false); + + b_ptr += size_n; + // half* dqh = (half*)dq; + if (b_q_perm) { + for (int j = 0; j < 4; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), + __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), + __high2half(dq[2][j]), __high2half(dq[3][j])); } - - for (int p = 0; p < 4; p++) - { - half2 dq[4][4]; - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - b_ptr += size_n; - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } + } else { + for (int j = 0; j < 4; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), + __low2half(dq[1][j]), __low2half(dq[2][j]), + __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), + __high2half(dq[1][j]), __high2half(dq[2][j]), + __high2half(dq[3][j])); } - k += 32; + } } + k += 32; + } } -__global__ void reconstruct_exllama_3bit_kernel -( - const uint32_t* __restrict__ b_q_weight, - const int* __restrict__ b_q_perm, +__global__ void reconstruct_exllama_3bit_kernel( + const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, const uint32_t* __restrict__ b_gptq_qzeros, - const half* 
__restrict__ b_gptq_scales, - const int size_k, - const int size_n, - const int groups, - half* __restrict__ b -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } + const half* __restrict__ b_gptq_scales, const int size_k, const int size_n, + const int groups, half* __restrict__ b) { + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - // Column - int n = offset_n + t * 4; - if (n >= size_n) return; + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - // b offset - int qk = offset_k / 32* 3; + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + if (b_q_perm) { + if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; + } - // Initial zeros/scale - int zeros[4]; - half2 scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; - __syncthreads(); + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; - int k = offset_k; - int lk = 0; + // b offset + int qk = offset_k / 32 * 3; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - } + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + + __syncthreads(); - for (int p = 0; p < 1; p++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1); - dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); - dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); - dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); - - if (b_q_perm) - { - for (int j = 0; j < 16; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 16; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), 
__low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } + int k = offset_k; + int lk = 0; + + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } + + for (int p = 0; p < 1; p++) { + int4 load_int4[3]; + load_int4[0] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[1] = *((int4*)b_ptr); + b_ptr += size_n; + load_int4[2] = *((int4*)b_ptr); + b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], + size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], + size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], + size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], + size_n, zeros[3] + 1); + + if (b_q_perm) { + for (int j = 0; j < 16; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), + __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), + __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } else { + for (int j = 0; j < 16; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), + __low2half(dq[1][j]), __low2half(dq[2][j]), + __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), + __high2half(dq[1][j]), __high2half(dq[2][j]), + __high2half(dq[3][j])); } - k += 32; + } } + k += 32; + } } -__global__ void reconstruct_exllama_2bit_kernel -( - const uint32_t* __restrict__ b_q_weight, - const int* __restrict__ b_q_perm, +__global__ void reconstruct_exllama_2bit_kernel( + const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - const int size_k, - const int size_n, - const int groups, - half* __restrict__ b -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } + const half* __restrict__ b_gptq_scales, const int size_k, const int size_n, + const int groups, half* __restrict__ b) { + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - // Column - int n = offset_n + t * 4; - if (n >= size_n) return; + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - // b offset - int qk = offset_k / (32 / 2); + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; - const uint32_t* b_ptr = 
b_q_weight + qk * size_n + n; + if (b_q_perm) { + if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; + } - // Initial zeros/scale - int zeros[4]; - half2 scales[4]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; - __syncthreads(); + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; - int k = offset_k; - int lk = 0; + // b offset + int qk = offset_k / (32 / 2); - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - } + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - for (int p = 0; p < 2; p++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][8]; - dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); - dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); - dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); - dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); - - b_ptr += size_n; - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 8; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 8; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - } - k += 32; - } -} + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); -void reconstruct_exllama -( - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* out, - int height, - int width, - int groups, - int bit -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + __syncthreads(); - auto reconstruct_exllama_kernel = reconstruct_exllama_4bit_kernel; - if (bit == 2) { - reconstruct_exllama_kernel = reconstruct_exllama_2bit_kernel; - } else if (bit == 3) { - reconstruct_exllama_kernel = reconstruct_exllama_3bit_kernel; - } else if (bit == 8) { - reconstruct_exllama_kernel = reconstruct_exllama_8bit_kernel; + int k = offset_k; + int lk = 0; + + while (k < end_k) { + if (k == nextgroup) { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); } - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - reconstruct_exllama_kernel<<>> - ( - b_q_weight, - b_q_perm, - b_gptq_qzeros, - b_gptq_scales, - height, - width, - groups, - out - ); + for (int p = 0; p < 2; p++) { + const int4* b_ptr4 = (int4*)b_ptr; + int4 load_int4 = *b_ptr4; + + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + 
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); + + b_ptr += size_n; + // half* dqh = (half*)dq; + if (b_q_perm) { + for (int j = 0; j < 8; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), + __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), + __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } else { + for (int j = 0; j < 8; j++) { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), + __low2half(dq[1][j]), __low2half(dq[2][j]), + __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), + __high2half(dq[1][j]), __high2half(dq[2][j]), + __high2half(dq[3][j])); + } + } + } + k += 32; + } } +void reconstruct_exllama(const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, const int* b_q_perm, + half* out, int height, int width, int groups, + int bit) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); + gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + + auto reconstruct_exllama_kernel = reconstruct_exllama_4bit_kernel; + if (bit == 2) { + reconstruct_exllama_kernel = reconstruct_exllama_2bit_kernel; + } else if (bit == 3) { + reconstruct_exllama_kernel = reconstruct_exllama_3bit_kernel; + } else if (bit == 8) { + reconstruct_exllama_kernel = reconstruct_exllama_8bit_kernel; + } + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + reconstruct_exllama_kernel<<>>( + b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups, + out); +} __global__ void gemm_half_q_half_alt_4bit_kernel( - const half2* __restrict__ vec, - const uint32_t* __restrict__ mat, - half* __restrict__ mul, - const half* __restrict__ scales, - const uint32_t* __restrict__ zeros, - const int* __restrict__ g_idx, - int batch, - int height, - int width -) -{ - int zero_width = width / 8; - int vec_height = height * 4; - const int blockwidth2 = BLOCK_KN_SIZE / 2; - int b = blockIdx.y * BLOCK_M_SIZE_MAX; - int b_end = min(BLOCK_M_SIZE_MAX, batch - b); - int h = BLOCK_KN_SIZE * blockIdx.z / 8; - int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4; - int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - - __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; - if (threadIdx.x < h_end) { - for (int m = 0; m < b_end; ++m) { - blockvec[m][threadIdx.x] = - vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + - threadIdx.x]; - } + const half2* __restrict__ vec, const uint32_t* __restrict__ mat, + half* __restrict__ mul, const half* __restrict__ scales, + const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx, + int batch, int height, int width) { + int zero_width = width / 8; + int vec_height = height * 4; + const int blockwidth2 = BLOCK_KN_SIZE / 2; + int b = blockIdx.y * BLOCK_M_SIZE_MAX; + int b_end = min(BLOCK_M_SIZE_MAX, batch - b); + int h = BLOCK_KN_SIZE * blockIdx.z / 8; + int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4; + int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + + __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; + if (threadIdx.x < h_end) { + for (int m = 0; m < b_end; ++m) { + blockvec[m][threadIdx.x] = + vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + + threadIdx.x]; } - - __shared__ half2 deq2[256][8]; - int val = threadIdx.x / 8; - int off = threadIdx.x % 8; - for (; val < 256; val += 
BLOCK_KN_SIZE / 8) { - deq2[val][off] = __halves2half2( - __int2half_rn(val & 0xF), __int2half_rn(val >> 4) - ); + } + + __shared__ half2 deq2[256][8]; + int val = threadIdx.x / 8; + int off = threadIdx.x % 8; + for (; val < 256; val += BLOCK_KN_SIZE / 8) { + deq2[val][off] = + __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4)); + } + + if (blockIdx.z == 0) { + for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0); + } + __syncthreads(); + + int i = width * h + w; + int g_h = h * 8; + int k = 0; + int z_w = w / 8; + int z_mod = (w % 8) * 4; + half2 res2; + half res[BLOCK_M_SIZE_MAX] = {}; + + unsigned int tmp; + while (k < h_end) { + tmp = mat[i]; + half2 scales_tmp[4]; + half2 zeros_tmp[4]; + for (int tmp_k = 0; tmp_k < 4; tmp_k++) { + int g = g_idx[g_h + (k + tmp_k) * 2]; + int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; + half scale_f = scales[g * width + w]; + half scale_f2 = scales[g2 * width + w]; + half2 scale = __halves2half2(scale_f, scale_f2); + half2 zero = __halves2half2( + __hmul(scale_f, + __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) - + 1)), + __hmul(scale_f2, + __int2half_rn( + -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1))); + scales_tmp[tmp_k] = scale; + zeros_tmp[tmp_k] = zero; } - - if (blockIdx.z == 0) - { - for (int m = 0; m < b_end; m++) - mul[(b + m) * width + w] = __int2half_rn(0); - } - __syncthreads(); - - int i = width * h + w; - int g_h = h * 8; - int k = 0; - int z_w = w / 8; - int z_mod = (w % 8) * 4; - half2 res2; - half res[BLOCK_M_SIZE_MAX] = {}; - - unsigned int tmp; - while (k < h_end) { - tmp = mat[i]; - half2 scales_tmp[4]; - half2 zeros_tmp[4]; - for (int tmp_k = 0; tmp_k < 4; tmp_k++) { - int g = g_idx[g_h + (k + tmp_k) * 2]; - int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; - half scale_f = scales[g * width + w]; - half scale_f2 = scales[g2 * width + w]; - half2 scale = __halves2half2(scale_f, scale_f2); - half2 zero = __halves2half2( - __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) - 1)), - __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)) - ); - scales_tmp[tmp_k] = scale; - zeros_tmp[tmp_k] = zero; - } - for (int m = 0; m < b_end; m++) { + for (int m = 0; m < b_end; m++) { #ifndef USE_ROCM - res2 = {}; + res2 = {}; #else - res2.x = __half_as_ushort(__float2half(0)); - res2.y = __half_as_ushort(__float2half(0)); + res2.x = __half_as_ushort(__float2half(0)); + res2.y = __half_as_ushort(__float2half(0)); #endif - res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), blockvec[m][k + 2], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), blockvec[m][k + 3], res2); + res2 = __hfma2( + __hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), + blockvec[m][k + 0], res2); + res2 = __hfma2( + __hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), + blockvec[m][k + 1], res2); + res2 = __hfma2( + __hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), + blockvec[m][k + 2], res2); + res2 = __hfma2( + __hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), + blockvec[m][k + 3], res2); #ifndef USE_ROCM - res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); + res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); 
#else - res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); + res[m] = __hadd( + res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); #endif - } - i += width; - k += 4; - } - for (int m = 0; m < b_end; m++) { - atomicAdd(&mul[(b + m) * width + w], res[m]); } + i += width; + k += 4; + } + for (int m = 0; m < b_end; m++) { + atomicAdd(&mul[(b + m) * width + w], res[m]); + } } - __global__ void gemm_half_q_half_alt_8bit_kernel( - const half2* __restrict__ vec, - const uint32_t* __restrict__ mat, - half* __restrict__ mul, - const half* __restrict__ scales, - const uint32_t* __restrict__ zeros, - const int* __restrict__ g_idx, - int batch, - int height, - int width -) -{ - int zero_width = width / 4; - int vec_height = height * 2; - const int blockwidth2 = BLOCK_KN_SIZE / 2; - int b = blockIdx.y * BLOCK_M_SIZE_MAX; - int b_end = min(BLOCK_M_SIZE_MAX, batch - b); - int h = BLOCK_KN_SIZE * blockIdx.z / 4; - int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; - int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - - __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; - if (threadIdx.x < h_end) { - for (int m = 0; m < b_end; ++m) { - blockvec[m][threadIdx.x] = - vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + - threadIdx.x]; - } + const half2* __restrict__ vec, const uint32_t* __restrict__ mat, + half* __restrict__ mul, const half* __restrict__ scales, + const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx, + int batch, int height, int width) { + int zero_width = width / 4; + int vec_height = height * 2; + const int blockwidth2 = BLOCK_KN_SIZE / 2; + int b = blockIdx.y * BLOCK_M_SIZE_MAX; + int b_end = min(BLOCK_M_SIZE_MAX, batch - b); + int h = BLOCK_KN_SIZE * blockIdx.z / 4; + int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; + int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + + __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; + if (threadIdx.x < h_end) { + for (int m = 0; m < b_end; ++m) { + blockvec[m][threadIdx.x] = + vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + + threadIdx.x]; } - - - if (blockIdx.z == 0) - { - for (int m = 0; m < b_end; m++) - mul[(b + m) * width + w] = __int2half_rn(0); + } + + if (blockIdx.z == 0) { + for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0); + } + __syncthreads(); + + int i = width * h + w; + int g_h = h * 4; + int k = 0; + int z_w = w / 4; + int z_mod = (w % 4) * 8; + half2 res2; + half res[BLOCK_M_SIZE_MAX] = {}; + + unsigned int tmp; + while (k < h_end) { + tmp = mat[i]; + half2 scales_tmp[2]; + half2 zeros_tmp[2]; + for (int tmp_k = 0; tmp_k < 2; tmp_k++) { + int g = g_idx[g_h + (k + tmp_k) * 2]; + int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; + half scale_f = scales[g * width + w]; + half scale_f2 = scales[g2 * width + w]; + half2 scale = __halves2half2(scale_f, scale_f2); + half2 zero = __halves2half2( + __hmul(scale_f, + __int2half_rn( + -((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)), + __hmul(scale_f2, + __int2half_rn( + -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1))); + scales_tmp[tmp_k] = scale; + zeros_tmp[tmp_k] = zero; } - __syncthreads(); - - int i = width * h + w; - int g_h = h * 4; - int k = 0; - int z_w = w / 4; - int z_mod = (w % 4) * 8; - half2 res2; - half res[BLOCK_M_SIZE_MAX] = {}; - - unsigned int tmp; - while (k < h_end) { - tmp = mat[i]; - half2 scales_tmp[2]; - half2 zeros_tmp[2]; - for (int tmp_k = 0; tmp_k < 2; tmp_k++) { - int g = g_idx[g_h + (k + tmp_k) * 2]; - int g2 = g_idx[g_h + (k + 
tmp_k) * 2 + 1]; - half scale_f = scales[g * width + w]; - half scale_f2 = scales[g2 * width + w]; - half2 scale = __halves2half2(scale_f, scale_f2); - half2 zero = __halves2half2( - __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)), - __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)) - ); - scales_tmp[tmp_k] = scale; - zeros_tmp[tmp_k] = zero; - } - for (int m = 0; m < b_end; m++) { + for (int m = 0; m < b_end; m++) { #ifndef USE_ROCM - res2 = {}; + res2 = {}; #else - res2.x = __half_as_ushort(__float2half(0)); - res2.y = __half_as_ushort(__float2half(0)); + res2.x = __half_as_ushort(__float2half(0)); + res2.y = __half_as_ushort(__float2half(0)); #endif - half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF), __int2half_rn((tmp >> 8) & 0xFF)); - res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2); - half2 v34 = __halves2half2(__int2half_rn((tmp >> 16) & 0xFF), __int2half_rn((tmp >> 24) & 0xFF)); - res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2); + half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF), + __int2half_rn((tmp >> 8) & 0xFF)); + res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]), + blockvec[m][k + 0], res2); + half2 v34 = __halves2half2(__int2half_rn((tmp >> 16) & 0xFF), + __int2half_rn((tmp >> 24) & 0xFF)); + res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]), + blockvec[m][k + 1], res2); #ifndef USE_ROCM - res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); + res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); #else - res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); + res[m] = __hadd( + res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); #endif - } - i += width; - k += 2; - } - for (int m = 0; m < b_end; m++) { - atomicAdd(&mul[(b + m) * width + w], res[m]); } + i += width; + k += 2; + } + for (int m = 0; m < b_end; m++) { + atomicAdd(&mul[(b + m) * width + w], res[m]); + } } -void gemm_half_q_half_alt -( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* c, - int size_m, - int size_n, - int size_k, - int bit -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE); - gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - auto kernel = gemm_half_q_half_alt_4bit_kernel; - if (bit == 8) { - kernel = gemm_half_q_half_alt_8bit_kernel; - } - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - (const half2*) a, - b_q_weight, - c, - b_gptq_scales, - b_gptq_qzeros, - b_g_idx, - size_m, - size_k / 32 * bit, - size_n - ); +void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, const int* b_g_idx, + half* c, int size_m, int size_n, int size_k, + int bit) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE); + gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX); + gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + + auto kernel = gemm_half_q_half_alt_4bit_kernel; + if (bit == 8) { + kernel = gemm_half_q_half_alt_8bit_kernel; + } + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>>( + (const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx, + size_m, size_k / 32 * bit, 
size_n); } -template -__global__ void reconstruct_gptq_kernel -( - const uint32_t* __restrict__ w, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int* __restrict__ g_idx, - const int height, - const int width, - const int group, - half* __restrict__ out -) -{ - // Start of block - - int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 32 / bit; - if (column >= width) return; - - // Views - - MatrixView_half_rw out_(out, height, width); - MatrixView_half w_scales_(w_scales, group, width); - T w_zeros_(w_zeros, group, width); - - uint32_t w_read = w[blockIdx.y * width + column]; - half* out_ptr = out_.item_ptr(row, column); - - #pragma unroll - for (int s = 0; s < 32; s += bit) - { - int group = g_idx[row + s / bit]; - half w_scale = w_scales_.item(group, column); - uint32_t w_zero = w_zeros_.item(group, column) + 1; - half w_item = __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero), w_scale); - *out_ptr = w_item; out_ptr += out_.width; - } +template +__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w, + const half* __restrict__ w_scales, + const uint32_t* __restrict__ w_zeros, + const int* __restrict__ g_idx, + const int height, const int width, + const int group, + half* __restrict__ out) { + // Start of block + + int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + int row = blockIdx.y * 32 / bit; + if (column >= width) return; + + // Views + + MatrixView_half_rw out_(out, height, width); + MatrixView_half w_scales_(w_scales, group, width); + T w_zeros_(w_zeros, group, width); + + uint32_t w_read = w[blockIdx.y * width + column]; + half* out_ptr = out_.item_ptr(row, column); + +#pragma unroll + for (int s = 0; s < 32; s += bit) { + int group = g_idx[row + s / bit]; + half w_scale = w_scales_.item(group, column); + uint32_t w_zero = w_zeros_.item(group, column) + 1; + half w_item = + __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero), + w_scale); + *out_ptr = w_item; + out_ptr += out_.width; + } } -__global__ void reconstruct_gptq_3bit_kernel -( - const uint32_t* __restrict__ w, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int* __restrict__ g_idx, - const int height, - const int width, - const int group, - half* __restrict__ out -) -{ - // Start of block - int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 32; - if (column >= width) return; - - // Views - - MatrixView_half_rw out_(out, height, width); - MatrixView_half w_scales_(w_scales, group, width); - MatrixView_q3_row w_zeros_(w_zeros, group, width); - - uint32_t w1 = w[(blockIdx.y * 3) * width + column]; - uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column]; - uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column]; - half* out_ptr = out_.item_ptr(row, column); - - #pragma unroll - for (int i = 0; i < 32; i += 1) - { - int group = g_idx[row + i]; - half w_scale = w_scales_.item(group, column); - uint32_t w_zero = w_zeros_.item(group, column) + 1; - int w_item; - if (i == 10) { - w_item = (w1 >> 30) | ((w2 << 2) & 0x4); - } else if (i == 21) { - w_item = (w2 >> 31) | ((w3 << 1) & 0x6); - } else if (i < 10) { - w_item = ((w1 >> (i * 3)) & 0x7); - } else if (i < 21) { - w_item = ((w2 >> (i * 3 - 32)) & 0x7); - } else { - w_item = ((w3 >> (i * 3 - 64)) & 0x7); - } - *out_ptr = __hmul(__int2half_rn(w_item - w_zero), w_scale); - out_ptr += out_.width; +__global__ void reconstruct_gptq_3bit_kernel( + const uint32_t* __restrict__ w, const half* 
__restrict__ w_scales, + const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx, + const int height, const int width, const int group, + half* __restrict__ out) { + // Start of block + int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + int row = blockIdx.y * 32; + if (column >= width) return; + + // Views + + MatrixView_half_rw out_(out, height, width); + MatrixView_half w_scales_(w_scales, group, width); + MatrixView_q3_row w_zeros_(w_zeros, group, width); + + uint32_t w1 = w[(blockIdx.y * 3) * width + column]; + uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column]; + uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column]; + half* out_ptr = out_.item_ptr(row, column); + +#pragma unroll + for (int i = 0; i < 32; i += 1) { + int group = g_idx[row + i]; + half w_scale = w_scales_.item(group, column); + uint32_t w_zero = w_zeros_.item(group, column) + 1; + int w_item; + if (i == 10) { + w_item = (w1 >> 30) | ((w2 << 2) & 0x4); + } else if (i == 21) { + w_item = (w2 >> 31) | ((w3 << 1) & 0x6); + } else if (i < 10) { + w_item = ((w1 >> (i * 3)) & 0x7); + } else if (i < 21) { + w_item = ((w2 >> (i * 3 - 32)) & 0x7); + } else { + w_item = ((w3 >> (i * 3 - 64)) & 0x7); } + *out_ptr = __hmul(__int2half_rn(w_item - w_zero), w_scale); + out_ptr += out_.width; + } } -void reconstruct_gptq -( - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* out, - int height, - int width, - int groups, - int bit -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, 32 / bit); - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); - - auto kernel = reconstruct_gptq_kernel; - if (bit == 2) { - kernel = reconstruct_gptq_kernel; - } else if (bit == 8) { - kernel = reconstruct_gptq_kernel; - } else if (bit == 3) { - kernel = reconstruct_gptq_3bit_kernel; - gridDim.y = DIVIDE(height, 32); - } - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - b_q_weight, - b_gptq_scales, - b_gptq_qzeros, - b_g_idx, - height, - width, - groups, - out - ); +void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, const int* b_g_idx, half* out, + int height, int width, int groups, int bit) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + gridDim.y = DIVIDE(height, 32 / bit); + gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + + auto kernel = reconstruct_gptq_kernel; + if (bit == 2) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 8) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 3) { + kernel = reconstruct_gptq_3bit_kernel; + gridDim.y = DIVIDE(height, 32); + } + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>>(b_q_weight, b_gptq_scales, + b_gptq_qzeros, b_g_idx, height, + width, groups, out); } - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* c, - half* temp_dq, - int size_m, - int size_n, - int size_k, - int groups, - bool use_exllama, - int bit -) -{ - bool use_reconstruct; +void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a, + const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, const int* b_g_idx, + half* c, half* temp_dq, int size_m, int size_n, + int size_k, int groups, bool use_exllama, int bit) { + bool use_reconstruct; + if 
(use_exllama) { + use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) || + (bit != 8 && size_m > MAX_Q_GEMM_ROWS)); + } else { + // The 2/3-bit kernels are somehow slower than dequant + gemm baseline, so + // we disabled them for now. + use_reconstruct = (bit < 4 || size_m > MAX_ALT_GEMM_ROWS); + } + if (use_reconstruct) { + // Reconstruct FP16 matrix, then cuBLAS if (use_exllama) { - use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) || (bit != 8 && size_m > MAX_Q_GEMM_ROWS)); + reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, + temp_dq, size_k, size_n, groups, bit); } else { - // The 2/3-bit kernels are somehow slower than dequant + gemm baseline, so we disabled them for now. - use_reconstruct = (bit < 4 || size_m > MAX_ALT_GEMM_ROWS); + reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, + temp_dq, size_k, size_n, groups, bit); } - if (use_reconstruct) { - // Reconstruct FP16 matrix, then cuBLAS - if (use_exllama) { - reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, - size_k, size_n, groups, bit); - } - else - { - reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - temp_dq, size_k, size_n, groups, bit); - } - const half alpha = __float2half(1.0f); - const half beta = __float2half(0.0f); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha, temp_dq, size_n, - a, size_k, - &beta, c, size_n); + const half alpha = __float2half(1.0f); + const half beta = __float2half(0.0f); + cublasHgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, size_n, size_m, size_k, + &alpha, temp_dq, size_n, a, size_k, &beta, c, size_n); + } else if (use_exllama) { + // Quantized matmul + int max_chunks = size_m / BLOCK_M_SIZE_MAX; + int last_chunk = max_chunks * BLOCK_M_SIZE_MAX; + int last_chunk_size = size_m - last_chunk; + + if (max_chunks) { + gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, + b_g_idx, c, last_chunk, size_n, size_k, + BLOCK_M_SIZE_MAX, groups, bit); } - else if (use_exllama) - { - // Quantized matmul - int max_chunks = size_m / BLOCK_M_SIZE_MAX; - int last_chunk = max_chunks * BLOCK_M_SIZE_MAX; - int last_chunk_size = size_m - last_chunk; - - if (max_chunks) - { - gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, - groups, bit); - } - if (last_chunk_size) - { - gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, - b_gptq_scales, b_g_idx, c + last_chunk * size_n, - last_chunk_size, size_n, size_k, last_chunk_size, - groups, bit); - } - } - else - { - gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, size_m, size_n, size_k, bit); + if (last_chunk_size) { + gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight, + b_gptq_qzeros, b_gptq_scales, b_g_idx, + c + last_chunk * size_n, last_chunk_size, + size_n, size_k, last_chunk_size, groups, bit); } + } else { + gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, + c, size_m, size_n, size_k, bit); + } } -__global__ void shuffle_4bit_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < size_k) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } +__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, + const int size_k, 
const int size_n) { + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { + shuffle_4bit_8(b_ptr, size_n); + b_ptr += 1 * size_n; + k += 8; + } } -__global__ void shuffle_8bit_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < size_k) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k += 4; } +__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, + const int size_k, const int size_n) { + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { + shuffle_8bit_4(b_ptr, size_n); + b_ptr += 1 * size_n; + k += 4; + } } -__global__ void shuffle_2bit_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < size_k) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; } +__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, + const int size_k, const int size_n) { + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { + shuffle_2bit_16(b_ptr, size_n); + b_ptr += 1 * size_n; + k += 16; + } } -__global__ void shuffle_3bit_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < size_k) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; } +__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight, + const int size_k, const int size_n) { + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { + shuffle_3bit_32(b_ptr, size_n); + b_ptr += 3 * size_n; + k += 32; + } } -__global__ void make_sequential_4bit_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const int* __restrict__ q_perm, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; - int q_perm_idx = w_new2_row << 3; - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 3; - int w2_subrow = source_row & 0x07; - int w2_row_shift = w2_subrow << 2; - int wnew2_row_shift = i << 2; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000f0000000f; - src <<= wnew2_row_shift; - dst |= src; - } - w_new2[w_new2_row * w2_stride + w2_column] = dst; +__global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width) { + const uint64_t* w2 = (uint64_t*)w; + uint64_t* w_new2 = (uint64_t*)w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx 
= w_new2_row << 3; + uint64_t dst = 0; + +#pragma unroll + for (int i = 0; i < 8; i++) { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 3; + int w2_subrow = source_row & 0x07; + int w2_row_shift = w2_subrow << 2; + int wnew2_row_shift = i << 2; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x0000000f0000000f; + src <<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; } -__global__ void make_sequential_2bit_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const int* __restrict__ q_perm, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; - int q_perm_idx = w_new2_row << 4; - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 16; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 4; - int w2_subrow = source_row & 0x0f; - int w2_row_shift = w2_subrow << 1; - int wnew2_row_shift = i << 1; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000300000003; - src <<= wnew2_row_shift; - dst |= src; - } - w_new2[w_new2_row * w2_stride + w2_column] = dst; +__global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width) { + const uint64_t* w2 = (uint64_t*)w; + uint64_t* w_new2 = (uint64_t*)w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 4; + uint64_t dst = 0; + +#pragma unroll + for (int i = 0; i < 16; i++) { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 4; + int w2_subrow = source_row & 0x0f; + int w2_row_shift = w2_subrow << 1; + int wnew2_row_shift = i << 1; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x0000000300000003; + src <<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; } -__global__ void make_sequential_3bit_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const int* __restrict__ q_perm, - const int w_width -) -{ - int w_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w_column >= w_width) return; - int w_new_row = blockIdx.y * 3; - int q_perm_idx = blockIdx.y << 5; - uint32_t dst[3] = {0, 0, 0}; - - #pragma unroll - for (int i = 0; i < 32; i++) - { - int source_row = q_perm[q_perm_idx++]; - int z_w = (source_row / 32) * 3; - int z_mod = source_row % 32; - int z_bit; - - if (z_mod != 10){ - if (z_mod != 21){ - z_bit = z_mod; - if (z_bit > 21){ - z_bit *= 3; - z_bit -= 64; - z_w += 2; - } else if (z_bit > 10){ - z_bit *= 3; - z_bit -= 32; - z_w += 1; - } else { - z_bit *= 3; - } - } else { - z_w += 1; - } - } - - uint64_t src; - if (z_mod == 10) { - src = (w[z_w * w_width + w_column] >> 30) | ((w[(z_w + 1) * w_width + w_column] << 2) & 0x4); - } else if (z_mod == 21){ - src = (w[z_w * w_width + w_column] >> 31) | ((w[(z_w + 1) * w_width + w_column] << 1) & 0x6); +__global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width) { + int w_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w_column 
>= w_width) return; + int w_new_row = blockIdx.y * 3; + int q_perm_idx = blockIdx.y << 5; + uint32_t dst[3] = {0, 0, 0}; + +#pragma unroll + for (int i = 0; i < 32; i++) { + int source_row = q_perm[q_perm_idx++]; + int z_w = (source_row / 32) * 3; + int z_mod = source_row % 32; + int z_bit; + + if (z_mod != 10) { + if (z_mod != 21) { + z_bit = z_mod; + if (z_bit > 21) { + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10) { + z_bit *= 3; + z_bit -= 32; + z_w += 1; } else { - src = w[z_w * w_width + w_column]; - src >>= z_bit; - src &= 0x07; + z_bit *= 3; } + } else { + z_w += 1; + } + } - z_w = 0; - if (i != 10){ - if (i != 21){ - z_bit = i; - if (z_bit > 21){ - z_bit *= 3; - z_bit -= 64; - z_w += 2; - } else if (z_bit > 10){ - z_bit *= 3; - z_bit -= 32; - z_w += 1; - } else { - z_bit *= 3; - } - } else { - z_w += 1; - } - } - if (i == 10) { - dst[z_w] |= (src & 0x03) << 30; - dst[z_w + 1] |= ((src & 0x4) >> 2); - } else if (i == 21) { - dst[z_w] |= (src & 0x01) << 31; - dst[z_w + 1] |= ((src & 0x6) >> 1); + uint64_t src; + if (z_mod == 10) { + src = (w[z_w * w_width + w_column] >> 30) | + ((w[(z_w + 1) * w_width + w_column] << 2) & 0x4); + } else if (z_mod == 21) { + src = (w[z_w * w_width + w_column] >> 31) | + ((w[(z_w + 1) * w_width + w_column] << 1) & 0x6); + } else { + src = w[z_w * w_width + w_column]; + src >>= z_bit; + src &= 0x07; + } + + z_w = 0; + if (i != 10) { + if (i != 21) { + z_bit = i; + if (z_bit > 21) { + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10) { + z_bit *= 3; + z_bit -= 32; + z_w += 1; } else { - dst[z_w] |= (src << z_bit); + z_bit *= 3; } + } else { + z_w += 1; + } + } + if (i == 10) { + dst[z_w] |= (src & 0x03) << 30; + dst[z_w + 1] |= ((src & 0x4) >> 2); + } else if (i == 21) { + dst[z_w] |= (src & 0x01) << 31; + dst[z_w + 1] |= ((src & 0x6) >> 1); + } else { + dst[z_w] |= (src << z_bit); } - w_new[w_new_row * w_width + w_column] = dst[0]; - w_new[(w_new_row + 1) * w_width + w_column] = dst[1]; - w_new[(w_new_row + 2) * w_width + w_column] = dst[2]; + } + w_new[w_new_row * w_width + w_column] = dst[0]; + w_new[(w_new_row + 1) * w_width + w_column] = dst[1]; + w_new[(w_new_row + 2) * w_width + w_column] = dst[2]; } -__global__ void make_sequential_8bit_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const int* __restrict__ q_perm, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; - int q_perm_idx = w_new2_row << 2; - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 4; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 2; - int w2_subrow = source_row & 0x03; - int w2_row_shift = w2_subrow << 3; - int wnew2_row_shift = i << 3; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x000000ff000000ff; - src <<= wnew2_row_shift; - dst |= src; - } - w_new2[w_new2_row * w2_stride + w2_column] = dst; +__global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width) { + const uint64_t* w2 = (uint64_t*)w; + uint64_t* w_new2 = (uint64_t*)w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 2; + 
uint64_t dst = 0; + +#pragma unroll + for (int i = 0; i < 4; i++) { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 2; + int w2_subrow = source_row & 0x03; + int w2_row_shift = w2_subrow << 3; + int wnew2_row_shift = i << 3; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x000000ff000000ff; + src <<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; } +void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height, + int width, int bit) { + if (q_perm) { + uint32_t* new_qweight = NULL; + cudaMalloc(&new_qweight, height / 32 * bit * width * sizeof(uint32_t)); -void shuffle_exllama_weight -( - uint32_t* q_weight, - int* q_perm, - int height, - int width, - int bit -) -{ - if (q_perm) - { - uint32_t* new_qweight = NULL; - cudaMalloc(&new_qweight, height / 32 * bit * width * sizeof(uint32_t)); - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 32 * bit; - - auto kernel = make_sequential_4bit_kernel; - if (bit == 2) { - kernel = make_sequential_2bit_kernel; - } else if (bit == 3) { - kernel = make_sequential_3bit_kernel; - gridDim.y = height / 32; - } else if (bit == 8) { - kernel = make_sequential_8bit_kernel; - } - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - q_weight, - new_qweight, - q_perm, - width - ); - // Replace qweights - cudaMemcpyAsync(q_weight, new_qweight, height / 32 * bit * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); - // Cleanup - cudaDeviceSynchronize(); - cudaFree(new_qweight); - } dim3 blockDim, gridDim; blockDim.x = THREADS_X; blockDim.y = 1; gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = 1; - auto shuffle_kernel = shuffle_4bit_kernel; + gridDim.y = height / 32 * bit; + + auto kernel = make_sequential_4bit_kernel; if (bit == 2) { - shuffle_kernel = shuffle_2bit_kernel; + kernel = make_sequential_2bit_kernel; } else if (bit == 3) { - shuffle_kernel = shuffle_3bit_kernel; + kernel = make_sequential_3bit_kernel; + gridDim.y = height / 32; } else if (bit == 8) { - shuffle_kernel = shuffle_8bit_kernel; + kernel = make_sequential_8bit_kernel; } const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - shuffle_kernel<<>>(q_weight, height, width); + kernel<<>>(q_weight, new_qweight, q_perm, + width); + // Replace qweights + cudaMemcpyAsync(q_weight, new_qweight, + height / 32 * bit * width * sizeof(uint32_t), + cudaMemcpyDeviceToDevice); + // Cleanup + cudaDeviceSynchronize(); + cudaFree(new_qweight); + } + dim3 blockDim, gridDim; + blockDim.x = THREADS_X; + blockDim.y = 1; + gridDim.x = DIVIDE(width, THREADS_X); + gridDim.y = 1; + auto shuffle_kernel = shuffle_4bit_kernel; + if (bit == 2) { + shuffle_kernel = shuffle_2bit_kernel; + } else if (bit == 3) { + shuffle_kernel = shuffle_3bit_kernel; + } else if (bit == 8) { + shuffle_kernel = shuffle_8bit_kernel; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + shuffle_kernel<<>>(q_weight, height, width); } } // namespace gptq } // namespace vllm -torch::Tensor gptq_gemm -( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama, - int bit -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); - at::Tensor 
temp_dq = torch::empty({b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options); - - vllm::gptq::gemm_half_q_half_cuda - ( - at::cuda::getCurrentCUDABlasHandle(), - (const half*) a.data_ptr(), - (const uint32_t*) b_q_weight.data_ptr(), - (const uint32_t*)b_gptq_qzeros.data_ptr(), - (const half*) b_gptq_scales.data_ptr(), - b_g_idx.device().is_meta() ? NULL : (const int*) b_g_idx.data_ptr(), - (half*) c.data_ptr(), - (half*) temp_dq.data_ptr(), - c.size(0), // m - c.size(1), // n - a.size(1), // k - b_gptq_qzeros.size(0), // group number - use_exllama, - bit - ); - return c; +torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, + torch::Tensor b_gptq_qzeros, + torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, + bool use_exllama, int bit) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); + at::Tensor temp_dq = torch::empty( + {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options); + + vllm::gptq::gemm_half_q_half_cuda( + at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(), + (const uint32_t*)b_q_weight.data_ptr(), + (const uint32_t*)b_gptq_qzeros.data_ptr(), + (const half*)b_gptq_scales.data_ptr(), + b_g_idx.device().is_meta() ? NULL : (const int*)b_g_idx.data_ptr(), + (half*)c.data_ptr(), (half*)temp_dq.data_ptr(), + c.size(0), // m + c.size(1), // n + a.size(1), // k + b_gptq_qzeros.size(0), // group number + use_exllama, bit); + return c; } -void gptq_shuffle -( - torch::Tensor q_weight, - torch::Tensor q_perm, - int bit -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); - vllm::gptq::shuffle_exllama_weight( - (uint32_t*) q_weight.data_ptr(), - q_perm.device().is_meta() || q_perm.numel() == 0 ? NULL : (int*) q_perm.data_ptr(), - q_weight.size(0) * 32 / bit, - q_weight.size(1), - bit - ); +void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); + vllm::gptq::shuffle_exllama_weight( + (uint32_t*)q_weight.data_ptr(), + q_perm.device().is_meta() || q_perm.numel() == 0 + ? 
NULL + : (int*)q_perm.data_ptr(), + q_weight.size(0) * 32 / bit, q_weight.size(1), bit); } diff --git a/csrc/quantization/gptq/qdq_2.cuh b/csrc/quantization/gptq/qdq_2.cuh index 295872a91de37..ca0f810608d1b 100644 --- a/csrc/quantization/gptq/qdq_2.cuh +++ b/csrc/quantization/gptq/qdq_2.cuh @@ -14,71 +14,60 @@ namespace gptq { // // ffddbb99 77553311 eeccaa88 66442200 -__forceinline__ __device__ void shuffle_2bit_16 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; +__forceinline__ __device__ void shuffle_2bit_16(uint32_t* q, int stride) { + uint32_t qa = q[0]; + uint32_t qb = 0; - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint32_t qa0 = qa & 0x03; - uint32_t qa1 = (qa & 0x0c) >> 2; - qa >>= 4; - qb |= (qa1 << (i * 2 + 16)); - qb |= (qa0 << (i * 2)); - } - q[0] = qb; +#pragma unroll + for (int i = 0; i < 8; i++) { + uint32_t qa0 = qa & 0x03; + uint32_t qa1 = (qa & 0x0c) >> 2; + qa >>= 4; + qb |= (qa1 << (i * 2 + 16)); + qb |= (qa0 << (i * 2)); + } + q[0] = qb; } -__forceinline__ __device__ void dequant_2bit_16 -( - const uint32_t q_0, - half2 (&dq)[8], - int stride, - const uint32_t zero -) -{ - const uint32_t c0 = 0x64006400; - const half y4_ = __float2half_rn(1.0f / 4.0f); - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half y64_ = __float2half_rn(1.0f / 64.0f); - const half2 y4 = __halves2half2(y4_, y4_); - const half2 y16 = __halves2half2(y16_, y16_); - const half2 y64 = __halves2half2(y64_, y64_); +__forceinline__ __device__ void dequant_2bit_16(const uint32_t q_0, + half2 (&dq)[8], int stride, + const uint32_t zero) { + const uint32_t c0 = 0x64006400; + const half y4_ = __float2half_rn(1.0f / 4.0f); + const half y16_ = __float2half_rn(1.0f / 16.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y4 = __halves2half2(y4_, y4_); + const half2 y16 = __halves2half2(y16_, y16_); + const half2 y64 = __halves2half2(y64_, y64_); - const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); - const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero)); - const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); - const half2 z1 = __half2half2(z1_.as_half); - const half2 z4 = __half2half2(z4_); - const half2 z16 = __half2half2(z16_); - const half2 z64 = __half2half2(z64_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero)); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z4 = __half2half2(z4_); + const half2 z16 = __half2half2(z16_); + const half2 z64 = __half2half2(z64_); - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 - half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 - half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 - qa >>= 8; - half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 - half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 - half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 - half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 + uint32_t qa = q_0; + half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 
q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 + half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 + half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 + qa >>= 8; + half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 + half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 + half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 + half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y4, z4); - dq[2] = __hfma2(q2.as_half2, y16, z16); - dq[3] = __hfma2(q3.as_half2, y64, z64); - dq[4] = __hadd2(q4.as_half2, z1); - dq[5] = __hfma2(q5.as_half2, y4, z4); - dq[6] = __hfma2(q6.as_half2, y16, z16); - dq[7] = __hfma2(q7.as_half2, y64, z64); + dq[0] = __hadd2(q0.as_half2, z1); + dq[1] = __hfma2(q1.as_half2, y4, z4); + dq[2] = __hfma2(q2.as_half2, y16, z16); + dq[3] = __hfma2(q3.as_half2, y64, z64); + dq[4] = __hadd2(q4.as_half2, z1); + dq[5] = __hfma2(q5.as_half2, y4, z4); + dq[6] = __hfma2(q6.as_half2, y16, z16); + dq[7] = __hfma2(q7.as_half2, y64, z64); } } // namespace gptq diff --git a/csrc/quantization/gptq/qdq_3.cuh b/csrc/quantization/gptq/qdq_3.cuh index 3e7ecde752ba3..0d5c2adf5dbbe 100644 --- a/csrc/quantization/gptq/qdq_3.cuh +++ b/csrc/quantization/gptq/qdq_3.cuh @@ -11,128 +11,136 @@ namespace gptq { // vjjjhhhf ffdddbbb uiiiggge eecccaaa // vtttrrrp ppnnnlll usssqqqo oommmkkk -__forceinline__ __device__ void shuffle_3bit_32 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0 * stride]; - uint32_t qb = q[1 * stride]; - uint32_t qc = q[2 * stride]; - - // qa: aa999888 77766655 54443332 22111000 - // qb: lkkkjjji iihhhggg fffeeedd dcccbbba - // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll - - uint32_t qd = qc >> 26; - qc <<= 4; - qc |= qb >> 28; - qb <<= 2; - qb |= qa >> 30; - - // qa: ..999888 77766655 54443332 22111000 - // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa - // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk - // qd: vvvuuu - - uint32_t za = 0; - uint32_t zb = 0; - uint32_t zc = 0; - - for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); } - - // za: 9997775 55333111 8886664 44222000 - // zb: jjjhhhf ffdddbbb iiiggge eecccaaa - // zc: tttrrrp ppnnnlll sssqqqo oommmkkk - // qd: vvvuuu - - za |= ((qd & 0x01) >> 0) << 15; - zb |= ((qd & 0x02) >> 1) << 15; - zc |= ((qd & 0x04) >> 2) << 15; - za |= ((qd & 0x08) >> 3) << 31; - zb |= ((qd & 0x10) >> 4) << 31; - zc |= ((qd & 0x20) >> 5) << 31; - - // za: v9997775 55333111 u8886664 44222000 (u, v lsb) - // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa - // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk - - q[0 * stride] = za; - q[1 * stride] = zb; - q[2 * stride] = zc; -} - -__forceinline__ __device__ void dequant_3bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[16], - int stride, - const uint32_t zero -) -{ - const uint32_t c0 = 0x64006400; - const half y8_ = __float2half_rn(1.0f / 8.0f); - const half y64_ = __float2half_rn(1.0f / 64.0f); - const half2 y8 = __halves2half2(y8_, y8_); - const half2 y64 = 
__halves2half2(y64_, y64_); - const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); - const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero)); - const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); - const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half); - const half2 z8 = __halves2half2(z8_, z8_); - const half2 z64 = __halves2half2(z64_, z64_); - - uint32_t qa = q_0; - uint32_t qb = q_1; - uint32_t qc = q_2; - - half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 +__forceinline__ __device__ void shuffle_3bit_32(uint32_t* q, int stride) { + uint32_t qa = q[0 * stride]; + uint32_t qb = q[1 * stride]; + uint32_t qc = q[2 * stride]; + + // qa: aa999888 77766655 54443332 22111000 + // qb: lkkkjjji iihhhggg fffeeedd dcccbbba + // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll + + uint32_t qd = qc >> 26; + qc <<= 4; + qc |= qb >> 28; + qb <<= 2; + qb |= qa >> 30; + + // qa: ..999888 77766655 54443332 22111000 + // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa + // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk + // qd: vvvuuu + + uint32_t za = 0; + uint32_t zb = 0; + uint32_t zc = 0; + + for (int i = 0; i < 5; i++) { + uint32_t t0 = qa & 0x07; + uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; - half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 - half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 - qa >>= 9; - qa &= 0x00010001; - half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 - half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 + za |= (t0 << (i * 3)); + za |= (t1 << (i * 3 + 16)); + } + for (int i = 0; i < 5; i++) { + uint32_t t0 = qb & 0x07; + uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; - half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 - half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 - half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 - qb >>= 8; - qb &= 0x00020002; - half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 - half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 + zb |= (t0 << (i * 3)); + zb |= (t1 << (i * 3 + 16)); + } + for (int i = 0; i < 5; i++) { + uint32_t t0 = qc & 0x07; + uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; - half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 - half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 - half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 - qc >>= 7; - qc &= 0x00040004; - half2_uint32 q15((qa | qb | qc) | c0); - - dq[ 0] = __hadd2( q0.as_half2, z1); - dq[ 1] = __hfma2( q1.as_half2, y8, z8); - dq[ 2] = __hadd2( q2.as_half2, z1); - dq[ 3] = __hfma2( q3.as_half2, y8, z8); - dq[ 4] = __hfma2( q4.as_half2, y64, z64); - dq[ 5] = __hadd2( q5.as_half2, z1); - dq[ 6] = __hfma2( q6.as_half2, y8, z8); - dq[ 7] = __hadd2( q7.as_half2, z1); - dq[ 8] = __hfma2( q8.as_half2, y8, z8); - dq[ 9] = __hfma2( q9.as_half2, y64, z64); - dq[10] = __hadd2(q10.as_half2, z1); - dq[11] = __hfma2(q11.as_half2, y8, z8); - dq[12] = __hadd2(q12.as_half2, z1); - dq[13] = __hfma2(q13.as_half2, y8, z8); - dq[14] = __hfma2(q14.as_half2, y64, z64); - dq[15] = __hadd2(q15.as_half2, z1); + zc |= (t0 << (i * 3)); + zc |= (t1 << (i * 3 + 16)); + } + + // za: 9997775 55333111 8886664 44222000 + // zb: jjjhhhf ffdddbbb iiiggge 
eecccaaa + // zc: tttrrrp ppnnnlll sssqqqo oommmkkk + // qd: vvvuuu + + za |= ((qd & 0x01) >> 0) << 15; + zb |= ((qd & 0x02) >> 1) << 15; + zc |= ((qd & 0x04) >> 2) << 15; + za |= ((qd & 0x08) >> 3) << 31; + zb |= ((qd & 0x10) >> 4) << 31; + zc |= ((qd & 0x20) >> 5) << 31; + + // za: v9997775 55333111 u8886664 44222000 (u, v lsb) + // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa + // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk + + q[0 * stride] = za; + q[1 * stride] = zb; + q[2 * stride] = zc; +} + +__forceinline__ __device__ void dequant_3bit_32(const uint32_t q_0, + const uint32_t q_1, + const uint32_t q_2, + half2 (&dq)[16], int stride, + const uint32_t zero) { + const uint32_t c0 = 0x64006400; + const half y8_ = __float2half_rn(1.0f / 8.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y8 = __halves2half2(y8_, y8_); + const half2 y64 = __halves2half2(y64_, y64_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half); + const half2 z8 = __halves2half2(z8_, z8_); + const half2 z64 = __halves2half2(z64_, z64_); + + uint32_t qa = q_0; + uint32_t qb = q_1; + uint32_t qc = q_2; + + half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 + qa >>= 6; + half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 + half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 + half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 + qa >>= 9; + qa &= 0x00010001; + half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 + half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 + qb >>= 6; + half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 + half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 + half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 + qb >>= 8; + qb &= 0x00020002; + half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 + half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 + qc >>= 6; + half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 + half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 + half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 + qc >>= 7; + qc &= 0x00040004; + half2_uint32 q15((qa | qb | qc) | c0); + + dq[0] = __hadd2(q0.as_half2, z1); + dq[1] = __hfma2(q1.as_half2, y8, z8); + dq[2] = __hadd2(q2.as_half2, z1); + dq[3] = __hfma2(q3.as_half2, y8, z8); + dq[4] = __hfma2(q4.as_half2, y64, z64); + dq[5] = __hadd2(q5.as_half2, z1); + dq[6] = __hfma2(q6.as_half2, y8, z8); + dq[7] = __hadd2(q7.as_half2, z1); + dq[8] = __hfma2(q8.as_half2, y8, z8); + dq[9] = __hfma2(q9.as_half2, y64, z64); + dq[10] = __hadd2(q10.as_half2, z1); + dq[11] = __hfma2(q11.as_half2, y8, z8); + dq[12] = __hadd2(q12.as_half2, z1); + dq[13] = __hfma2(q13.as_half2, y8, z8); + dq[14] = __hfma2(q14.as_half2, y64, z64); + dq[15] = __hadd2(q15.as_half2, z1); } } // namespace gptq diff --git a/csrc/quantization/gptq/qdq_4.cuh b/csrc/quantization/gptq/qdq_4.cuh index 881f353f6564d..7f65d2d2819b1 100644 --- a/csrc/quantization/gptq/qdq_4.cuh +++ b/csrc/quantization/gptq/qdq_4.cuh @@ -13,133 +13,112 @@ namespace gptq { // // 77775555 33331111 66664444 
22220000 -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 4; i++) - { - uint32_t qa0 = qa & 0x0f; - uint32_t qa1 = (qa & 0xf0) >> 4; - qa >>= 8; - qb |= (qa1 << (i * 4 + 16)); - qb |= (qa0 << (i * 4)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride, - const uint32_t zero -) -{ - const uint32_t c0 = 0x64006400; - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half2 y16 = __halves2half2(y16_, y16_); - const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); - const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - const half2 z1 = __half2half2(z1_.as_half); - const half2 z16 = __half2half2(z16_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024 +__forceinline__ __device__ void shuffle_4bit_8(uint32_t* q, int stride) { + uint32_t qa = q[0]; + uint32_t qb = 0; + +#pragma unroll + for (int i = 0; i < 4; i++) { + uint32_t qa0 = qa & 0x0f; + uint32_t qa1 = (qa & 0xf0) >> 4; qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024 + qb |= (qa1 << (i * 4 + 16)); + qb |= (qa0 << (i * 4)); + } + q[0] = qb; +} - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y16, z16); - dq[2] = __hadd2(q2.as_half2, z1); - dq[3] = __hfma2(q3.as_half2, y16, z16); +__forceinline__ __device__ void dequant_4bit_8(const uint32_t q_0, + half2 (&dq)[4], int stride, + const uint32_t zero) { + const uint32_t c0 = 0x64006400; + const half y16_ = __float2half_rn(1.0f / 16.0f); + const half2 y16 = __halves2half2(y16_, y16_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z16 = __half2half2(z16_); + + uint32_t qa = q_0; + half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024 + qa >>= 8; + half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5]) + 1024 + half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024 + + dq[0] = __hadd2(q0.as_half2, z1); + dq[1] = __hfma2(q1.as_half2, y16, z16); + dq[2] = __hadd2(q2.as_half2, z1); + dq[3] = __hfma2(q3.as_half2, y16, z16); } -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1z16)[2], - half2 (&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); +__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale( + const uint32_t zero, const half scale, half2 (&z1z16)[2], + half2 (&y1y16)[2]) { + half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); + half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - half2 scale2 = __half2half2(scale); + half2 scale2 = __half2half2(scale); - z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half)); - z1z16[1] = __hmul2(scale2, __half2half2(z16)); + z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half)); + z1z16[1] = __hmul2(scale2, __half2half2(z16)); - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); + const half y1 = 
__float2half_rn(1.0f); + const half y16 = __float2half_rn(1.0f / 16.0f); - y1y16[0] = __hmul2(scale2, __half2half2(y1)); - y1y16[1] = __hmul2(scale2, __half2half2(y16)); + y1y16[0] = __hmul2(scale2, __half2half2(y1)); + y1y16[1] = __hmul2(scale2, __half2half2(y16)); } -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1z16)[2], - half2(&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); +__forceinline__ __device__ void dequant_4bit_8_prep_zero(const uint32_t zero, + half2 (&z1z16)[2], + half2 (&y1y16)[2]) { + half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); + half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - z1z16[0] = __half2half2(z1.as_half); - z1z16[1] = __half2half2(z16); + z1z16[0] = __half2half2(z1.as_half); + z1z16[1] = __half2half2(z16); - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); + const half y1 = __float2half_rn(1.0f); + const half y16 = __float2half_rn(1.0f / 16.0f); - y1y16[0] = __half2half2(y1); - y1y16[1] = __half2half2(y16); + y1y16[0] = __half2half2(y1); + y1y16[1] = __half2half2(y16); } - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1z16)[2], - half2 (&y1y16)[2], - int stride, - bool scaled -) -{ - const uint32_t c0 = 0x64006400; - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0] + 1024, q[1] + 1024 ) - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 ) - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4] + 1024, q[5] + 1024 ) - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 ) - - if (scaled) - { - dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]); // half2( q[0] * s - z * s, q[1] * s - z * s) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] * s - z * s, q[3] * s - z * s) - dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]); - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); - } - else - { - dq[0] = __hadd2(q0.as_half2, z1z16[0]); // half2( q[0] - z, q[1] - z ) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] - z, q[3] - z ) - dq[2] = __hadd2(q2.as_half2, z1z16[0]); // half2( q[4] - z, q[5] - z ) - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); // half2( q[6] - z, q[7] - z ) - } +__forceinline__ __device__ void dequant_4bit_8_gptq(const uint32_t q_0, + half2 (&dq)[4], + half2 (&z1z16)[2], + half2 (&y1y16)[2], + int stride, bool scaled) { + const uint32_t c0 = 0x64006400; + + uint32_t qa = q_0; + half2_uint32 q0((qa & 0x000f000f) | + c0); // half2( q[0] + 1024, q[1] + 1024 ) + half2_uint32 q1((qa & 0x00f000f0) | + c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 ) + qa >>= 8; + half2_uint32 q2((qa & 0x000f000f) | + c0); // half2( q[4] + 1024, q[5] + 1024 ) + half2_uint32 q3((qa & 0x00f000f0) | + c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 ) + + if (scaled) { + dq[0] = __hfma2(q0.as_half2, y1y16[0], + z1z16[0]); // half2( q[0] * s - z * s, q[1] * s - z * s) + dq[1] = __hfma2(q1.as_half2, y1y16[1], + z1z16[1]); // half2( q[2] * s - z * s, q[3] * s - z * s) + dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]); + dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); + } else { + dq[0] = __hadd2(q0.as_half2, z1z16[0]); // half2( q[0] - z, q[1] - z ) + dq[1] = __hfma2(q1.as_half2, y1y16[1], + z1z16[1]); // half2( q[2] - z, q[3] - z ) + dq[2] = __hadd2(q2.as_half2, 
z1z16[0]); // half2( q[4] - z, q[5] - z ) + dq[3] = __hfma2(q3.as_half2, y1y16[1], + z1z16[1]); // half2( q[6] - z, q[7] - z ) + } } } // namespace gptq } // namespace vllm diff --git a/csrc/quantization/gptq/qdq_8.cuh b/csrc/quantization/gptq/qdq_8.cuh index 0c7ad7876140b..feb5d220424b0 100644 --- a/csrc/quantization/gptq/qdq_8.cuh +++ b/csrc/quantization/gptq/qdq_8.cuh @@ -10,28 +10,18 @@ Copied from https://github.com/turboderp/exllamav2 namespace vllm { namespace gptq { -__forceinline__ __device__ void shuffle_8bit_4 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_8bit_8 -( - const uint32_t q_0, - const uint32_t q_1, - half2 (&dq)[4], - int stride, - const uint32_t zero -) -{ - half dqh[8]; - for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); - for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); +__forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} + +__forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, + const uint32_t q_1, + half2 (&dq)[4], int stride, + const uint32_t zero) { + half dqh[8]; + for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); + for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); + + for (int i = 0; i < 4; i++) + dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); } } // namespace gptq diff --git a/csrc/quantization/gptq/qdq_util.cuh b/csrc/quantization/gptq/qdq_util.cuh index 1722a9aa6cb34..9426408fec502 100644 --- a/csrc/quantization/gptq/qdq_util.cuh +++ b/csrc/quantization/gptq/qdq_util.cuh @@ -8,51 +8,47 @@ Copied from https://github.com/turboderp/exllamav2 namespace vllm { namespace gptq { -union half2_uint32 -{ - uint32_t as_uint32; - half2 as_half2; - __device__ half2_uint32(uint32_t val) : as_uint32(val) {} - __device__ half2_uint32(half2 val) : as_half2(val) {} +union half2_uint32 { + uint32_t as_uint32; + half2 as_half2; + __device__ half2_uint32(uint32_t val) : as_uint32(val) {} + __device__ half2_uint32(half2 val) : as_half2(val) {} }; -union half_uint16 -{ - uint16_t as_uint16; - half as_half; - __device__ half_uint16(uint16_t val) : as_uint16(val) {} - __device__ half_uint16(half val) : as_half(val) {} +union half_uint16 { + uint16_t as_uint16; + half as_half; + __device__ half_uint16(uint16_t val) : as_uint16(val) {} + __device__ half_uint16(half val) : as_half(val) {} }; // Max_scale premultiplied by 1/256 -__forceinline__ __device__ half dq_scale(const int qs, const half max_scale) -{ - int qs_i = qs + 1; - half qs_h = __int2half_rn(qs_i * qs_i); - qs_h = __hmul(qs_h, max_scale); - return qs_h; +__forceinline__ __device__ half dq_scale(const int qs, const half max_scale) { + int qs_i = qs + 1; + half qs_h = __int2half_rn(qs_i * qs_i); + qs_h = __hmul(qs_h, max_scale); + return qs_h; } -__forceinline__ __device__ half dq(const int q, const int qzero, const half scale) -{ - return __hmul(__int2half_rn(q - qzero), scale); +__forceinline__ __device__ half dq(const int q, const int qzero, + const half scale) { + return __hmul(__int2half_rn(q - qzero), scale); } -__forceinline__ __device__ half dq_ns(const int q, const int qzero) -{ - //return __hsub(__int2half_rn(q), __int2half_rn(qzero)); - return __int2half_rn(q - qzero); +__forceinline__ __device__ half dq_ns(const int q, const int qzero) { + // return __hsub(__int2half_rn(q), __int2half_rn(qzero)); + return __int2half_rn(q - qzero); } -__forceinline__ 
__device__ int exb(const uint32_t q, const int shift, const int mask) -{ - return (int)((q >> shift) & mask); +__forceinline__ __device__ int exb(const uint32_t q, const int shift, + const int mask) { + return (int)((q >> shift) & mask); } -__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) -{ - return (int)(__funnelshift_rc(q0, q1, shift) & mask); +__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, + const int shift, const int mask) { + return (int)(__funnelshift_rc(q0, q1, shift) & mask); } } // namespace gptq diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 34950a5d13cf5..c573b9041065b 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -22,53 +22,58 @@ #include "gptq_marlin.cuh" #include "gptq_marlin_dtypes.cuh" -#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) static_assert(\ - std::is_same::value || std::is_same::value, \ - "only float16 and bfloat16 is supported"); - -template inline std::string str(T x) { return std::to_string(x); } +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +template +inline std::string str(T x) { + return std::to_string(x); +} namespace gptq_marlin { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 -__global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr, - int const *__restrict__ perm_int_ptr, - int4 *__restrict__ out_int4_ptr, int size_m, +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, int size_k, int block_rows) {} -template shared - // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > -__global__ void -Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 *__restrict__ scales_ptr, // fp16 quantization scales of shape - // (k/groupsize)xn - const int *__restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) {} -} // namespace gptq_marlin +} // namespace gptq_marlin 
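// The empty kernel bodies in this branch are deliberate: cp.async and the
// mma.sync.m16n8k16 tensor-core path used below require sm_80, so older
// architectures get compilable stubs while the host entry point raises
// TORCH_CHECK_NOT_IMPLEMENTED. A minimal sketch of the equivalent host-side
// capability check (variable names here are illustrative assumptions; the
// CUDA runtime calls themselves are standard):
int cc_major = 0;
cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor,
                       /*device=*/0);
bool marlin_arch_ok = (cc_major >= 8);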
-torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, - torch::Tensor &b_scales, torch::Tensor &g_idx, - torch::Tensor &perm, torch::Tensor &workspace, +torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& g_idx, + torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full) { TORCH_CHECK_NOT_IMPLEMENTED(false, @@ -81,24 +86,26 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, // m16n8k16 tensor core mma instruction with fp16 inputs and fp32 // output/accumulation. template -__device__ inline void mma(const typename ScalarType::FragA &a_frag, - const typename ScalarType::FragB &frag_b, - typename ScalarType::FragC &frag_c) { - const uint32_t *a = reinterpret_cast(&a_frag); - const uint32_t *b = reinterpret_cast(&frag_b); - float *c = reinterpret_cast(&frag_c); +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); if constexpr (std::is_same::value) { - asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), - "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); } else if constexpr (std::is_same::value) { - asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), - "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); } else { STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); } @@ -107,8 +114,9 @@ __device__ inline void mma(const typename ScalarType::FragA &a_frag, // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. template -__device__ inline void ldsm4(typename ScalarType::FragA &frag_a, const void *smem_ptr) { - uint32_t *a = reinterpret_cast(&frag_a); +__device__ inline void ldsm4(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) @@ -118,7 +126,8 @@ __device__ inline void ldsm4(typename ScalarType::FragA &frag_a, const // Lookup-table based 3-input logical operation; explicitly used for // dequantization as the compiler does not seem to automatically recognize it in // all cases. 
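// The 8-bit template immediate of lop3.b32 is simply the truth table of the
// desired 3-input boolean function, obtained by evaluating it on the
// canonical operand patterns a = 0xF0, b = 0xCC, c = 0xAA. A minimal check of
// the fused (a & b) | c form that the dequantizers below rely on (the exact
// template arguments used in this file follow the upstream Marlin kernel and
// are stated here as an assumption):
static_assert(((0xf0 & 0xcc) | 0xaa) == 0xea,
              "lop3 lut immediate selecting (a & b) | c");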
-template __device__ inline int lop3(int a, int b, int c) { +template +__device__ inline int lop3(int a, int b, int c) { int res; asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) @@ -140,8 +149,10 @@ __device__ inline uint32_t prmt(uint32_t a) { // Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 // values. We mostly follow the strategy in the link below, with some small // changes: -// - FP16: https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 -// - BF16: https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 template __device__ inline typename ScalarType::FragB dequant_4bit(int q) { STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); @@ -161,16 +172,17 @@ __device__ inline typename ScalarType::FragB dequant_4bit(int q) { const int MUL = 0x2c002c00; const int ADD = 0xd480d480; typename ScalarType::FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); return frag_b; } template <> -__device__ inline typename ScalarType::FragB dequant_4bit(int q) { +__device__ inline typename ScalarType::FragB +dequant_4bit(int q) { static constexpr uint32_t MASK = 0x000f000f; static constexpr uint32_t EX = 0x43004300; @@ -184,7 +196,7 @@ __device__ inline typename ScalarType::FragB dequant_4bit(&lo), + frag_b[0] = __hfma2(*reinterpret_cast(&lo), *reinterpret_cast(&MUL), *reinterpret_cast(&ADD)); frag_b[1] = __hfma2(*reinterpret_cast(&hi), @@ -193,10 +205,12 @@ __device__ inline typename ScalarType::FragB dequant_4bit __device__ inline typename ScalarType::FragB dequant_8bit(int q) { STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); @@ -214,24 +228,26 @@ __device__ inline typename ScalarType::FragB dequant_8bit(int q) { static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; typename ScalarType::FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); - frag_b[1] = __hsub2(*reinterpret_cast(&hi), - *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); return frag_b; } template <> -__device__ inline typename ScalarType::FragB dequant_8bit(int q) { +__device__ inline typename ScalarType::FragB +dequant_8bit(int q) { typename ScalarType::FragB frag_b; float fp32_intermediates[4]; - uint32_t * fp32_intermediates_casted = reinterpret_cast(fp32_intermediates); + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); static constexpr uint32_t fp32_base = 0x4B000000; - fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 
0x7650); + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); - fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); fp32_intermediates[0] -= 8388736.f; @@ -240,8 +256,10 @@ __device__ inline typename ScalarType::FragB dequant_8bit(&frag_b); - bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632); - bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); return frag_b; } @@ -249,30 +267,32 @@ __device__ inline typename ScalarType::FragB dequant_8bit -__device__ inline void scale(typename ScalarType::FragB &frag_b, - typename ScalarType::FragS &frag_s, int i) { +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { using scalar_t2 = typename ScalarType::scalar_t2; - scalar_t2 s = ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); frag_b[0] = __hmul2(frag_b[0], s); frag_b[1] = __hmul2(frag_b[1], s); } // Same as above, but for act_order (each K is multiplied individually) template -__device__ inline void scale4(typename ScalarType::FragB &frag_b, - typename ScalarType::FragS &frag_s_1, - typename ScalarType::FragS &frag_s_2, - typename ScalarType::FragS &frag_s_3, - typename ScalarType::FragS &frag_s_4, +__device__ inline void scale4(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, + typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, int i) { - using scalar_t2 = typename ScalarType::scalar_t2; + using scalar_t2 = typename ScalarType::scalar_t2; scalar_t2 s_val_1_2; - s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; - s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; scalar_t2 s_val_3_4; - s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; - s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; frag_b[0] = __hmul2(frag_b[0], s_val_1_2); frag_b[1] = __hmul2(frag_b[1], s_val_3_4); @@ -280,14 +300,15 @@ __device__ inline void scale4(typename ScalarType::FragB &frag_b, // Given 2 floats multiply by 2 scales (halves) template -__device__ inline void scale_float(float *c, typename ScalarType::FragS &s) { - scalar_t *s_ptr = reinterpret_cast(&s); +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1])); } // Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int *lock, int count) { +__device__ inline void barrier_acquire(int* lock, int count) { if (threadIdx.x == 0) { int state = -1; do @@ -302,7 +323,7 @@ __device__ inline void barrier_acquire(int *lock, int count) { } // Release barrier and increment visitation count. 
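// Both halves of this global-memory barrier are easy to miss in the inline
// PTX, so here is a simplified, assumption-laden sketch of the same idea
// using plain atomics on a zero-initialized lock (the real code relies on
// acquire/release orderings rather than explicit fences):
__device__ inline void spin_barrier_acquire(int* lock, int count) {
  if (threadIdx.x == 0) {
    while (atomicAdd(lock, 0) != count) {
      // Spin until `count` producer blocks have released the lock.
    }
    __threadfence();  // pair with the producer-side fence before reading data
  }
  __syncthreads();
}

__device__ inline void spin_barrier_release(int* lock) {
  __threadfence();  // make this thread's global writes visible device-wide
  __syncthreads();  // wait until every thread in the block has done so
  if (threadIdx.x == 0) atomicAdd(lock, 1);
}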
-__device__ inline void barrier_release(int *lock, bool reset = false) { +__device__ inline void barrier_release(int* lock, bool reset = false) { __syncthreads(); if (threadIdx.x == 0) { if (reset) { @@ -321,11 +342,10 @@ __device__ inline void barrier_release(int *lock, bool reset = false) { // For a given "a" of size [M,K] performs a permutation of the K columns based // on the given "perm" indices. -__global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr, - int const *__restrict__ perm_int_ptr, - int4 *__restrict__ out_int4_ptr, int size_m, +__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, int size_m, int size_k, int block_rows) { - int start_row = block_rows * blockIdx.x; int finish_row = start_row + block_rows; if (finish_row > size_m) { @@ -341,9 +361,8 @@ __global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr, int offset = row * row_stride; - half const *a_row_half = - reinterpret_cast(a_int4_ptr + offset); - half *out_half = reinterpret_cast(out_int4_ptr + offset); + half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); + half* out_half = reinterpret_cast(out_int4_ptr + offset); int base_k = 0; @@ -374,31 +393,32 @@ __global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr, } } -template shared - // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > -__global__ void -Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 *__restrict__ scales_ptr, // fp16 quantization scales of shape - // (k/groupsize)xn - const int *__restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -445,11 +465,11 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int slice_row = (iters * blockIdx.x) % k_tiles; int slice_col_par = (iters * blockIdx.x) / k_tiles; int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice + int slice_iters; // number of threadblock tiles in the current slice int 
slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers @@ -465,27 +485,22 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk auto init_slice = [&]() { slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) - slice_iters = 0; - if (slice_iters == 0) - return; - if (slice_row + slice_iters > k_tiles) - slice_iters = k_tiles - slice_row; + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; slice_count = 1; slice_idx = 0; int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); if (col_first <= k_tiles * (slice_col_par + 1)) { int col_off = col_first - k_tiles * slice_col_par; slice_count = div_ceil(k_tiles - col_off, iters); - if (col_off > 0) - slice_count++; + if (col_off > 0) slice_count++; int delta_first = iters * blockIdx.x - col_first; if (delta_first < 0 || (col_off == 0 && delta_first == 0)) slice_idx = slice_count - 1; else { slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) - slice_idx--; + if (col_off > 0) slice_idx--; } } if (slice_col == n_tiles) { @@ -605,7 +620,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // needed if there are more threads than required for a certain tilesize or // when the batchsize is not a multiple of 16. bool a_sh_wr_pred[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; @@ -623,13 +638,13 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // loop unrolls, all shared memory accesses are static, we simply precompute // both transformed reads and writes. int a_sh_wr_trans[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < thread_m_blocks; j++) a_sh_rd_trans[i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); @@ -639,30 +654,30 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // runtime; we break dependencies between subsequent accesses with a tile by // maintining multiple pointers (we have enough registers), a tiny // optimization. - const int4 *B_ptr[b_sh_wr_iters]; -#pragma unroll + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; extern __shared__ int4 sh[]; // Shared memory storage for global fetch pipelines. 
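// The pointers below carve the single extern __shared__ arena into per-stage
// regions: A tiles, then B tiles, then g_idx, then scales, each replicated
// `stages` times for the asynchronous fetch pipeline. Every launch requests
// the full max_shared_mem as dynamic shared memory (see the __CALL_IF macro
// further down), and is_valid_cache_size() has already verified that the
// pipelined A/B tiles plus the scale cache fit into roughly 95% of it. As a
// rough per-kernel budget (the scale term varies with group_blocks and
// act_order, so treat this as a sketch):
//   total_int4s ~= stages * (a_sh_stage + b_sh_stage + g_idx_stage) + scales
//   total_bytes  = total_int4s * sizeof(int4)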
- int4 *sh_a = sh; - int4 *sh_b = sh_a + (stages * a_sh_stage); - int4 *sh_g_idx = sh_b + (stages * b_sh_stage); - int4 *sh_s = sh_g_idx + (stages * g_idx_stage); + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_s = sh_g_idx + (stages * g_idx_stage); // Register storage for double buffer of shared memory reads. FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2][b_thread_vecs]; FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order - FragS act_frag_s[2][4][4]; // For act-order + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order // Zero accumulators. auto zero_accums = [&]() { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; + reinterpret_cast(frag_c)[i] = 0; }; int sh_first_group_id = -1; @@ -706,18 +721,18 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // shared memory pipeline location. auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { if (pred) { - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) { cp_async4_pred( &sh_a_stage[a_sh_wr_trans[i]], &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], a_sh_wr_pred[i]); } - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; -#pragma unroll + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < b_thread_vecs; j++) { cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); } @@ -730,10 +745,10 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int full_pipe = a_off; int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; if (cur_k < prob_k && cur_k < slice_k_finish) { - int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - int4 const *cur_g_idx_stage_ptr = - reinterpret_cast(&g_idx[cur_k]); + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); if (threadIdx.x < g_idx_stage) { cp_async4_pred(&sh_g_idx_stage[threadIdx.x], @@ -742,7 +757,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk } } else { if constexpr (group_blocks != -1) { - int4 *sh_s_stage = sh_s + s_sh_stage * pipe; + int4* sh_s_stage = sh_s + s_sh_stage * pipe; if constexpr (group_blocks >= thread_k_blocks) { // Only fetch scales if this tile starts a new group @@ -782,15 +797,16 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Load the next sub-tile from the current location in the shared memory pipe // into the current register buffer. 
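// frag_a and frag_b_quant above are two-deep register buffers indexed with
// (k % 2): while the tensor cores work on sub-tile k, the loads issued by
// fetch_to_registers(k + 1, ...) in the main loop already stage sub-tile
// k + 1 from shared memory. This is the register-level half of the double
// buffering; the shared-memory half is provided by the `stages`-deep
// global->shared pipeline above.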
auto fetch_to_registers = [&](int k, int pipe) { - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; + ldsm4(frag_a[k % 2][i], + &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; -#pragma unroll + #pragma unroll for (int i = 0; i < b_thread_vecs; i++) { - frag_b_quant[k % 2][i] = *reinterpret_cast( + frag_b_quant[k % 2][i] = *reinterpret_cast( &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); } }; @@ -805,8 +821,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk return; } - int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - int *sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); int group_id_1 = sh_g_idx_int_ptr[0]; int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; @@ -822,10 +838,10 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // No act-order case if constexpr (group_blocks != -1) { if constexpr (group_blocks >= thread_k_blocks) { - int4 *sh_s_stage = + int4* sh_s_stage = sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; } else { int warp_id = threadIdx.x / 32; int n_warps = thread_n_blocks / 4; @@ -838,9 +854,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int k_blocks = cur_k / 16; int cur_group_id = k_blocks / group_blocks; - int4 *sh_s_stage = sh_s + s_sh_stage * pipe; + int4* sh_s_stage = sh_s + s_sh_stage * pipe; - reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; } } @@ -867,7 +883,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // thread-id) int warp_id = threadIdx.x / 32; int n_warps = - thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N int warp_row = warp_id / n_warps; int warp_col = warp_id % n_warps; @@ -875,7 +891,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk cur_k += warp_row * 16; int th_id = threadIdx.x % 32; - cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix int s_col_shift = /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + @@ -883,45 +899,44 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk if (is_same_group[pipe]) { if (k % 2 == 0) { - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + s_col_shift]; } else { - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = - *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); } for (int i = 1; i < 4; i++) { - *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = - *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); } 
return; } - int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; - int *sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); constexpr int k_frag_offsets[4] = {0, 1, 8, - 9}; // Tensor core offsets per thread + 9}; // Tensor core offsets per thread -#pragma unroll + #pragma unroll for (int i = 0; i < 4; i++) { - int actual_k = cur_k + k_frag_offsets[i]; int group_id = sh_g_idx_int_ptr[actual_k]; int rel_group_id = group_id - sh_first_group_id; - *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = sh_s[rel_group_id * s_sh_stride + s_col_shift]; } }; // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { -// We have the m dimension as the inner loop in order to encourage overlapping -// dequantization and matmul operations. -#pragma unroll + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. + #pragma unroll for (int j = 0; j < 4; j++) { FragB frag_b0; FragB frag_b1; @@ -933,7 +948,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk frag_b1 = dequant_4bit(b_quant_shift); } else { - int *frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); int b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; int b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; @@ -943,8 +958,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Apply scale to frag_b0 if constexpr (has_act_order) { - scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], - act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0); + scale4(frag_b0, act_frag_s[k % 2][0][j], + act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j], + act_frag_s[k % 2][3][j], 0); } else { if constexpr (group_blocks != -1) { scale(frag_b0, frag_s[k % 2][j], 0); @@ -953,8 +969,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Apply scale to frag_b1 if constexpr (has_act_order) { - scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], - act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1); + scale4(frag_b1, act_frag_s[k % 2][0][j], + act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j], + act_frag_s[k % 2][3][j], 1); } else { if constexpr (group_blocks != -1) { @@ -962,7 +979,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk } } -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); @@ -987,38 +1004,38 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // unnecessary read or write iterations, e.g., for two warps we write only // once by warp 1 and read only once by warp 0. 
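// The loops below form a log2-step tree reduction in shared memory across the
// warps that cover the same output tile: at step i, warps whose red_idx lies
// in [i, 2*i) hand their partial sums one level down, and the warp with
// red_idx == 0 finally folds the surviving partials into its frag_c
// accumulators, which are the only ones carried forward to the write-out.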
-#pragma unroll + #pragma unroll for (int m_block = 0; m_block < thread_m_blocks; m_block++) { -#pragma unroll + #pragma unroll for (int i = red_off; i > 0; i /= 2) { if (i <= red_idx && red_idx < 2 * i) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4 * 2; j++) { int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); if (i < red_off) { - float *c_rd = reinterpret_cast( - &sh[red_sh_delta * j + red_sh_rd]); - float *c_wr = reinterpret_cast(&sh[red_sh_wr]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; } sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; } } __syncthreads(); } if (red_idx == 0) { -#pragma unroll + #pragma unroll for (int i = 0; i < 4 * 2; i++) { - float *c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += c_rd[j]; } } @@ -1049,39 +1066,39 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int row = (threadIdx.x % 32) / 4; if (!first) { -// Interestingly, doing direct global accesses here really seems to mess up the -// compiler and lead to slowdowns, hence we also use async-copies even though -// these fetches are not actually asynchronous. -#pragma unroll + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. 
+ #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || - 8 * (i / 2) + row < prob_m); + cp_async4_pred( + &sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); } cp_async_fence(); cp_async_wait<0>(); } -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { if (!first) { int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; -#pragma unroll + #pragma unroll for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( + reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - Dtype::num2float(reinterpret_cast(&c_red)[j]); + Dtype::num2float(reinterpret_cast(&c_red)[j]); } } if (!last) { int4 c; -#pragma unroll + #pragma unroll for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast(&c)[j] = - Dtype::float2num(reinterpret_cast( + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); } C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = @@ -1115,8 +1132,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // We first reorder in shared memory to guarantee the most efficient final // global write patterns - auto write = [&](int idx, float c0, float c1, FragS &s) { - scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); // For per-column quantization we finally apply the scale here (only for // 4-bit) @@ -1124,13 +1142,13 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk res = __hmul2(res, s[0]); } - ((scalar_t2 *)sh)[idx] = res; + ((scalar_t2*)sh)[idx] = res; }; if (threadIdx.x / 32 < thread_n_blocks / 4) { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4; j++) { int wr = c_sh_wr + 8 * j; write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], @@ -1147,7 +1165,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk } __syncthreads(); -#pragma unroll + #pragma unroll for (int i = 0; i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { @@ -1162,7 +1180,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Start global fetch and register load pipelines. auto start_pipes = [&]() { -#pragma unroll + #pragma unroll for (int i = 0; i < stages - 1; i++) { if (has_act_order && i == 0) { int last_g_idx = slice_k_start + stages * tb_k * 2; @@ -1193,9 +1211,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // have even length meaning that the next iteration will always start at // index 0. 
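// Steady state of the main loop that follows: for each sub-tile k the kernel
// (1) loads sub-tile k + 1 from shared memory into registers, (2) once per
// shared-memory tile kicks off the asynchronous global fetch for the tile
// `stages - 1` iterations ahead and waits on the oldest outstanding cp.async
// group, and (3) runs the tensor-core matmul for sub-tile k, so global
// traffic, shared-memory loads and math all overlap. A simplified sketch of
// the schedule (pseudo-structure only, argument lists elided):
//
//   for (int pipe = 0; pipe < stages;) {
//     for (int k = 0; k < b_sh_wr_iters; k++) {
//       fetch_to_registers(k + 1, pipe % stages);
//       fetch_scales_to_registers(k + 1, pipe);
//       if (k == b_sh_wr_iters - 2) {
//         fetch_to_shared((pipe + stages - 1) % stages, pipe, ...);
//         pipe++;
//         // wait for the oldest outstanding cp.async group, __syncthreads()
//       }
//       matmul(k);
//     }
//     ...
//   }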
-#pragma unroll + #pragma unroll for (int pipe = 0; pipe < stages;) { -#pragma unroll + #pragma unroll for (int k = 0; k < b_sh_wr_iters; k++) { fetch_to_registers(k + 1, pipe % stages); fetch_scales_to_registers(k + 1, pipe); @@ -1261,8 +1279,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; } } else { @@ -1270,8 +1288,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; } } } @@ -1282,31 +1300,35 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // overflow in fp16) if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) { if (threadIdx.x / 32 < thread_n_blocks / 4) { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4; j++) { - scale_float(reinterpret_cast(&frag_c[i][j][0][0]), - frag_s[j / 2][2 * (j % 2) + 0]); - scale_float(reinterpret_cast(&frag_c[i][j][0][2]), - frag_s[j / 2][2 * (j % 2) + 0]); - - scale_float(reinterpret_cast(&frag_c[i][j][1][0]), - frag_s[j / 2][2 * (j % 2) + 1]); - scale_float(reinterpret_cast(&frag_c[i][j][1][2]), - frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + 0]); + + scale_float( + reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); } } } } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice barrier_acquire(&locks[slice_col], slice_idx); global_reduce(slice_idx == 0, last); barrier_release(&locks[slice_col], last); } - if (last) // only the last block in a slice actually writes the result + if (last) // only the last block in a slice actually writes the result write_result(); slice_row = 0; slice_col_par++; @@ -1315,13 +1337,12 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk if (slice_iters) { a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; if (slice_col == 0) { -#pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] -= b_gl_stride; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; } // Update slice k/n for scales loading @@ -1341,23 +1362,24 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk } } -#define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \ - else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == 
THREAD_K_BLOCKS && \ - has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \ - num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute( \ - Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \ - prob_k, locks); \ - } + #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \ + THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \ + else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \ + num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute( \ + Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin<<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \ + prob_k, locks); \ + } typedef struct { int thread_k; @@ -1389,7 +1411,7 @@ thread_config_t large_batch_thread_configs[] = { }; -int get_scales_cache_size(thread_config_t const &th_config, int prob_m, +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full) { bool cache_scales_chunk = has_act_order && !is_k_full; @@ -1402,15 +1424,15 @@ int get_scales_cache_size(thread_config_t const &th_config, int prob_m, if (group_size == -1) { tb_groups = 1; } else if (group_size == 0) { - tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size } else { tb_groups = div_ceil(tb_k, group_size); } if (cache_scales_chunk) { int load_groups = - tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K - load_groups = max(load_groups, 32); // We load at least 32 scale groups + tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups return load_groups * tb_n * 2; } else { @@ -1420,7 +1442,7 @@ int get_scales_cache_size(thread_config_t const &th_config, int prob_m, } } -bool is_valid_cache_size(thread_config_t const &th_config, int max_m_blocks, +bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int scales_cache_size, int max_shared_mem) { int pack_factor = 32 / num_bits; @@ -1451,12 +1473,12 @@ bool is_valid_cache_size(thread_config_t const &th_config, int max_m_blocks, float pipe_size = (a_size + b_size) * pipe_stages; - TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity + TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); } -bool is_valid_config(thread_config_t const &th_config, int max_m_blocks, +bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, int max_shared_mem) { @@ -1519,43 +1541,43 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, } } - max_m_blocks--; // Process less M blocks per invocation to reduce cache - // usage + max_m_blocks--; // Process less M blocks per invocation to reduce cache + // usage } return exec_config_t{0, {-1, -1, -1}}; } -#define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 2, N_BLOCKS, 
K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ - \ - __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ - \ - __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ - __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) + #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \ + \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) template -void marlin_mm_f16i4(const void *A, const void *B, void *C, void *s, - void *g_idx, void *perm, void *a_tmp, int prob_m, - int prob_n, int prob_k, void *workspace, int num_bits, +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, + void* g_idx, void* perm, void* a_tmp, int prob_m, + int prob_n, int prob_k, void* workspace, int num_bits, bool has_act_order, bool is_k_full, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, int sms, int max_par) { @@ -1639,15 +1661,15 @@ void marlin_mm_f16i4(const void *A, const void *B, void *C, void *s, } } - const int4 *A_ptr = (const int4 *)A; - const int4 *B_ptr = (const int4 *)B; - int4 *C_ptr = (int4 *)C; - const int4 *s_ptr = (const int4 *)s; - const 
int *g_idx_ptr = (const int *)g_idx; - const int *perm_ptr = (const int *)perm; - int4 *a_tmp_ptr = (int4 *)a_tmp; + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; - int *locks = (int *)workspace; + int* locks = (int*)workspace; if (has_act_order) { // Permute A columns @@ -1673,8 +1695,7 @@ void marlin_mm_f16i4(const void *A, const void *B, void *C, void *s, // Note that parallel > 1 currently only works for inputs without any // padding par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks); - if (par > max_par) - par = max_par; + if (par > max_par) par = max_par; prob_m = (16 * exec_cfg.max_m_blocks) * par; i += exec_cfg.max_m_blocks * (par - 1); thread_m_blocks = exec_cfg.max_m_blocks; @@ -1709,11 +1730,11 @@ void marlin_mm_f16i4(const void *A, const void *B, void *C, void *s, } } -} // namespace gptq_marlin +} // namespace gptq_marlin -torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, - torch::Tensor &b_scales, torch::Tensor &g_idx, - torch::Tensor &perm, torch::Tensor &workspace, +torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& g_idx, + torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full) { // Verify num_bits @@ -1824,18 +1845,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, int dev = a.get_device(); if (a.scalar_type() == at::ScalarType::Half) { gptq_marlin::marlin_mm_f16i4( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), b_scales.data_ptr(), - g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, - num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, gptq_marlin::max_par); + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups, + group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, + thread_n, sms, gptq_marlin::max_par); } else if (a.scalar_type() == at::ScalarType::BFloat16) { gptq_marlin::marlin_mm_f16i4( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), b_scales.data_ptr(), - g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, - num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, gptq_marlin::max_par); + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), b_scales.data_ptr(), + g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), + size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, + is_k_full, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + gptq_marlin::max_par); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cuh b/csrc/quantization/gptq_marlin/gptq_marlin.cuh index 35ea48aaba310..ba5368ea8835f 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh @@ -11,22 +11,23 @@ namespace gptq_marlin { -// 8 warps are a good 
choice since every SM has 4 schedulers and having more than 1 warp per -// schedule allows some more latency hiding. At the same time, we want relatively few warps to have -// many registers per warp and small tiles. +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. static constexpr int default_threads = 256; -static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory +static constexpr int pipe_stages = + 4; // 4 pipeline stages fit into shared memory static constexpr int min_thread_n = 64; static constexpr int min_thread_k = 64; static constexpr int tile_size = 16; -static constexpr int max_par = 16; +static constexpr int max_par = 16; template struct Vec { - T elems[n]; + T elems[n]; __device__ T& operator[](int i) { return elems[i]; } }; @@ -35,30 +36,35 @@ using I4 = Vec; constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - // No support for async +// No support for async #else -__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); } __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); } -__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); } +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} template __device__ inline void cp_async_wait() { @@ -67,4 +73,4 @@ __device__ inline void cp_async_wait() { #endif -} // namespace gptq_marlin +} // namespace gptq_marlin diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_dtypes.cuh b/csrc/quantization/gptq_marlin/gptq_marlin_dtypes.cuh index 7881abbe4cbbf..ca1b7099d6ec7 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin_dtypes.cuh +++ b/csrc/quantization/gptq_marlin/gptq_marlin_dtypes.cuh @@ -5,58 +5,73 @@ #include #include - namespace gptq_marlin { template -class ScalarType { -}; +class ScalarType {}; template <> class ScalarType { -public: - using scalar_t = half; - using scalar_t2 = half2; - - // Matrix fragments for tensor core instructions; their precise layout is - // documented here: - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type - using FragA = Vec; - using FragB = Vec; - using FragC = Vec; - using FragS = Vec; - - static 
__device__ float inline num2float(const half x) { return __half2float(x); } - - static __device__ half2 inline num2num2(const half x) { return __half2half2(x); } - - static __device__ half2 inline nums2num2(const half x1, const half x2) { return __halves2half2(x1, x2); } - - static __host__ __device__ half inline float2num(const float x) { return __float2half(x); } + public: + using scalar_t = half; + using scalar_t2 = half2; + + // Matrix fragments for tensor core instructions; their precise layout is + // documented here: + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; + + static __device__ float inline num2float(const half x) { + return __half2float(x); + } + + static __device__ half2 inline num2num2(const half x) { + return __half2half2(x); + } + + static __device__ half2 inline nums2num2(const half x1, const half x2) { + return __halves2half2(x1, x2); + } + + static __host__ __device__ half inline float2num(const float x) { + return __float2half(x); + } }; template <> class ScalarType { -public: - using scalar_t = nv_bfloat16; - using scalar_t2 = nv_bfloat162; + public: + using scalar_t = nv_bfloat16; + using scalar_t2 = nv_bfloat162; - using FragA = Vec; - using FragB = Vec; - using FragC = Vec; - using FragS = Vec; + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - static __device__ float inline num2float(const nv_bfloat16 x) { return __bfloat162float(x); } - - static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) { return __bfloat162bfloat162(x); } - - static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, const nv_bfloat16 x2) { return __halves2bfloat162(x1, x2); } - - static __host__ __device__ nv_bfloat16 inline float2num(const float x) { return __float2bfloat16(x); } + static __device__ float inline num2float(const nv_bfloat16 x) { + return __bfloat162float(x); + } + + static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) { + return __bfloat162bfloat162(x); + } + + static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, + const nv_bfloat16 x2) { + return __halves2bfloat162(x1, x2); + } + + static __host__ __device__ nv_bfloat16 inline float2num(const float x) { + return __float2bfloat16(x); + } #endif }; -} +} // namespace gptq_marlin #endif diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu index 0d3da6240dbca..4adc158eb14ea 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu @@ -12,14 +12,14 @@ static constexpr int tile_n_size = tile_k_size * 4; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 template -__global__ void -marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, - uint32_t const *__restrict__ perm_ptr, - uint32_t *__restrict__ out_ptr, int size_k, int size_n) {} +__global__ void marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, + int size_k, int size_n) {} -} // namespace gptq_marlin +} // namespace gptq_marlin -torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm, +torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits) { 
TORCH_CHECK_NOT_IMPLEMENTED( @@ -30,10 +30,10 @@ torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm, #else template -__global__ void -marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, - uint32_t const *__restrict__ perm_ptr, - uint32_t *__restrict__ out_ptr, int size_k, int size_n) { +__global__ void marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, + int size_k, int size_n) { constexpr int pack_factor = 32 / num_bits; int k_tiles = size_k / tile_k_size; @@ -61,8 +61,8 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, constexpr int perm_size = tile_k_size / 4; - int4 *sh_perm_ptr = sh; - int4 *sh_pipe_ptr = sh_perm_ptr; + int4* sh_perm_ptr = sh; + int4* sh_pipe_ptr = sh_perm_ptr; if constexpr (has_perm) { sh_pipe_ptr += perm_size; } @@ -76,7 +76,7 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, auto load_perm_to_shared = [&](int k_tile_id) { int first_k_int4 = (k_tile_id * tile_k_size) / 4; - int4 const *perm_int4_ptr = reinterpret_cast(perm_ptr); + int4 const* perm_int4_ptr = reinterpret_cast(perm_ptr); if (threadIdx.x < perm_size) { sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x]; @@ -92,22 +92,22 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, int first_n = n_tile_id * tile_n_size; - int4 *sh_ptr = sh_pipe_ptr + stage_size * pipe; + int4* sh_ptr = sh_pipe_ptr + stage_size * pipe; if constexpr (has_perm) { if (threadIdx.x < stage_size) { int k_id = threadIdx.x / stage_n_threads; int n_id = threadIdx.x % stage_n_threads; - uint32_t const *sh_perm_int_ptr = - reinterpret_cast(sh_perm_ptr); + uint32_t const* sh_perm_int_ptr = + reinterpret_cast(sh_perm_ptr); int src_k = sh_perm_int_ptr[k_id]; int src_k_packed = src_k / pack_factor; cp_async4( &sh_ptr[k_id * stage_n_threads + n_id], - reinterpret_cast(&( + reinterpret_cast(&( b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)]))); } @@ -120,7 +120,7 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, int first_k_packed = first_k / pack_factor; cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], - reinterpret_cast( + reinterpret_cast( &(b_q_weight_ptr[(first_k_packed + k_id) * size_n + first_n + (n_id * 4)]))); } @@ -151,10 +151,10 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, constexpr int sh_stride = 64; constexpr uint32_t mask = (1 << num_bits) - 1; - int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; - uint32_t *sh_stage_int_ptr = reinterpret_cast(sh_stage_ptr); + int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; + uint32_t* sh_stage_int_ptr = reinterpret_cast(sh_stage_ptr); - uint32_t *sh_perm_int_ptr = reinterpret_cast(sh_perm_ptr); + uint32_t* sh_perm_int_ptr = reinterpret_cast(sh_perm_ptr); uint32_t vals[8]; @@ -176,17 +176,16 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, } } else { - uint32_t b1_vals[tile_ints]; uint32_t b2_vals[tile_ints]; -#pragma unroll + #pragma unroll for (int i = 0; i < tile_ints; i++) { b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i]; b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i]; } -#pragma unroll + #pragma unroll for (int i = 0; i < 4; i++) { int cur_elem = tc_row + tc_offsets[i]; int cur_int = cur_elem / pack_factor; @@ -206,7 +205,7 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; uint32_t res = 0; -#pragma unroll + #pragma 
unroll for (int i = 0; i < 8; i++) { res |= vals[pack_idx[i]] << (i * 4); } @@ -218,7 +217,7 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, uint32_t res1 = 0; uint32_t res2 = 0; -#pragma unroll + #pragma unroll for (int i = 0; i < 4; i++) { res1 |= vals[pack_idx[i]] << (i * 8); res2 |= vals[4 + pack_idx[i]] << (i * 8); @@ -230,14 +229,14 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, }; auto start_pipes = [&](int k_tile_id, int n_tile_id) { -#pragma unroll + #pragma unroll for (int pipe = 0; pipe < repack_stages - 1; pipe++) { fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe); } wait_for_stage(); }; -#pragma unroll + #pragma unroll for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) { int n_tile_id = 0; @@ -248,7 +247,7 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, start_pipes(k_tile_id, n_tile_id); while (n_tile_id < n_tiles) { -#pragma unroll + #pragma unroll for (int pipe = 0; pipe < repack_stages; pipe++) { fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, n_tile_id + pipe + repack_stages - 1); @@ -260,21 +259,21 @@ marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr, } } -} // namespace gptq_marlin - -#define CALL_IF(NUM_BITS, HAS_PERM) \ - else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ - cudaFuncSetAttribute( \ - gptq_marlin::marlin_repack_kernel, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - gptq_marlin::marlin_repack_kernel \ - <<>>( \ - b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \ - } +} // namespace gptq_marlin + + #define CALL_IF(NUM_BITS, HAS_PERM) \ + else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ + cudaFuncSetAttribute( \ + gptq_marlin::marlin_repack_kernel, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + gptq_marlin::marlin_repack_kernel \ + <<>>( \ + b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \ + } -torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm, +torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits) { // Verify compatibility with marlin tile of 16x64 @@ -318,11 +317,10 @@ torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm, bool has_perm = perm.size(0) != 0; // Get ptrs - uint32_t const *b_q_weight_ptr = - reinterpret_cast(b_q_weight.data_ptr()); - uint32_t const *perm_ptr = - reinterpret_cast(perm.data_ptr()); - uint32_t *out_ptr = reinterpret_cast(out.data_ptr()); + uint32_t const* b_q_weight_ptr = + reinterpret_cast(b_q_weight.data_ptr()); + uint32_t const* perm_ptr = reinterpret_cast(perm.data_ptr()); + uint32_t* out_ptr = reinterpret_cast(out.data_ptr()); // Get dev info int dev = b_q_weight.get_device(); diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index 002a70001885d..03d66cecedf1f 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -25,7 +25,10 @@ #include -template inline std::string str(T x) { return std::to_string(x); } +template +inline std::string str(T x) { + return std::to_string(x); +} namespace marlin { @@ -38,9 +41,10 @@ constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } // corresponding index accesses must be compile-time constants, which is why we // extensively use `#pragma unroll` throughout the kernel code to guarantee // this. 
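As an aside on the comment above: the fragment types carry their element count as a template parameter because fully unrolled loops turn every index into a compile-time constant, which is what lets the array live entirely in registers. A minimal sketch of that interplay (the VecExample/unroll_demo names are illustrative, not part of this patch):

#include <cuda_runtime.h>

// Register-resident vector: usable only when indices are compile-time constants.
template <typename T, int n>
struct VecExample {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

__global__ void unroll_demo(float* out) {
  VecExample<float, 4> v;
  #pragma unroll                 // after unrolling, i is a constant, so v stays
  for (int i = 0; i < 4; i++)    // in registers instead of spilling to local memory
    v[i] = static_cast<float>(i);
  float acc = 0.0f;
  #pragma unroll
  for (int i = 0; i < 4; i++) acc += v[i];
  if (threadIdx.x == 0 && blockIdx.x == 0) out[0] = acc;
}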
-template struct Vec { +template +struct Vec { T elems[n]; - __device__ T &operator[](int i) { return elems[i]; } + __device__ T& operator[](int i) { return elems[i]; } }; using I4 = Vec; @@ -51,29 +55,32 @@ using I4 = Vec; using FragA = Vec; using FragB = Vec; using FragC = Vec; -using FragS = Vec; // quantization scales +using FragS = Vec; // quantization scales // Predicated asynchronous global->shared copy; used for inputs A where we apply // predication to handle batchsizes that are not multiples of 16. -__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { const int BYTES = 16; uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); } // Asynchronous global->shared copy -__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) { +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { const int BYTES = 16; uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); } // Async copy fence. @@ -82,28 +89,30 @@ __device__ inline void cp_async_fence() { } // Wait until at most `n` async copy stages are still pending. -template __device__ inline void cp_async_wait() { +template +__device__ inline void cp_async_wait() { asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); } // m16n8k16 tensor core mma instruction with fp16 inputs and fp32 // output/accumulation. -__device__ inline void mma(const FragA &a_frag, const FragB &frag_b, - FragC &frag_c) { - const uint32_t *a = reinterpret_cast(&a_frag); - const uint32_t *b = reinterpret_cast(&frag_b); - float *c = reinterpret_cast(&frag_c); - asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), - "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, + FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); } // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. 
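Before the fragment-load and mma helpers continue, a minimal usage sketch of the commit/wait discipline the cp_async helpers above implement, reusing cp_async4, cp_async_fence and cp_async_wait with the signatures declared in this file (assumes sm_80+ and a launch with enough dynamic shared memory; the demo kernel itself is illustrative, not part of the patch):

// One prefetch stage: issue 16-byte async copies, commit them as a group,
// then block until that group has landed before touching shared memory.
__global__ void cp_async_demo(const int4* __restrict__ in,
                              int4* __restrict__ out, int n) {
  extern __shared__ int4 smem[];
  int i = threadIdx.x;                     // single-block demo
  if (i < n) cp_async4(&smem[i], &in[i]);  // async global->shared copy, 16 bytes
  cp_async_fence();                        // commit the copies issued so far
  cp_async_wait<0>();                      // wait until no committed groups remain
  __syncthreads();
  if (i < n) out[i] = smem[i];
}

The real kernel keeps several such groups in flight (its `stages`-deep pipeline) and only waits on the older ones, so tensor-core work overlaps the outstanding fetches.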
-__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { - uint32_t *a = reinterpret_cast(&frag_a); +__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) @@ -113,7 +122,8 @@ __device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { // Lookup-table based 3-input logical operation; explicitly used for // dequantization as the compiler does not seem to automatically recognize it in // all cases. -template __device__ inline int lop3(int a, int b, int c) { +template +__device__ inline int lop3(int a, int b, int c) { int res; asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) @@ -138,24 +148,24 @@ __device__ inline FragB dequant(int q) { const int MUL = 0x2c002c00; const int ADD = 0xd480d480; FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); return frag_b; } // Multiply dequantized values by the corresponding quantization scale; used // only for grouped quantization. -__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]); +__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); frag_b[0] = __hmul2(frag_b[0], s); frag_b[1] = __hmul2(frag_b[1], s); } // Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int *lock, int count) { +__device__ inline void barrier_acquire(int* lock, int count) { if (threadIdx.x == 0) { int state = -1; do @@ -170,7 +180,7 @@ __device__ inline void barrier_acquire(int *lock, int count) { } // Release barrier and increment visitation count. 
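Before the matching release helper, a quick host-side check of the fp16 bit identity that dequantization constants of this kind (the lop3 masks and values such as 0x2c002c00 and 0xd480d480 above) are built on: OR-ing a 4-bit value q into the exponent pattern 0x6400 produces the half-precision number 1024 + q, so one fused subtract or FMA per pair finishes the dequantization. The check below only demonstrates that identity; it does not reproduce the exact packed layout dequant() operates on.

#include <cuda_fp16.h>
#include <cstdio>

int main() {
  for (unsigned q = 0; q < 16; q++) {
    __half_raw raw;
    raw.x = static_cast<unsigned short>(0x6400u | q);  // fp16 bits of 1024.0 with q in the mantissa
    float v = __half2float(__half(raw));               // evaluates to 1024 + q
    std::printf("q=%2u -> %6.1f (expected %6.1f)\n", q, v, 1024.0f + q);
  }
  return 0;
}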
-__device__ inline void barrier_release(int *lock, bool reset = false) { +__device__ inline void barrier_release(int* lock, bool reset = false) { __syncthreads(); if (threadIdx.x == 0) { if (reset) { @@ -187,26 +197,27 @@ __device__ inline void barrier_release(int *lock, bool reset = false) { } } -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > -__global__ void -Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 - *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -241,11 +252,11 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int slice_row = (iters * blockIdx.x) % k_tiles; int slice_col_par = (iters * blockIdx.x) / k_tiles; int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice + int slice_iters; // number of threadblock tiles in the current slice int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers @@ -261,27 +272,22 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk auto init_slice = [&]() { slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) - slice_iters = 0; - if (slice_iters == 0) - return; - if (slice_row + slice_iters > k_tiles) - slice_iters = k_tiles - slice_row; + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; slice_count = 1; slice_idx = 0; int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); if (col_first <= k_tiles * (slice_col_par + 1)) { int col_off = col_first - k_tiles * slice_col_par; slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) - slice_count++; + if (col_off > 0) slice_count++; int delta_first = iters * blockIdx.x - col_first; if (delta_first < 0 || (col_off == 0 && delta_first == 0)) slice_idx = slice_count - 1; else { 
slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) - slice_idx--; + if (col_off > 0) slice_idx--; } } if (slice_col == n_tiles) { @@ -293,29 +299,30 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk }; init_slice(); - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory // We typically use `constexpr` to indicate that this value is a compile-time // constant constexpr int a_sh_stride = - 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory + 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / - 8; // delta between subsequent A tiles in global memory + 8; // delta between subsequent A tiles in global memory int a_gl_rd_delta_i = a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile + (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile constexpr int a_sh_wr_delta = - a_sh_stride * (threads / a_gl_rd_delta_o); // between shared memory writes + a_sh_stride * + (threads / a_gl_rd_delta_o); // between shared memory writes constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads + (thread_n_blocks / 4)); // between shared memory tile reads constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile + a_sh_stride * 16; // within a shared memory tile constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile + a_sh_stride * (16 * thread_m_blocks); // overall size of a tile constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile + a_sh_wr_delta); // number of shared write iterations for a tile int b_gl_stride = 16 * prob_n / 32; constexpr int b_sh_stride = 32 * thread_n_blocks / 4; @@ -368,7 +375,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // needed if there are more threads than required for a certain tilesize or // when the batchsize is not a multiple of 16. bool a_sh_wr_pred[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; bool s_sh_wr_pred = threadIdx.x < s_sh_stride; @@ -387,13 +394,13 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // loop unrolls, all shared memory accesses are static, we simply precompute // both transformed reads and writes. int a_sh_wr_trans[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < thread_m_blocks; j++) a_sh_rd_trans[i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); @@ -403,16 +410,16 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // runtime; we break dependencies between subsequent accesses with a tile by // maintining multiple pointers (we have enough registers), a tiny // optimization. 
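Before the B-pointer setup, a host-side sketch that makes the stride arithmetic above concrete: every access is counted in int4 units, and one int4 holds 8 fp16 values, hence the recurring /8 factors. The formulas are the ones computed in the kernel; the function name and the example configuration are illustrative.

#include <cstdio>

constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }

// A-matrix strides in int4 (8 x fp16) units, mirroring the kernel above.
void print_a_strides(int prob_k, int thread_k_blocks, int thread_m_blocks,
                     int threads) {
  int a_gl_stride = prob_k / 8;                    // one A row in global memory
  int a_sh_stride = 16 * thread_k_blocks / 8;      // one row of an A tile in shared memory
  int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;  // between consecutive A tiles in global memory
  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
  int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
  int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);   // one full pipeline stage
  int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);  // shared writes per tile
  std::printf("a_gl_stride=%d a_sh_stride=%d a_sh_stage=%d a_sh_wr_iters=%d\n",
              a_gl_stride, a_sh_stride, a_sh_stage, a_sh_wr_iters);
  std::printf("a_gl_rd_delta_o=%d a_gl_rd_delta_i=%d a_sh_wr_delta=%d\n",
              a_gl_rd_delta_o, a_gl_rd_delta_i, a_sh_wr_delta);
}

int main() {
  // the default dense config {thread_k=128, thread_n=128, threads=256},
  // i.e. 8 k-blocks of 16, with 4 m-blocks and an arbitrary prob_k of 4096
  print_a_strides(4096, 8, 4, 256);
  return 0;
}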
- const int4 *B_ptr[b_sh_wr_iters]; -#pragma unroll + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; extern __shared__ int4 sh[]; // Shared memory storage for global fetch pipelines. - int4 *sh_a = sh; - int4 *sh_b = sh_a + (stages * a_sh_stage); - int4 *sh_s = sh_b + (stages * b_sh_stage); + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_s = sh_b + (stages * b_sh_stage); // Register storage for double buffer of shared memory reads. FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2]; @@ -421,34 +428,33 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Zero accumulators. auto zero_accums = [&]() { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; + reinterpret_cast(frag_c)[i] = 0; }; // Asynchronously fetch the next A, B and s tile from global to the next // shared memory pipeline location. auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { if (pred) { - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) { cp_async4_pred( &sh_a_stage[a_sh_wr_trans[i]], &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], a_sh_wr_pred[i]); } - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; -#pragma unroll + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); B_ptr[i] += b_gl_rd_delta_o; } // Only fetch scales if this tile starts a new group if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { - int4 *sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) - cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); s_gl_rd += s_gl_rd_delta; } } @@ -475,37 +481,35 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // theoretically better attempts have lead to bad instruction ordering by // the compiler and correspondingly a noticeable drop in performance. if (group_blocks != -1) { - int4 *sh_s_stage = + int4* sh_s_stage = sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; } - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); }; // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { -// We have the m dimension as the inner loop in order to encourage overlapping -// dequantization and matmul operations. -#pragma unroll + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. 
+ #pragma unroll for (int j = 0; j < 4; j++) { int b_quant = frag_b_quant[k % 2][j]; int b_quant_shift = b_quant >> 8; FragB frag_b0 = dequant(b_quant); // If there are no groups, we can just scale the final output once and can // avoid doing so for each weight. - if (group_blocks != -1) - scale(frag_b0, frag_s[k % 2][j], 0); + if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); FragB frag_b1 = dequant(b_quant_shift); - if (group_blocks != -1) - scale(frag_b1, frag_s[k % 2][j], 1); -#pragma unroll + if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); @@ -530,38 +534,38 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // unnecessary read or write iterations, e.g., for two warps we write only // once by warp 1 and read only once by warp 0. -#pragma unroll + #pragma unroll for (int m_block = 0; m_block < thread_m_blocks; m_block++) { -#pragma unroll + #pragma unroll for (int i = red_off; i > 0; i /= 2) { if (i <= red_idx && red_idx < 2 * i) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4 * 2; j++) { int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); if (i < red_off) { - float *c_rd = reinterpret_cast( - &sh[red_sh_delta * j + red_sh_rd]); - float *c_wr = reinterpret_cast(&sh[red_sh_wr]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; } sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; } } __syncthreads(); } if (red_idx == 0) { -#pragma unroll + #pragma unroll for (int i = 0; i < 4 * 2; i++) { - float *c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += c_rd[j]; } } @@ -571,9 +575,9 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk }; // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped partitioning - // minimizes the number of such reductions and our outputs are usually rather - // small, we perform this reduction serially in L2 cache. + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. auto global_reduce = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. To do this, we write out @@ -592,39 +596,39 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk int row = (threadIdx.x % 32) / 4; if (!first) { -// Interestingly, doing direct global accesses here really seems to mess up the -// compiler and lead to slowdowns, hence we also use async-copies even though -// these fetches are not actually asynchronous. 
-#pragma unroll + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || - 8 * (i / 2) + row < prob_m); + cp_async4_pred( + &sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); } cp_async_fence(); cp_async_wait<0>(); } -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { if (!first) { int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; -#pragma unroll + #pragma unroll for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( + reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half *>(&c_red)[j]); + __half2float(reinterpret_cast<__half*>(&c_red)[j]); } } if (!last) { int4 c; -#pragma unroll + #pragma unroll for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half *>(&c)[j] = - __float2half(reinterpret_cast( + reinterpret_cast<__half*>(&c)[j] = + __float2half(reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); } C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = @@ -658,17 +662,17 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // We first reorder in shared memory to guarantee the most efficient final // global write patterns - auto write = [&](int idx, float c0, float c1, FragS &s) { + auto write = [&](int idx, float c0, float c1, FragS& s) { half2 res = __halves2half2(__float2half(c0), __float2half(c1)); if (group_blocks == - -1) // for per-column quantization we finally apply the scale here + -1) // for per-column quantization we finally apply the scale here res = __hmul2(res, s[0]); - ((half2 *)sh)[idx] = res; + ((half2*)sh)[idx] = res; }; if (threadIdx.x / 32 < thread_n_blocks / 4) { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4; j++) { int wr = c_sh_wr + 8 * j; write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], @@ -685,7 +689,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk } __syncthreads(); -#pragma unroll + #pragma unroll for (int i = 0; i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { @@ -699,9 +703,8 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Start global fetch and register load pipelines. auto start_pipes = [&]() { -#pragma unroll - for (int i = 0; i < stages - 1; i++) - fetch_to_shared(i, i, i < slice_iters); + #pragma unroll + for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); zero_accums(); wait_for_stage(); fetch_to_registers(0, 0); @@ -711,12 +714,12 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // Main loop. while (slice_iters) { -// We unroll over both the global fetch and the register load pipeline to ensure -// all shared memory accesses are static. Note that both pipelines have even -// length meaning that the next iteration will always start at index 0. 
-#pragma unroll + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines have + // even length meaning that the next iteration will always start at index 0. + #pragma unroll for (int pipe = 0; pipe < stages;) { -#pragma unroll + #pragma unroll for (int k = 0; k < b_sh_wr_iters; k++) { fetch_to_registers(k + 1, pipe % stages); if (k == b_sh_wr_iters - 2) { @@ -728,8 +731,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk matmul(k); } slice_iters--; - if (slice_iters == 0) - break; + if (slice_iters == 0) break; } a_gl_rd += a_gl_rd_delta_o * stages; @@ -742,8 +744,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // For per-column scales, we only fetch them here in the final step before // write-out if (group_blocks == -1 && last) { - if (s_sh_wr_pred) - cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); + if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); cp_async_fence(); } thread_block_reduce(); @@ -751,17 +752,17 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; } } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice barrier_acquire(&locks[slice_col], slice_idx); global_reduce(slice_idx == 0, last); barrier_release(&locks[slice_col], last); } - if (last) // only the last block in a slice actually writes the result + if (last) // only the last block in a slice actually writes the result write_result(); slice_row = 0; slice_col_par++; @@ -770,13 +771,12 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk if (slice_iters) { a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; if (slice_col == 0) { -#pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] -= b_gl_stride; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; } s_gl_rd = s_sh_stride * slice_col + threadIdx.x; start_pipes(); @@ -787,26 +787,27 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk #else -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > -__global__ void -Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 - *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight 
matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) { // Marlin is not implemented yet for SM < 8.0 assert(false); @@ -819,10 +820,10 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk // than 1 warp per schedule allows some more latency hiding. At the same time, // we want relatively few warps to have many registers per warp and small tiles. const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory + 256; // Note: This is only used with user-provided thread_k/n +const int STAGES = 4; // 4 pipeline stages fit into shared memory const int SHARED_MEM = - 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) + 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) static constexpr int min_thread_n = 64; static constexpr int min_thread_k = 64; @@ -831,7 +832,7 @@ static constexpr int tile_size = 16; static constexpr int max_par = 16; static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit + 8; // We have 8 4-bit vals inside a 32 bit #define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ GROUP_BLOCKS, NUM_THREADS) \ @@ -858,23 +859,23 @@ thread_config_t small_batch_thread_configs[] = { // Ordered by priority // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N + {128, 128, 256}, // Default + {128, 64, 128}, // Reduce N 2X, same K + {64, 256, 256}, // Reduce K 2X, increase N 2X + {64, 128, 128}, // Reduce K 2X, same N }; thread_config_t large_batch_thread_configs[] = { // Ordered by priority // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X + {64, 256, 256}, // Default + {128, 128, 256}, // Reduce N 2X, increase K 2X + {64, 128, 128}, // Reduce N 2X, same K + {128, 64, 128}, // Reduce N 4X, increase K 2X }; -bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n, +bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, int prob_k) { // Sanity if (th_config.thread_k == -1 || th_config.thread_n == -1 || @@ -907,7 +908,6 @@ bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n, } thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { for (auto th_config : small_batch_thread_configs) { if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { @@ -926,20 +926,20 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { return thread_config_t{-1, -1, -1}; } -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, 
K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ +#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) -void marlin_cuda(const void *A, const void *B, void *C, void *s, int prob_m, - int prob_n, int prob_k, void *workspace, int groupsize = -1, +void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m, + int prob_n, int prob_k, void* workspace, int groupsize = -1, int dev = 0, cudaStream_t stream = 0, int thread_k = -1, int thread_n = -1, int sms = -1, int max_par = 16) { int tot_m = prob_m; @@ -996,12 +996,12 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, int prob_m, " is not divisible by group_blocks = ", group_blocks); } - const int4 *A_ptr = (const int4 *)A; - const int4 *B_ptr = (const int4 *)B; - int4 *C_ptr = (int4 *)C; - const int4 *s_ptr = (const int4 *)s; + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; - int *locks = (int *)workspace; + int* locks = (int*)workspace; for (int i = 0; i < tot_m_blocks; i += 4) { int thread_m_blocks = tot_m_blocks - i; @@ -1011,8 +1011,7 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, int prob_m, // Note that parallel > 1 currently only works for inputs without any // padding par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) - par = max_par; + if (par > max_par) par = max_par; prob_m = 64 * par; i += 4 * (par - 1); thread_m_blocks = 4; @@ -1041,12 +1040,11 @@ void marlin_cuda(const void *A, const void *B, void *C, void *s, int prob_m, } } -} // namespace marlin +} // namespace marlin -torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, - torch::Tensor &b_scales, torch::Tensor &workspace, +torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k) { - // Verify M TORCH_CHECK(size_m == a.size(0), "Shape mismatch: a.size(0) = " + str(a.size(0)) + @@ -1074,9 +1072,9 @@ torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, int actual_size_n = (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit; - TORCH_CHECK(size_n == actual_size_n, - "size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); + TORCH_CHECK( + size_n == actual_size_n, + "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); // Verify A device and strides TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); diff --git a/csrc/quantization/marlin/sparse/common/base.h b/csrc/quantization/marlin/sparse/common/base.h index 929b39d7642f1..16018d331bec2 100644 --- a/csrc/quantization/marlin/sparse/common/base.h +++ b/csrc/quantization/marlin/sparse/common/base.h @@ -26,12 +26,14 @@ constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } // corresponding index accesses must be compile-time constants, which is why we // extensively use `#pragma unroll` throughout the kernel code to 
guarantee // this. -template struct Vec { +template +struct Vec { T elems[n]; - __device__ T &operator[](int i) { return elems[i]; } + __device__ T& operator[](int i) { return elems[i]; } }; -template struct ShapeBase { +template +struct ShapeBase { static constexpr int M = M_, N = N_, K = K_; }; @@ -44,6 +46,6 @@ using FragA = Vec; using FragB = Vec; using FragM = Vec; using FragC = Vec; -using FragS = Vec; // quantization scales +using FragS = Vec; // quantization scales -} // namespace marlin_24 +} // namespace marlin_24 diff --git a/csrc/quantization/marlin/sparse/common/mem.h b/csrc/quantization/marlin/sparse/common/mem.h index a49d15ca544eb..83e3578d2f511 100644 --- a/csrc/quantization/marlin/sparse/common/mem.h +++ b/csrc/quantization/marlin/sparse/common/mem.h @@ -21,41 +21,44 @@ namespace marlin_24 { // Predicated asynchronous global->shared copy; used for inputs A where we apply // predication to handle batchsizes that are not multiples of 16. -__device__ inline void cp_async4_pred_zfill(void *smem_ptr, - const void *glob_ptr, +__device__ inline void cp_async4_pred_zfill(void* smem_ptr, + const void* glob_ptr, bool pred = true, const bool zfill = false) { const int BYTES = 16; int src_in_bytes = (zfill ? 0 : BYTES); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes)); } -__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { const int BYTES = 16; uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); } // Asynchronous global->shared copy -__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) { +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { const int BYTES = 16; uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); } // Async copy fence. @@ -64,22 +67,23 @@ __device__ inline void cp_async_fence() { } // Wait until at most `n` async copy stages are still pending. -template __device__ inline void cp_async_wait() { +template +__device__ inline void cp_async_wait() { asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); } // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. 
-__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { - uint32_t *a = reinterpret_cast(&frag_a); +__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) : "r"(smem)); } -__device__ inline void ldsm4_m(FragM &frag_m, const void *smem_ptr) { - uint32_t *a = reinterpret_cast(&frag_m); +__device__ inline void ldsm4_m(FragM& frag_m, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_m); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(a[0]), "=r"(a[1]) @@ -88,8 +92,8 @@ __device__ inline void ldsm4_m(FragM &frag_m, const void *smem_ptr) { // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. -__device__ inline void ldsm4_t(FragA &frag_a, const void *smem_ptr) { - uint32_t *a = reinterpret_cast(&frag_a); +__device__ inline void ldsm4_t(FragA& frag_a, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); asm volatile( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n" @@ -98,7 +102,7 @@ __device__ inline void ldsm4_t(FragA &frag_a, const void *smem_ptr) { } // Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int *lock, int count) { +__device__ inline void barrier_acquire(int* lock, int count) { if (threadIdx.x == 0) { int state = -1; do @@ -113,7 +117,7 @@ __device__ inline void barrier_acquire(int *lock, int count) { } // Release barrier and increment visitation count. -__device__ inline void barrier_release(int *lock, bool reset = false) { +__device__ inline void barrier_release(int* lock, bool reset = false) { __syncthreads(); if (threadIdx.x == 0) { if (reset) { @@ -129,4 +133,4 @@ __device__ inline void barrier_release(int *lock, bool reset = false) { : "l"(lock), "r"(val)); } } -} // namespace marlin_24 +} // namespace marlin_24 diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 9319456677d36..45ab67a78a1de 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -22,51 +22,56 @@ namespace marlin_24 { // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32 // output/accumulation. 
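Before the sparse mma wrapper itself, a small host-side illustration of the 2:4 format its metadata operand describes: out of every four consecutive values only two are stored, together with two 2-bit indices recording which positions were kept. This shows the compression idea only; it is not the exact metadata bit layout Marlin_24 consumes.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Compress one group of 4 values into the 2 largest-magnitude values plus a
// 4-bit metadata nibble holding their two 2-bit position indices.
void compress_2_of_4(const float in[4], float kept[2], uint8_t& meta) {
  int idx[2] = {0, 1};
  for (int i = 2; i < 4; i++) {
    int weakest = (std::fabs(in[idx[0]]) < std::fabs(in[idx[1]])) ? 0 : 1;
    if (std::fabs(in[i]) > std::fabs(in[idx[weakest]])) idx[weakest] = i;
  }
  if (idx[0] > idx[1]) { int t = idx[0]; idx[0] = idx[1]; idx[1] = t; }
  kept[0] = in[idx[0]];
  kept[1] = in[idx[1]];
  meta = static_cast<uint8_t>(idx[0] | (idx[1] << 2));
}

int main() {
  float group[4] = {0.0f, -1.5f, 0.25f, 2.0f};
  float kept[2];
  uint8_t meta;
  compress_2_of_4(group, kept, meta);
  std::printf("kept %.2f %.2f, meta nibble 0x%x\n", kept[0], kept[1],
              unsigned(meta));  // kept -1.50 2.00, meta nibble 0xd
  return 0;
}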
-__device__ inline void mma_sp(const FragB &a_frag0, const FragB &a_frag1, - const FragA &frag_b, FragC &frag_c, FragM &frag_m, +__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, + const FragA& frag_b, FragC& frag_c, FragM& frag_m, const int psel) { - const uint32_t *a0 = reinterpret_cast(&a_frag0); - const uint32_t *a1 = reinterpret_cast(&a_frag1); - const uint32_t *b = reinterpret_cast(&frag_b); - const uint32_t *e = reinterpret_cast(&frag_m); - float *c = reinterpret_cast(&frag_c); + const uint32_t* a0 = reinterpret_cast(&a_frag0); + const uint32_t* a1 = reinterpret_cast(&a_frag1); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* e = reinterpret_cast(&frag_m); + float* c = reinterpret_cast(&frag_c); if (psel == 0) { - asm volatile("mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), - "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), - "f"(c[2]), "f"(c[3]), "r"(e[0])); - asm volatile("mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), - "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), - "f"(c[6]), "f"(c[7]), "r"(e[0])); + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), + "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), + "r"(e[0])); + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), + "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), + "r"(e[0])); } else { - asm volatile("mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), - "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), - "f"(c[2]), "f"(c[3]), "r"(e[0])); - asm volatile("mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " - "{%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) - : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), - "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), - "f"(c[6]), "f"(c[7]), "r"(e[0])); + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + "{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), + "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), + "r"(e[0])); + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " + 
"{%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) + : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), + "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]), + "r"(e[0])); } } // Lookup-table based 3-input logical operation; explicitly used for // dequantization as the compiler does not seem to automatically recognize it in // all cases. -template __device__ inline int lop3(int a, int b, int c) { +template +__device__ inline int lop3(int a, int b, int c) { int res; asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) @@ -120,11 +125,11 @@ __device__ inline FragB dequant_4bit(int q) { const int ADD = 0xd480d480; FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); return frag_b; } @@ -143,24 +148,24 @@ __device__ inline FragB dequant_8bit(int q) { static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); - frag_b[1] = __hsub2(*reinterpret_cast(&hi), - *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); return frag_b; } // Multiply dequantized values by the corresponding quantization scale; used // only for grouped quantization. -__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]); +__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); frag_b[0] = __hmul2(frag_b[0], s); frag_b[1] = __hmul2(frag_b[1], s); } -__device__ inline void scale_floats(float *c0, float *c1, float *c2, float *c3, - FragS &s0, float *c4, float *c5, float *c6, - float *c7, FragS &s1) { +__device__ inline void scale_floats(float* c0, float* c1, float* c2, float* c3, + FragS& s0, float* c4, float* c5, float* c6, + float* c7, FragS& s1) { *c0 = __fmul_rn(*c0, __half2float(s0[0].x)); *c1 = __fmul_rn(*c1, __half2float(s0[0].y)); *c2 = __fmul_rn(*c2, __half2float(s0[1].x)); @@ -172,4 +177,4 @@ __device__ inline void scale_floats(float *c0, float *c1, float *c2, float *c3, *c7 = __fmul_rn(*c7, __half2float(s1[1].y)); } -} // namespace marlin_24 +} // namespace marlin_24 diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 42b0566183a8d..54ad27676e207 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -32,12 +32,15 @@ #else -#include "common/mem.h" -#include "common/mma.h" + #include "common/mem.h" + #include "common/mma.h" #endif -template inline std::string str(T x) { return std::to_string(x); } +template +inline std::string str(T x) { + return std::to_string(x); +} namespace marlin_24 { @@ -45,7 +48,7 @@ namespace marlin_24 { // than 1 warp per schedule allows some more latency hiding. At the same time, // we want relatively few warps to have many registers per warp and small tiles. 
static constexpr int THREADS = 256; -static constexpr int STAGES = 4; // 4 pipeline stages fit into shared memory +static constexpr int STAGES = 4; // 4 pipeline stages fit into shared memory static constexpr int min_thread_n = 128; @@ -54,35 +57,36 @@ static constexpr int max_par = 16; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > __global__ void Marlin_24( - const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - const int4 - *__restrict__ meta, // 2bit metadata information about 2:4 format on B - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 - *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + const int4* __restrict__ meta, // 2bit metadata information about 2:4 + // format on B + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) {} -torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, - torch::Tensor &b_meta, - torch::Tensor &b_scales, - torch::Tensor &workspace, int64_t num_bits, +torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_meta, + torch::Tensor& b_scales, + torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k) { TORCH_CHECK_NOT_IMPLEMENTED( @@ -92,29 +96,30 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, #else -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks with - // a separate quantization scale +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale > __global__ void Marlin_24( - const int4 *__restrict__ A, // fp16 input matrix of shape mxk - const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn - const int4 - *__restrict__ meta, // 2bit metadata information about 2:4 format on B - int4 *__restrict__ C, // fp16 output buffer of shape mxn - const int4 - *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int *locks // extra global storage for barrier synchronization + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + const int4* __restrict__ meta, // 2bit metadata information about 2:4 + // format on B + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ s, // fp16 quantization scales of shape + // (k/groupsize)xn + int prob_m, // 
batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -174,27 +179,22 @@ __global__ void Marlin_24( auto init_slice = [&]() { slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) - slice_iters = 0; - if (slice_iters == 0) - return; - if (slice_row + slice_iters > k_tiles) - slice_iters = k_tiles - slice_row; + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; slice_count = 1; slice_idx = 0; int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); if (col_first <= k_tiles * (slice_col_par + 1)) { int col_off = col_first - k_tiles * slice_col_par; slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) - slice_count++; + if (col_off > 0) slice_count++; int delta_first = iters * blockIdx.x - col_first; if (delta_first < 0 || (col_off == 0 && delta_first == 0)) slice_idx = slice_count - 1; else { slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) - slice_idx--; + if (col_off > 0) slice_idx--; } } if (slice_col == n_tiles) { @@ -207,7 +207,7 @@ __global__ void Marlin_24( init_slice(); // RLC: 8 is vec_size -> 128-bit instructions, 8 fp16 elements - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory // stride of an A matrix tile in shared memory constexpr int a_sh_stride = 32 * thread_k_blocks / 8; @@ -239,9 +239,9 @@ __global__ void Marlin_24( constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - int m_gl_stride = 2 * prob_n / 8; // (16*2*4 / 8) = 16 + int m_gl_stride = 2 * prob_n / 8; // (16*2*4 / 8) = 16 constexpr int m_sh_stride = - (16 * thread_n_blocks) / 4; // #warps n-dim * threads/warp + (16 * thread_n_blocks) / 4; // #warps n-dim * threads/warp int m_gl_rd_delta_o = m_gl_stride * thread_k_blocks; int m_gl_rd_delta_i = m_gl_stride * (threads / m_sh_stride); constexpr int m_sh_wr_delta = threads / 2; @@ -305,7 +305,7 @@ __global__ void Marlin_24( // needed if there are more threads than required for a certain tilesize or // when the batchsize is not a multiple of 16. bool a_sh_wr_pred[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) { a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; } @@ -325,13 +325,13 @@ __global__ void Marlin_24( // loop unrolls, all shared memory accesses are static, we simply precompute // both transformed reads and writes. 
int a_sh_wr_trans[a_sh_wr_iters]; -#pragma unroll + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); int a_sh_rd_trans[2][b_sh_wr_iters][thread_m_blocks]; -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < thread_m_blocks; j++) { a_sh_rd_trans[0][i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); @@ -344,23 +344,23 @@ __global__ void Marlin_24( // runtime; we break dependencies between subsequent accesses with a tile by // maintining multiple pointers (we have enough registers), a tiny // optimization. - const int4 *B_ptr[b_sh_wr_iters]; -#pragma unroll + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; bool m_sh_wr_pred = threadIdx.x < m_sh_wr_delta; - const int4 *meta_ptr[m_sh_iters]; -#pragma unroll + const int4* meta_ptr[m_sh_iters]; + #pragma unroll for (int i = 0; i < m_sh_iters; i++) meta_ptr[i] = meta + m_gl_rd_delta_i * i + m_gl_rd; extern __shared__ int4 sh[]; // Shared memory storage for global fetch pipelines. - int4 *sh_a = sh; - int4 *sh_b = sh_a + (stages * a_sh_stage); - int4 *sh_s = sh_b + (stages * b_sh_stage); - int4 *sh_m = sh_s + (stages * s_sh_stage); + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_s = sh_b + (stages * b_sh_stage); + int4* sh_m = sh_s + (stages * s_sh_stage); // Register storage for double buffer of shared memory reads. FragA frag_a[2][thread_m_blocks][2]; I4 frag_b_quant[2][b_thread_vecs]; @@ -370,46 +370,43 @@ __global__ void Marlin_24( // Zero accumulators. auto zero_accums = [&]() { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; + reinterpret_cast(frag_c)[i] = 0; }; // Asynchronously fetch the next A, B and s tile from global to the next // shared memory pipeline location. 
auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { if (pred) { - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < a_sh_wr_iters; i++) { cp_async4_pred( &sh_a_stage[a_sh_wr_trans[i]], &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], a_sh_wr_pred[i]); } - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; -#pragma unroll + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) { -#pragma unroll + #pragma unroll for (int j = 0; j < b_thread_vecs; j++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], - B_ptr[i] + j); + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); } B_ptr[i] += b_gl_rd_delta_o; } - int4 *sh_meta_stage = sh_m + m_sh_stage * pipe; -#pragma unroll + int4* sh_meta_stage = sh_m + m_sh_stage * pipe; + #pragma unroll for (int i = 0; i < m_sh_iters; i++) { if (m_sh_wr_pred) - cp_async4(&sh_meta_stage[m_sh_wr_delta * i + m_sh_wr], - meta_ptr[i]); + cp_async4(&sh_meta_stage[m_sh_wr_delta * i + m_sh_wr], meta_ptr[i]); meta_ptr[i] += m_gl_rd_delta_o; } // Only fetch scales if this tile starts a new group if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { - int4 *sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) - cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); s_gl_rd += s_gl_rd_delta; } } @@ -436,13 +433,13 @@ __global__ void Marlin_24( // theoretically better attempts have lead to bad instruction ordering by // the compiler and correspondingly a noticeable drop in performance. if (group_blocks != -1) { - int4 *sh_s_stage = + int4* sh_s_stage = sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; } - int4 *sh_a_stage = sh_a + a_sh_stage * pipe; -#pragma unroll + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { ldsm4(frag_a[k % 2][i][0], &sh_a_stage[a_sh_rd_trans[0][k % b_sh_wr_iters][i]]); @@ -450,24 +447,24 @@ __global__ void Marlin_24( &sh_a_stage[a_sh_rd_trans[1][k % b_sh_wr_iters][i]]); } - int4 *sh_b_stage = sh_b + b_sh_stage * pipe; -#pragma unroll + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll for (int i = 0; i < b_thread_vecs; i++) { - frag_b_quant[k % 2][i] = *reinterpret_cast( + frag_b_quant[k % 2][i] = *reinterpret_cast( &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); } // Load meta with ldsm4 - int4 *sh_m_stage = sh_m + m_sh_stage * pipe; + int4* sh_m_stage = sh_m + m_sh_stage * pipe; ldsm4_m(frag_m[k % 2][0], &sh_m_stage[m_sh_rd_delta * (k % m_sh_iters) + m_sh_rd]); }; // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { -// We have the m dimension as the inner loop in order to encourage overlapping -// dequantization and matmul operations. -#pragma unroll + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. 
+ #pragma unroll for (int j = 0; j < 4; j++) { FragB frag_b0; FragB frag_b1; @@ -480,7 +477,7 @@ __global__ void Marlin_24( frag_b1 = dequant_4bit(b_quant_shift); } else { - int *frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); int b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; int b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; @@ -497,7 +494,7 @@ __global__ void Marlin_24( scale(frag_b1, frag_s[k % 2][j], 1); } -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { mma_sp(frag_b0, frag_b1, frag_a[k % 2][i][0], frag_c[i][j][0], frag_m[k % 2][j / 2], j % 2); @@ -518,41 +515,41 @@ __global__ void Marlin_24( int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads); -// Parallel logarithmic shared memory reduction. We make sure to avoid any -// unnecessary read or write iterations, e.g., for two warps we write only once -// by warp 1 and read only once by warp 0. -#pragma unroll + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + #pragma unroll for (int m_block = 0; m_block < thread_m_blocks; m_block++) { -#pragma unroll + #pragma unroll for (int i = red_off; i > 0; i /= 2) { if (i <= red_idx && red_idx < 2 * i) { -#pragma unroll + #pragma unroll for (int j = 0; j < 4 * 2; j++) { int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); if (i < red_off) { - float *c_rd = reinterpret_cast( - &sh[red_sh_delta * j + red_sh_rd]); - float *c_wr = reinterpret_cast(&sh[red_sh_wr]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; } sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; } } __syncthreads(); } if (red_idx == 0) { -#pragma unroll + #pragma unroll for (int i = 0; i < 4 * 2; i++) { - float *c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); -#pragma unroll + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += c_rd[j]; } } @@ -562,9 +559,9 @@ __global__ void Marlin_24( }; // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped partitioning - // minimizes the number of such reductions and our outputs are usually rather - // small, we perform this reduction serially in L2 cache. + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. auto global_reduce = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. 
To do this, we write out @@ -574,7 +571,7 @@ __global__ void Marlin_24( int c_gl_stride = prob_n / 8; int c_gl_wr_delta_o = 2 * 4 * c_gl_stride; int c_gl_wr_delta_i = - c_gl_stride; // 8 threads (e.g., 0,4,8,12,16,20,24,28) + c_gl_stride; // 8 threads (e.g., 0,4,8,12,16,20,24,28) int c_gl_wr = 2 * c_gl_stride * (threadIdx.x % 4) + 8 * (threadIdx.x / 32) + (threadIdx.x % 32) / 4; c_gl_wr += (2 * thread_n_blocks) * slice_col; @@ -584,10 +581,10 @@ __global__ void Marlin_24( int col = 2 * ((threadIdx.x % 32) % 4); if (!first) { -// Interestingly, doing direct global accesses here really seems to mess up the -// compiler and lead to slowdowns, hence we also use async-copies even though -// these fetches are not actually asynchronous. -#pragma unroll + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + @@ -599,32 +596,32 @@ __global__ void Marlin_24( cp_async_wait<0>(); } -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + col + (i % 2) < prob_m) { if (!first) { int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; -#pragma unroll + #pragma unroll for (int j2 = 0; j2 < 2; j2++) { -#pragma unroll + #pragma unroll for (int j1 = 0; j1 < 4; j1++) { - reinterpret_cast( + reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 8 * j1 + 2 * j2 + 4 * ((i % 4) / 2) + i % 2] += __half2float( - reinterpret_cast<__half *>(&c_red)[(j2 * 4 + j1)]); + reinterpret_cast<__half*>(&c_red)[(j2 * 4 + j1)]); } } } if (!last) { int4 c; -#pragma unroll + #pragma unroll for (int j2 = 0; j2 < 2; j2++) { -#pragma unroll + #pragma unroll for (int j1 = 0; j1 < 4; j1++) { - reinterpret_cast<__half *>(&c)[(j2 * 4 + j1)] = - __float2half(reinterpret_cast( + reinterpret_cast<__half*>(&c)[(j2 * 4 + j1)] = + __float2half(reinterpret_cast( &frag_c)[4 * 2 * 4 * (i / 4) + 8 * j1 + 2 * j2 + 4 * ((i % 4) / 2) + i % 2]); } @@ -643,9 +640,9 @@ __global__ void Marlin_24( auto write_result = [&]() { int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks; // RLC: - constexpr int c_sh_stride_2 = 2 * c_sh_stride + 2; // RLC: - constexpr int c_sh_stride_3 = 2 * (2 * thread_n_blocks) + 2; // RLC: + constexpr int c_sh_stride = 2 * thread_n_blocks; // RLC: + constexpr int c_sh_stride_2 = 2 * c_sh_stride + 2; // RLC: + constexpr int c_sh_stride_3 = 2 * (2 * thread_n_blocks) + 2; // RLC: int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); @@ -654,22 +651,22 @@ __global__ void Marlin_24( c_gl_wr += (2 * thread_n_blocks) * slice_col; int c_sh_wr = c_sh_stride_2 * ((threadIdx.x % 32) % 4) + - ((threadIdx.x % 32) / 4); // RLC: - c_sh_wr += 8 * (threadIdx.x / 32); // 128/4(half4) + ((threadIdx.x % 32) / 4); // RLC: + c_sh_wr += 8 * (threadIdx.x / 32); // 128/4(half4) constexpr int c_sh_rd_delta = - c_sh_stride_3 * (threads / (2 * 2 * thread_n_blocks)); // RLC: + c_sh_stride_3 * (threads / (2 * 2 * thread_n_blocks)); // RLC: int c_sh_rd = c_sh_stride_3 * (threadIdx.x / (2 * 2 * thread_n_blocks)) + (threadIdx.x % (2 * 2 * thread_n_blocks)); int c_gl_wr_end = c_gl_stride * prob_m; - auto write = [&](int idx, float c0, float c1, float c2, float c3, FragS &s0, - float c4, float c5, float c6, float c7, FragS &s1) { + auto write = [&](int idx, 
float c0, float c1, float c2, float c3, FragS& s0, + float c4, float c5, float c6, float c7, FragS& s1) { uint2 res[2]; res[0] = to_half4(c0, c1, c2, c3); res[1] = to_half4(c4, c5, c6, c7); - half2 *tmp = (half2 *)&res; + half2* tmp = (half2*)&res; // for per-column quantization we finally apply the scale here if constexpr (group_blocks == -1 && num_bits == 4) { tmp[0] = __hmul2(tmp[0], s0[0]); @@ -677,12 +674,12 @@ __global__ void Marlin_24( tmp[2] = __hmul2(tmp[2], s1[0]); tmp[3] = __hmul2(tmp[3], s1[1]); } - ((int4 *)sh)[idx] = *((int4 *)&res[0]); + ((int4*)sh)[idx] = *((int4*)&res[0]); }; // RLC: only warp 0 and 1 baseline example if (threadIdx.x / 32 < thread_n_blocks / 4) { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { int wr = c_sh_wr; write(wr, frag_c[i][0][0][0], frag_c[i][1][0][0], frag_c[i][2][0][0], @@ -707,7 +704,7 @@ __global__ void Marlin_24( } __syncthreads(); -#pragma unroll + #pragma unroll for (int i = 0; i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { @@ -721,9 +718,8 @@ __global__ void Marlin_24( // Start global fetch and register load pipelines. auto start_pipes = [&]() { -#pragma unroll - for (int i = 0; i < stages - 1; i++) - fetch_to_shared(i, i, i < slice_iters); + #pragma unroll + for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); zero_accums(); wait_for_stage(); fetch_to_registers(0, 0); @@ -733,10 +729,10 @@ __global__ void Marlin_24( // Main loop. while (slice_iters) { -// We unroll over both the global fetch and the register load pipeline to ensure -// all shared memory accesses are static. Note that both pipelines have even -// length meaning that the next iteration will always start at index 0. -#pragma unroll + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines have + // even length meaning that the next iteration will always start at index 0. 
+ #pragma unroll for (int pipe = 0; pipe < stages;) { fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages); @@ -747,8 +743,7 @@ __global__ void Marlin_24( pipe++; slice_iters--; - if (slice_iters == 0) - break; + if (slice_iters == 0) break; } a_gl_rd += a_gl_rd_delta_o * stages; @@ -762,13 +757,11 @@ __global__ void Marlin_24( // write-out if constexpr (group_blocks == -1) { if constexpr (num_bits == 8) { - if (s_sh_wr_pred) - cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); + if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); cp_async_fence(); } else { if (last) { - if (s_sh_wr_pred) - cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); + if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); cp_async_fence(); } } @@ -780,14 +773,14 @@ __global__ void Marlin_24( cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { - *(float4 *)(frag_s) = *(float4 *)(&sh_s[s_sh_rd]); + *(float4*)(frag_s) = *(float4*)(&sh_s[s_sh_rd]); } } else { if (last) { cp_async_wait<0>(); __syncthreads(); if (threadIdx.x / 32 < thread_n_blocks / 4) { - *(float4 *)(frag_s) = *(float4 *)(&sh_s[s_sh_rd]); + *(float4*)(frag_s) = *(float4*)(&sh_s[s_sh_rd]); } } } @@ -798,7 +791,7 @@ __global__ void Marlin_24( // overflow in fp16) if constexpr (group_blocks == -1 && num_bits == 8) { if (threadIdx.x / 32 < thread_n_blocks / 4) { -#pragma unroll + #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { scale_floats(&frag_c[i][0][0][0], &frag_c[i][1][0][0], &frag_c[i][2][0][0], &frag_c[i][3][0][0], frag_s[0][0], @@ -827,13 +820,13 @@ __global__ void Marlin_24( } } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice barrier_acquire(&locks[slice_col], slice_idx); global_reduce(slice_idx == 0, last); barrier_release(&locks[slice_col], last); } - if (last) // only the last block in a slice actually writes the result + if (last) // only the last block in a slice actually writes the result write_result(); slice_row = 0; @@ -843,19 +836,17 @@ __global__ void Marlin_24( if (slice_iters) { a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); -#pragma unroll + #pragma unroll for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; -#pragma unroll + #pragma unroll for (int i = 0; i < m_sh_iters; i++) meta_ptr[i] += (m_sh_stride)-m_gl_rd_delta_o * k_tiles; if (slice_col == 0) { -#pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] -= b_gl_stride; -#pragma unroll - for (int i = 0; i < m_sh_iters; i++) - meta_ptr[i] -= m_gl_stride; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + #pragma unroll + for (int i = 0; i < m_sh_iters; i++) meta_ptr[i] -= m_gl_stride; } s_gl_rd = s_sh_stride * slice_col + threadIdx.x; start_pipes(); @@ -866,26 +857,26 @@ __global__ void Marlin_24( #endif -#define CALL_IF_2_4(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \ - THREAD_K_BLOCKS, GROUP_BLOCKS) \ - else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS) { \ - cudaFuncSetAttribute( \ - Marlin_24, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - Marlin_24 \ - <<>>(A_ptr, B_ptr, meta_ptr, \ - C_ptr, s_ptr, prob_n, \ - prob_m, prob_k, locks); \ +#define CALL_IF_2_4(NUM_BITS, THREAD_M_BLOCKS, 
THREAD_N_BLOCKS, \ + THREAD_K_BLOCKS, GROUP_BLOCKS) \ + else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + group_blocks == GROUP_BLOCKS) { \ + cudaFuncSetAttribute( \ + Marlin_24, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin_24 \ + <<>>(A_ptr, B_ptr, meta_ptr, \ + C_ptr, s_ptr, prob_n, \ + prob_m, prob_k, locks); \ } -void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, - void *s, int prob_m, int prob_n, int prob_k, - void *workspace, int num_bits, int groupsize = -1, +void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, + void* s, int prob_m, int prob_n, int prob_k, + void* workspace, int num_bits, int groupsize = -1, int dev = 0, cudaStream_t stream = 0, int thread_k = -1, int thread_m = -1, int sms = -1, int max_par = 16) { int tot_n = prob_n; @@ -904,8 +895,8 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, if (thread_k == -1 || thread_m == -1) { if (prob_n <= 16) { - // For small batchizes, better partitioningif is slightly more important than - // better compute utilization + // For small batchizes, better partitioningif is slightly more important + // than better compute utilization thread_k = 128; thread_m = 128; } else { @@ -914,7 +905,7 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, } } - int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction + int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction int thread_m_blocks = thread_m / 16; int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; int blocks = sms; @@ -931,13 +922,13 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); - const int4 *A_ptr = (const int4 *)A; - const int4 *B_ptr = (const int4 *)B; - const int4 *meta_ptr = (const int4 *)meta; - int4 *C_ptr = (int4 *)C; - const int4 *s_ptr = (const int4 *)s; + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + const int4* meta_ptr = (const int4*)meta; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; - int *locks = (int *)workspace; + int* locks = (int*)workspace; for (int i = 0; i < tot_n_blocks; i += 4) { int thread_n_blocks = tot_n_blocks - i; prob_n = tot_n - 16 * i; @@ -946,8 +937,7 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, // Note that parallel > 1 currently only works for inputs without any // padding par = (16 * thread_n_blocks - pad) / 64; - if (par > max_par) - par = max_par; + if (par > max_par) par = max_par; prob_n = 64 * par; i += 4 * (par - 1); thread_n_blocks = 4; @@ -956,16 +946,16 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, // For compilation speed, we only define the kernel configurations that have // seemed useful (in terms of performance) in our testing, however many more // are, in principle, possible. - + // the false is start of the CALL_IF macros - if (false) { - } // BMxBNxBK, group + if (false) { + } // BMxBNxBK, group // 4-bit - CALL_IF_2_4(4, 8, 1, 4, -1) // e.g., 16x128x128 - CALL_IF_2_4(4, 8, 1, 4, 4) // e.g., 16x128x128, 64 - CALL_IF_2_4(4, 16, 1, 2, -1) // e.g., 16x256x64 - CALL_IF_2_4(4, 16, 1, 2, 4) // e.g., 16x256x64, 64 - CALL_IF_2_4(4, 16, 2, 2, -1) // e.g.. 
32x256x64 + CALL_IF_2_4(4, 8, 1, 4, -1) // e.g., 16x128x128 + CALL_IF_2_4(4, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(4, 16, 1, 2, -1) // e.g., 16x256x64 + CALL_IF_2_4(4, 16, 1, 2, 4) // e.g., 16x256x64, 64 + CALL_IF_2_4(4, 16, 2, 2, -1) // e.g.. 32x256x64 CALL_IF_2_4(4, 16, 2, 2, 4) CALL_IF_2_4(4, 16, 3, 2, -1) CALL_IF_2_4(4, 16, 3, 2, 4) @@ -973,11 +963,11 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, CALL_IF_2_4(4, 16, 4, 2, 4) // 8-bit - CALL_IF_2_4(8, 8, 1, 4, -1) // e.g., 16x128x128 - CALL_IF_2_4(8, 8, 1, 4, 4) // e.g., 16x128x128, 64 - CALL_IF_2_4(8, 16, 1, 2, -1) // e.g., 16x256x64 - CALL_IF_2_4(8, 16, 1, 2, 4) // e.g., 16x256x64, 64 - CALL_IF_2_4(8, 16, 2, 2, -1) // e.g.. 32x256x64 + CALL_IF_2_4(8, 8, 1, 4, -1) // e.g., 16x128x128 + CALL_IF_2_4(8, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(8, 16, 1, 2, -1) // e.g., 16x256x64 + CALL_IF_2_4(8, 16, 1, 2, 4) // e.g., 16x256x64, 64 + CALL_IF_2_4(8, 16, 2, 2, -1) // e.g.. 32x256x64 CALL_IF_2_4(8, 16, 2, 2, 4) CALL_IF_2_4(8, 16, 3, 2, -1) CALL_IF_2_4(8, 16, 3, 2, 4) @@ -997,12 +987,12 @@ void marlin_cuda_2_4(const void *A, const void *B, const void *meta, void *C, } } -} // namespace marlin_24 +} // namespace marlin_24 -torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, - torch::Tensor &b_meta, - torch::Tensor &b_scales, - torch::Tensor &workspace, int64_t num_bits, +torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_meta, + torch::Tensor& b_scales, + torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k) { // Verify num_bits @@ -1037,9 +1027,9 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, " is not divisible by tile_size = " + str(marlin_24::tile_size)); int actual_size_n = (b_q_weight.size(1) / marlin_24::tile_size) * pack_factor; - TORCH_CHECK(size_n == actual_size_n, - "size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); + TORCH_CHECK( + size_n == actual_size_n, + "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); // Verify meta TORCH_CHECK(b_meta.size(0) == size_k / 8 / 2 / 2, @@ -1081,7 +1071,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, ", is not divisible by b_scales.size(0) = " + str(b_scales.size(0))); groupsize = size_k / b_scales.size(0); - groupsize /= 2; // Because of 24 + groupsize /= 2; // Because of 24 } // Verify groupsize diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu index 09964903622b4..1b339fa4b392b 100644 --- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu +++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu @@ -22,27 +22,23 @@ __device__ inline unsigned int as_unsigned(int i) { // 4-bit matvec kernel (LUT-based) __global__ void NUQ4MatMulKernel( #ifndef USE_ROCM - const half2* __restrict__ vec, + const half2* __restrict__ vec, #else - const __half2* __restrict__ vec, + const __half2* __restrict__ vec, #endif - const int* __restrict__ mat, + const int* __restrict__ mat, #ifndef USE_ROCM - half2* __restrict__ mul, + half2* __restrict__ mul, #else - float2* __restrict__ mul, + float2* __restrict__ mul, #endif - const __half* __restrict__ lookup_table, - int height, - int width, - int batch, - int vec_height -) { + const __half* __restrict__ lookup_table, int height, int width, int batch, + int vec_height) { const int blockwidth2 = BLOCKWIDTH / 2; int row = BLOCKHEIGHT4 * 
blockIdx.x; - int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; + int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; #ifndef USE_ROCM __shared__ half2 blockvec[blockwidth2]; @@ -73,14 +69,16 @@ __global__ void NUQ4MatMulKernel( unsigned int tmp1; unsigned int lut_index1, lut_index2; - for (int b = 0; b < batch; ++b){ + for (int b = 0; b < batch; ++b) { i = width * row + col; res = __int2half_rd(0); k = 0; __syncthreads(); if (threadIdx.x < blockwidth2) - blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x]; + blockvec[threadIdx.x] = + vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + + threadIdx.x]; __syncthreads(); while (k < blockwidth2) { @@ -143,7 +141,8 @@ __global__ void NUQ4MatMulKernel( #ifndef USE_ROCM res = __hadd(__hadd(res2.x, res2.y), res); #else - res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), res); + res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), + res); #endif i += width; @@ -179,46 +178,38 @@ __global__ void NUQ4MatMulKernel( } } -} // namespace squeezellm -} // namespace vllm +} // namespace squeezellm +} // namespace vllm // 4-bit matvec kernel (LUT-based) -void squeezellm_gemm( - torch::Tensor vec, - torch::Tensor mat, - torch::Tensor mul, - torch::Tensor lookup_table -) { +void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, + torch::Tensor lookup_table) { int height = mat.size(0); int width = mat.size(1); int batch = vec.size(0); int vec_height = vec.size(1); - dim3 blocks( - (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, - (width + BLOCKWIDTH - 1) / BLOCKWIDTH - ); + dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, + (width + BLOCKWIDTH - 1) / BLOCKWIDTH); dim3 threads(BLOCKWIDTH); const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); vllm::squeezellm::NUQ4MatMulKernel<<>>( #ifndef USE_ROCM - (half2*) vec.data(), + (half2*)vec.data(), #else - (__half2*) vec.data_ptr(), + (__half2*)vec.data_ptr(), #endif - mat.data_ptr(), + mat.data_ptr(), #ifndef USE_ROCM - (half2*) mul.data(), - (__half*) lookup_table.data(), + (half2*)mul.data(), (__half*)lookup_table.data(), #else - (float2*) mul.data_ptr(), - (__half*) lookup_table.data_ptr(), + (float2*)mul.data_ptr(), + (__half*)lookup_table.data_ptr(), #endif - height, width, batch, vec_height - ); + height, width, batch, vec_height); } #undef BLOCKWIDTH diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index bb5171f854d55..9af4aae516151 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -1,5 +1,6 @@ /* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh * Copyright (c) 2023, The vLLM team. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
* @@ -20,12 +21,12 @@ #include "cuda_compat.h" namespace vllm { -template +template __inline__ __device__ T warpReduceSum(T val) { static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0, "numLanes is not a positive power of 2!"); static_assert(numLanes <= WARP_SIZE); - #pragma unroll +#pragma unroll for (int mask = numLanes >> 1; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; @@ -38,22 +39,23 @@ static constexpr int _nextPow2(unsigned int num) { } /* Calculate the sum of all elements in a block */ -template +template __inline__ __device__ T blockReduceSum(T val) { static_assert(maxBlockSize <= 1024); if constexpr (maxBlockSize > WARP_SIZE) { val = warpReduceSum(val); - // Calculates max number of lanes that need to participate in the last warpReduce + // Calculates max number of lanes that need to participate in the last + // warpReduce constexpr int maxActiveLanes = (maxBlockSize + WARP_SIZE - 1) / WARP_SIZE; static __shared__ T shared[maxActiveLanes]; int lane = threadIdx.x % WARP_SIZE; int wid = threadIdx.x / WARP_SIZE; - if (lane == 0) - shared[wid] = val; + if (lane == 0) shared[wid] = val; __syncthreads(); - val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] : (T)(0.0f); + val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] + : (T)(0.0f); val = warpReduceSum(val); } else { // A single warpReduce is equal to blockReduce @@ -62,4 +64,4 @@ __inline__ __device__ T blockReduceSum(T val) { return val; } -} // namespace vllm +} // namespace vllm diff --git a/format.sh b/format.sh index 2bbb00726ec80..2740f56241e0c 100755 --- a/format.sh +++ b/format.sh @@ -26,6 +26,7 @@ RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) ISORT_VERSION=$(isort --vn) +CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { @@ -40,6 +41,7 @@ tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)" YAPF_FLAGS=( '--recursive' @@ -181,7 +183,6 @@ lint_changed() { } # Run Ruff -echo 'vLLM ruff:' ### This flag lints individual files. --files *must* be the first command line ### arg to use this option. if [[ "$1" == '--files' ]]; then @@ -194,6 +195,7 @@ else # Format only the files that changed in last commit. lint_changed fi +echo 'vLLM ruff: Done' # check spelling of specified files isort_check() { @@ -235,6 +237,59 @@ else fi echo 'vLLM isort: Done' +# Clang-format section +# Exclude some files for formatting because they are vendored +# NOTE: Keep up to date with .github/workflows/clang-format.yml +CLANG_FORMAT_EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu' + 'csrc/punica/bgmv/bgmv_config.h' + 'csrc/punica/bgmv/bgmv_impl.cuh' + 'csrc/punica/bgmv/vec_dtypes.cuh' + 'csrc/punica/punica_ops.cu' + 'csrc/punica/type_convert.h' +) + +# Format specified files with clang-format +clang_format() { + clang-format -i "$@" +} + +# Format files that differ from main branch with clang-format. 
+clang_format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause clang-format to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + # Get the list of changed files, excluding the specified ones + changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}")) + if [ -n "$changed_files" ]; then + echo "$changed_files" | xargs -P 5 clang-format -i + fi +} + +# Format all files with clang-format +clang_format_all() { + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ + | xargs clang-format -i +} + +# Run clang-format +if [[ "$1" == '--files' ]]; then + clang_format "${@:2}" +elif [[ "$1" == '--all' ]]; then + clang_format_all +else + clang_format_changed +fi +echo 'vLLM clang-format: Done' + + if ! git diff --quiet &>/dev/null; then echo 'Reformatted files. Please review and stage the changes.' echo 'Changes not staged for commit:' diff --git a/requirements-dev.txt b/requirements-dev.txt index 73a413cfc633a..4329a4fd0fbe3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,6 +5,7 @@ tomli==2.0.1 ruff==0.1.5 codespell==2.2.6 isort==5.13.2 +clang-format==18.1.5 # type checking mypy==1.9.0 From 4b749741f22dddeec6aab2be543bd415427df5b9 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Wed, 22 May 2024 22:02:58 +0900 Subject: [PATCH 019/154] [misc] remove comments that were supposed to be removed (#4977) --- tests/lora/conftest.py | 1 - vllm/lora/models.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 95fc65cdd1a8f..e5cf9cd48b65d 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -185,7 +185,6 @@ def long_context_lora_files_32k(): return snapshot_download(repo_id="SangBinCho/long_context_32k_testing") -# SANG-TODO Download long lora files. 
@pytest.fixture(scope="session") def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d001d17144d98..a2092d31ea9aa 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -105,8 +105,6 @@ def convert_mapping( lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset - # SANG-TODO - # index_mapping_indices[i] = i indices_list: List[Union[List[int], torch.Tensor]] = [ index_mapping_indices, lora_indices, embedding_indices From 39c15eefe080b6d85782791c1f90afd1547bc842 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 22 May 2024 10:10:43 -0400 Subject: [PATCH 020/154] [Kernel] Fixup for CUTLASS kernels in CUDA graphs (#4954) Pass the CUDA stream into the CUTLASS GEMMs, to avoid future issues with CUDA graphs --- .../cutlass_w8a8/scaled_mm_dq_c2x.cu | 6 ++- .../cutlass_w8a8/scaled_mm_dq_c3x.cu | 5 ++- tests/kernels/test_cutlass.py | 41 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu index e62fe731a98d3..3a6b8a226e18c 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu @@ -1,6 +1,8 @@ #include #include +#include + // clang-format will break include orders // clang-format off #include "cute/tensor.hpp" @@ -189,8 +191,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, size_t workspace_size = gemm_op.get_workspace_size(args); cutlass::device_memory::allocation workspace(workspace_size); + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + CUTLASS_CHECK(gemm_op.can_implement(args)); - cutlass::Status status = gemm_op(args, workspace.get()); + cutlass::Status status = gemm_op(args, workspace.get(), stream); CUTLASS_CHECK(status); } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 12efcac7bb919..5fd6d8ff20867 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -1,5 +1,7 @@ #include +#include + #include #include #include @@ -178,7 +180,8 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, size_t workspace_size = gemm_op.get_workspace_size(args); TORCH_CHECK(workspace_size == 0); - cutlass::Status status = gemm_op.run(args); + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + cutlass::Status status = gemm_op.run(args, stream); CUTLASS_CHECK(status); } } // namespace diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index fdfd1dee29ce6..2cf0e86e5ca44 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -190,3 +190,44 @@ def test_cutlass_subset(): b.to(dtype=torch.float32)).to(dtype=torch.bfloat16) assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) + + +# Test to make sure cuda graphs work +class CutlassLayer(torch.nn.Module): + + def __init__(self, b, scale_a, scale_b, out_dtype): + super().__init__() + self.b = b + self.scale_a = scale_a + self.scale_b = scale_b + self.out_dtype = out_dtype + + def forward(self, a): + return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b, + self.out_dtype) + + +def test_cutlass_cuda_graph(): + m, n, k = 512, 512, 512 + + a = to_int8(torch.randn((m, k), device="cuda")) + b = 
to_int8(torch.randn((n, k), device="cuda").t()) + + scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10) + scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10) + + # Construct a trivial model with a single layer that calls a CUTLASS kernel + model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16) + + # Run the model with a cuda graph + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + out = model(a) + out.zero_() + g.replay() + + baseline = torch.mm(scale_a * a.to(dtype=torch.float32), + scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16) + assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) From 2835fc6b725f60782f4ef7f7fab04a3e392e1b5f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 22 May 2024 13:28:20 -0700 Subject: [PATCH 021/154] [Misc] Load FP8 kv-cache scaling factors from checkpoints (#4893) The 2nd PR for #4532. This PR supports loading FP8 kv-cache scaling factors from a FP8 checkpoint (with .kv_scale parameter). --- benchmarks/benchmark_latency.py | 14 ++-- benchmarks/benchmark_throughput.py | 12 ++- .../kernels/benchmark_paged_attention.py | 10 +-- tests/models/test_fp8.py | 80 ++++++++++++------- vllm/attention/layer.py | 27 ++++++- vllm/config.py | 8 +- vllm/engine/arg_utils.py | 7 +- .../model_executor/layers/quantization/fp8.py | 47 ++++++++++- vllm/model_executor/models/arctic.py | 3 +- vllm/model_executor/models/baichuan.py | 6 +- vllm/model_executor/models/bloom.py | 3 +- vllm/model_executor/models/chatglm.py | 13 ++- vllm/model_executor/models/commandr.py | 13 ++- vllm/model_executor/models/dbrx.py | 13 ++- vllm/model_executor/models/deepseek.py | 3 +- vllm/model_executor/models/falcon.py | 9 ++- vllm/model_executor/models/gemma.py | 3 +- vllm/model_executor/models/gpt2.py | 3 +- vllm/model_executor/models/gpt_bigcode.py | 3 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/gpt_neox.py | 3 +- vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/jais.py | 13 ++- vllm/model_executor/models/llama.py | 32 ++++---- vllm/model_executor/models/minicpm.py | 3 +- vllm/model_executor/models/mixtral.py | 29 +++++-- vllm/model_executor/models/mixtral_quant.py | 15 ++-- vllm/model_executor/models/mpt.py | 3 +- vllm/model_executor/models/olmo.py | 3 +- vllm/model_executor/models/opt.py | 3 +- vllm/model_executor/models/orion.py | 3 +- vllm/model_executor/models/phi.py | 3 +- vllm/model_executor/models/qwen.py | 3 +- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/models/qwen2_moe.py | 3 +- vllm/model_executor/models/stablelm.py | 3 +- vllm/model_executor/models/starcoder2.py | 15 ++-- vllm/model_executor/models/xverse.py | 3 +- vllm/utils.py | 2 + vllm/worker/model_runner.py | 17 ++-- 40 files changed, 284 insertions(+), 158 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f84e3453947c9..a9657f7859750 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -153,15 +153,13 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help='enforce eager mode and disable CUDA graph') parser.add_argument( - "--kv-cache-dtype", + '--kv-cache-dtype', type=str, - choices=['auto', 'fp8'], - default='auto', - help= - 'Data type for kv cache storage. If "auto", will use model data type. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' - 'than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + default="auto", + help='Data type for kv cache storage. If "auto", will use model ' + 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') parser.add_argument( '--quantization-param-path', type=str, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 41f443968c3c4..7c8cb5ee8cea2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -323,15 +323,13 @@ def main(args: argparse.Namespace): action="store_true", help="enforce eager execution") parser.add_argument( - "--kv-cache-dtype", + '--kv-cache-dtype', type=str, - choices=["auto", "fp8"], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' - 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for ' - 'common inference criteria.') + help='Data type for kv cache storage. If "auto", will use model ' + 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') parser.add_argument( '--quantization-param-path', type=str, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index ca7967c1ab0d2..fc9621e885dc4 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -183,13 +183,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument( "--kv-cache-dtype", type=str, - choices=["auto", "fp8"], + choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' - 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for ' - 'common inference criteria.') + help="Data type for kv cache storage. If 'auto', will use model " + "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " + "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)") args = parser.parse_args() print(args) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 664e951a89f2a..0a5819ea3f054 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -16,31 +16,55 @@ MAX_MODEL_LEN = 1024 MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", "meta-llama/Meta-Llama-3-8B-Instruct", ] EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred to a', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', - ], - "meta-llama/Meta-Llama-3-8B-Instruct": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { + "auto": [ + 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', + 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', + 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' + ], + "fp8": [ + 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system made up of several basic components that work together to enable it to', + 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' + ] + }, + "meta-llama/Meta-Llama-3-8B-Instruct": { + "auto": [ + 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', + 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' + ], + "fp8": [ + 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', + 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' + ] + }, } capability = torch.cuda.get_device_capability() @@ -52,14 +76,14 @@ @pytest.mark.skipif(fp8_not_supported, reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) -def test_models( - example_prompts, - model_name, -) -> None: +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +def test_models(example_prompts, model_name, kv_cache_dtype) -> None: model = LLM(model=model_name, max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, enforce_eager=True, - quantization="fp8") + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) tokenizer = AutoTokenizer.from_pretrained(model_name) formatted_prompts = [ @@ -81,8 +105,8 @@ def test_models( generations.append(outputs[0].outputs[0].text) del model - print(generations) - expected_strs = EXPECTED_STRS_MAP[model_name] + print(model_name, kv_cache_dtype, generations) + expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] for i in range(len(example_prompts)): generated_str = generations[i] expected_str = expected_strs[i] diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 4299726bdca4b..dc7b3940bc9b7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -7,6 +7,8 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class Attention(nn.Module): @@ -30,6 +32,7 @@ def __init__( alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() if cache_config is not None: @@ -40,6 +43,27 @@ def __init__( block_size = 16 if num_kv_heads is None: num_kv_heads = num_heads + + # The default kv_scale is set to 1.0. This is ignored + # when kv-cache is not fp8, and should be used with + # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we + # expect the pre-quantized kv_scale to be loaded along + # with the model weights. + self.kv_cache_dtype = kv_cache_dtype + self._kv_scale = 1.0 + quant_method = quant_config.get_quant_method( + self) if quant_config else None + if quant_method is not None: + if self.kv_cache_dtype == "fp8_e5m2": + raise ValueError("fp8_e5m2 kv-cache is not supported with " + "fp8 checkpoints.") + # When FP8 quantization is enabled, we make a parameter + # "kv_scale" so that it can be loaded from FP8 checkpoint. 
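As an aside to the kv-cache plumbing being added in this attention layer, the following is a minimal, hypothetical usage sketch in the spirit of the updated `tests/models/test_fp8.py` above. It is not part of the patch; the model name, prompt, and sampling settings are placeholders chosen for illustration.

```python
# Hypothetical sketch of enabling the new fp8 KV cache options end to end.
# Mirrors the parametrized test above; model name and prompt are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder checkpoint
    kv_cache_dtype="fp8",   # "auto", "fp8" (=fp8_e4m3), or "fp8_e5m2"
    quantization="fp8",     # optional: fp8 weights in addition to the fp8 KV cache
    enforce_eager=True,
    max_model_len=1024,
)

# Each Attention layer starts from kv_scale = 1.0; if the checkpoint carries a
# per-tensor ".kv_scale", it is loaded and folded into the layer after weights load.
params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["What is the capital of France?"], params)
print(outputs[0].outputs[0].text)
```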
+ # The kv_scale will then be converted back + # to self._kv_scale in a native float32 value after weight loading. + self.quant_method = quant_method + self.quant_method.create_weights(self) + # During model initialization, the default dtype is set as the model # weight and activation dtype. dtype = torch.get_default_dtype() @@ -57,10 +81,9 @@ def forward( value: torch.Tensor, kv_cache: Optional[torch.Tensor], attn_metadata: AttentionMetadata, - kv_scale: float = 1.0, ) -> torch.Tensor: return self.impl.forward(query, key, value, kv_cache, attn_metadata, - kv_scale) + self._kv_scale) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore diff --git a/vllm/config.py b/vllm/config.py index 773655aa6c793..33b49a0fb2284 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -387,14 +387,12 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype == "fp8": + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " - "But it may cause slight accuracy drop without scaling " - "factors. FP8_E5M2 (without scaling) is only supported on " - "cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 " - "is instead supported for common inference criteria.") + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 803e1836e654e..b94f2619ba767 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -195,12 +195,11 @@ def add_cli_args( parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' - 'data type. FP8_E5M2 (without scaling) is only supported on cuda ' - 'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' - 'supported for common inference criteria.') + 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') parser.add_argument( '--quantization-param-path', type=nullable_str, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ff996741c1d00..b084b9cee4983 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -8,8 +8,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -58,9 +59,13 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": activation_scheme=activation_scheme) def get_quant_method( - self, layer: torch.nn.Module) -> Optional["Fp8LinearMethod"]: + self, layer: torch.nn.Module) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + if isinstance(layer, LinearBase): return Fp8LinearMethod(self) + if isinstance(layer, Attention): + return Fp8KVCacheMethod(self) return None def get_scaled_act_names(self) -> List[str]: @@ -251,6 +256,44 @@ def apply(self, return torch.narrow(output, 0, 0, x.shape[0]) +class Fp8KVCacheMethod(QuantizeMethodBase): + """Supports loading kv-cache scaling factors from FP8 checkpoints. + """ + + def __init__(self, quant_config: Fp8Config): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module): + """Create "weight" (aka kv_scale) for an attention layer. + + Args: + layer: The layer that is using the QuantizeMethodBase factory. + """ + # Initialize the KV cache scale to 1.0 as the default value. + # If the kv_scale appears in the checkpoint, it will be + # overwritten when loading weights. + layer.kv_scale = Parameter(torch.tensor(1.0), requires_grad=False) + + def apply(self, layer: torch.nn.Module) -> torch.Tensor: + raise RuntimeError("Fp8KVCacheMethod.apply should not be called.") + + def process_weights_after_loading(self, layer: Module) -> None: + # If the kv-cache dtype is auto, we enforce the kv-scale to be 1.0 + # regardless whether the kv-scale is available in the checkpoint. + if layer.kv_cache_dtype != "auto": + kv_scale = layer.kv_scale.to("cpu").tolist() + if not isinstance(kv_scale, float): + raise ValueError("Only support per-tensor scaling factor " + "for fp8 KV cache") + layer._kv_scale = kv_scale + if layer._kv_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype: + print_warning_once( + "Using KV cache scaling factor 1.0 for fp8_e4m3. This may " + "cause accuracy issues. 
Please make sure kv-cache scaling " + "factor is available in the fp8 checkpoint.") + del layer.kv_scale + + def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index cb99939cbb17a..313762b1353d1 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -268,7 +268,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 58b3405d319d1..babb92e7cdcef 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -154,7 +154,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, scaling, - alibi_slopes=alibi_slopes) + alibi_slopes=alibi_slopes, + quant_config=quant_config) else: self.rotary_emb = get_rope( self.head_dim, @@ -166,7 +167,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, self.scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index fe2de87b20dc9..a29aee4cffb7d 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -111,7 +111,8 @@ def __init__( self.head_dim, scaling, alibi_slopes=alibi_slopes, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ed65d76f7b5b9..e3a5e43e23e1c 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -86,13 +86,12 @@ def __init__( base=10000 * rope_ratio, is_neox_style=False, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 7354d11f98b15..84786921ce1b4 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -177,13 +177,12 @@ def __init__( rope_scaling=self.rope_scaling, is_neox_style=False, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) if self.use_qk_norm: self.q_norm = LayerNorm(param_shape=(self.num_heads, self.head_dim), diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 083ddf0159f71..8ff19a2015e0f 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -218,13 +218,12 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - 
num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 62e04f9649915..8fbda2638aaa3 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -232,7 +232,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index ab9e1994be426..ba707adb03dfe 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -153,7 +153,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + num_kv_heads=self.num_kv_heads, + quant_config=quant_config) elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -165,13 +166,15 @@ def __init__( self.head_dim, self.inv_norm_factor, num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) + alibi_slopes=alibi_slopes, + quant_config=quant_config) else: self.attn = Attention(self.num_heads, self.head_dim, scale=self.inv_norm_factor, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index d1502b718a773..27dda00b66af4 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -157,7 +157,8 @@ def __init__(self, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 0deaa58ed9eb5..cc83f6eb6d94d 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -75,7 +75,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c20fb3230c394..f488ef40039c0 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -88,7 +88,8 @@ def __init__( self.head_dim, scale=self.scale, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5f4d8ec3d3a7a..47fd5788a4c35 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -88,7 +88,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_size, scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index dcb52ff666c95..eb0fcc8f26a58 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -89,7 +89,8 @@ def 
__init__( self.attn = Attention(self.num_heads, self.head_size, scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 65f7ddb8b082c..e75c567f589c8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -117,7 +117,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index df30fd1ba0a37..869b8fc91fd64 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -105,13 +105,12 @@ def __init__( head_end = (tp_rank + 1) * self.num_heads alibi_slopes = _get_alibi_slopes(total_num_heads) alibi_slopes = alibi_slopes[head_start:head_end] - self.attn = Attention( - self.num_heads, - self.head_dim, - scale=self.scale, - alibi_slopes=alibi_slopes, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scale, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f2996c240aaf4..23141124e69e1 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,7 @@ default_weight_loader, kv_cache_scales_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput -from vllm.utils import is_hip +from vllm.utils import is_hip, print_warning_once class LlamaMLP(nn.Module): @@ -119,15 +119,6 @@ def __init__( self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - # This will be overwritten by model initialization if we are using it. - # N.B. currently we only support per tensor scalar scaling factors - # & only applicable to ROCm (AMD GPU). - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - self.kv_scale = 1.0 - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -155,7 +146,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, sliding_window=sliding_window, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, @@ -167,8 +159,7 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata, - self.kv_scale) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) return output @@ -421,6 +412,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + f"Found kv scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_kv_scale_name}). 
kv-scale is " + "not loaded.") + continue + else: + name = remapped_kv_scale_name param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -445,7 +449,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.kv_scale = scaling_factor + layer_self_attn.attn._kv_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 0b85cf1c94795..59fbf8e1b35f2 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -236,7 +236,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e3ac33e0452fe..ea95cf7380d54 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -308,14 +308,13 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + cache_config=cache_config, + quant_config=quant_config) def forward( self, @@ -581,6 +580,20 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + "Found kv scale in the checkpoint " + f"(e.g. {name}), but not found the expected " + f"name in the model " + f"(e.g. {remapped_kv_scale_name}). 
" + "kv-scale is not loaded.") + continue + else: + name = remapped_kv_scale_name param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index ee2626b1c1aa2..9b99ff729aadd 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -213,14 +213,13 @@ def __init__( base=int(self.rope_theta), is_neox_style=True, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 716ac51cde94d..5f9e4d86f3cd8 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -110,7 +110,8 @@ def __init__( scaling, alibi_slopes=alibi_slopes, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 69f23bbfb5d0a..39270f71ec46f 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -96,7 +96,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, scale=self.scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) # Attention output projection. self.o_proj = RowParallelLinear( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d241756e50f4a..4bf59105dbabb 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -91,7 +91,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, scale=self.scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 59cd42e31b374..133a10e6bb3e8 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -121,7 +121,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 193a29d20c894..c8e61735a9bb6 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -110,7 +110,8 @@ def __init__(self, self.attn = Attention(self.num_heads, self.head_size, scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d158846a3a1f5..d22ea6b79de0f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -106,7 +106,8 @@ def __init__( self.attn = Attention(self.num_heads, self.head_dim, self.scaling, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 
97ab6168c3230..ec203c3b9001a 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -141,7 +141,8 @@ def __init__(self, self.scaling, num_kv_heads=self.num_kv_heads, sliding_window=self.sliding_window, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index a0d3b0406ef4a..564536f2dd248 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -241,7 +241,8 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 8b4a5507feade..a6ed3800bed0f 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -127,7 +127,8 @@ def __init__(self, self.head_dim, self.scaling, num_kv_heads=self.num_key_value_heads, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 3c19d63276a77..91ffd0861c39d 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -97,14 +97,13 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, - cache_config=cache_config, - ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 6ef230a8ebbca..dda13d83f89a3 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -135,7 +135,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, sliding_window=sliding_window, - cache_config=cache_config) + cache_config=cache_config, + quant_config=quant_config) def forward( self, diff --git a/vllm/utils.py b/vllm/utils.py index bd47ab055b7b5..f4f027ce70e37 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -31,6 +31,8 @@ "bfloat16": torch.bfloat16, "float": torch.float, "fp8": torch.uint8, + "fp8_e4m3": torch.uint8, + "fp8_e5m2": torch.uint8, } diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e264fede0ee64..9720363ac300e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,4 +1,5 @@ import time +import warnings from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np @@ -168,11 +169,21 @@ def load_model(self) -> None: self.model = self.lora_manager.create_lora_manager(self.model) if self.kv_cache_dtype == "fp8" and is_hip(): - # Currently scaled KV cache is only enabled on ROCm + # Currently only ROCm accepts kv-cache scaling factors + # via quantization_param_path and this will be deprecated + # in the future. if self.model_config.quantization_param_path is not None: if callable(getattr(self.model, "load_kv_cache_scales", None)): + warnings.warn( + "Loading kv cache scaling factor from JSON is " + "deprecated and will be removed. 
Please include " + "kv cache scaling factors in the model checkpoint.", + FutureWarning, + stacklevel=2) self.model.load_kv_cache_scales( self.model_config.quantization_param_path) + logger.info("Loaded KV cache scaling factors from %s", + self.model_config.quantization_param_path) else: raise RuntimeError( "Using FP8 KV cache and scaling factors provided but " @@ -183,10 +194,6 @@ def load_model(self) -> None: "Using FP8 KV cache but no scaling factors " "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") - elif self.model_config.quantization_param_path is not None: - logger.warning("KV cache scaling factors provided, " - "but the KV cache data type is not FP8. " - "KV cache scaling factors will not be used.") def save_sharded_state( self, From 3db99a67aabe12ecda0a6b33494e373df03b87cc Mon Sep 17 00:00:00 2001 From: raywanb <112235519+raywanb@users.noreply.github.com> Date: Thu, 23 May 2024 04:58:59 +0800 Subject: [PATCH 022/154] [Model] LoRA gptbigcode implementation (#3949) --- csrc/punica/bgmv/bgmv_config.h | 4 +++ tests/lora/test_punica.py | 2 ++ vllm/lora/models.py | 2 ++ vllm/model_executor/models/gpt_bigcode.py | 31 +++++++++++++++++++---- 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 98ac8de779e13..4b376261d30d2 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ f(in_T, out_T, W_T, narrow, 3072) \ + f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ f(in_T, out_T, W_T, narrow, 4096) \ @@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ f(in_T, out_T, W_T, narrow, 6144) \ + f(in_T, out_T, W_T, narrow, 6400) \ f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ @@ -97,6 +99,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ f(in_T, out_T, W_T, 3072, narrow) \ + f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ @@ -105,6 +108,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ f(in_T, out_T, W_T, 6144, narrow) \ + f(in_T, out_T, W_T, 6400, narrow) \ f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 193e3906997c4..f021c003b1322 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -58,6 +58,7 @@ def _lora_ref_impl( 2560, 2752, 3072, + 3328, 3456, 3584, 4096, @@ -66,6 +67,7 @@ def _lora_ref_impl( 5504, 5632, 6144, + 6400, 6848, 6912, 7168, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index a2092d31ea9aa..3e82856866d85 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -310,7 +310,9 @@ def from_local_checkpoint( if part_name not in expected_lora_modules: unexpected_modules.append(module) # loaded lora's target modules must be a subset of expected_lora_modules + if unexpected_modules: + print(unexpected_modules, 
"modules") raise ValueError( f"While loading {lora_dir}, expected" f" target modules in {expected_lora_modules}" diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index f488ef40039c0..69b75763e9a3d 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -25,7 +25,7 @@ from transformers import GPTBigCodeConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -191,14 +191,19 @@ def __init__( config: GPTBigCodeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__() self.config = config assert not config.add_cross_attention self.embed_dim = config.hidden_size - - self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.wte = VocabParallelEmbedding(self.vocab_size, + self.embed_dim, + org_num_embeddings=config.vocab_size) self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.h = nn.ModuleList([ GPTBigCodeBlock(config, cache_config, quant_config) @@ -226,19 +231,35 @@ def forward( class GPTBigCodeForCausalLM(nn.Module): + packed_modules_mapping = {"c_attn": ["c_attn"]} + + supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"] + + embedding_modules = { + "wte": "input_embeddings", + "lm_head": "output_embeddings", + } + + embedding_padding_modules = [] def __init__( self, config: GPTBigCodeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPTBigCodeModel(config, cache_config, quant_config) + self.transformer = GPTBigCodeModel(config, cache_config, quant_config, + lora_config) self.lm_head_weight = self.transformer.wte.weight - self.logits_processor = LogitsProcessor(config.vocab_size) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) self.sampler = Sampler() def forward( From 39a0a40f4e85454d00c2579a49c345cad3f41be8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 22 May 2024 14:17:27 -0700 Subject: [PATCH 023/154] [Core] Eliminate parallel worker per-step task scheduling overhead (#4894) --- vllm/engine/async_llm_engine.py | 10 +- vllm/engine/llm_engine.py | 8 ++ vllm/executor/distributed_gpu_executor.py | 123 ++++++++++++++++----- vllm/executor/executor_base.py | 8 ++ vllm/executor/multiproc_gpu_executor.py | 73 ++++++++----- vllm/executor/ray_gpu_executor.py | 86 +++++++-------- vllm/spec_decode/ngram_worker.py | 4 +- vllm/spec_decode/spec_decode_worker.py | 125 +++++++++++----------- vllm/worker/embedding_model_runner.py | 5 +- vllm/worker/model_runner.py | 5 +- vllm/worker/worker.py | 103 +++++++++++------- vllm/worker/worker_base.py | 7 +- 12 files changed, 348 insertions(+), 209 deletions(-) diff --git 
a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 8a37bac02823a..5a15ed67e3327 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -234,6 +234,14 @@ async def step_async( # Log stats. self.do_log_stats(scheduler_outputs, output) + if not request_outputs: + # Stop the execute model loop in parallel workers until there are + # more requests to process. This avoids waiting indefinitely in + # torch.distributed ops which may otherwise timeout, and unblocks + # the RPC thread in the workers so that they can process any other + # queued control plane messages, such as add/remove lora adapters. + await self.model_executor.stop_remote_worker_execution_loop_async() + return request_outputs async def encode_request_async( @@ -687,7 +695,7 @@ async def encode( multi_modal_data: Multi modal data per request. Yields: - The output `EmbeddingRequestOutput` objects from the LLMEngine + The output `EmbeddingRequestOutput` objects from the LLMEngine for the request. Details: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index db26779c86b51..db3141b277b5b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -694,6 +694,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # Log stats. self.do_log_stats(scheduler_outputs, output) + if not request_outputs: + # Stop the execute model loop in parallel workers until there are + # more requests to process. This avoids waiting indefinitely in + # torch.distributed ops which may otherwise timeout, and unblocks + # the RPC thread in the workers so that they can process any other + # queued control plane messages, such as add/remove lora adapters. + self.model_executor.stop_remote_worker_execution_loop() + return request_outputs def do_log_stats( diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index c5b1e61112afb..f7c608af1ad39 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -1,11 +1,12 @@ +import asyncio from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput logger = init_logger(__name__) @@ -13,6 +14,16 @@ class DistributedGPUExecutor(GPUExecutor): """Abstract superclass of multi-GPU executor implementations.""" + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks. 
@@ -52,13 +63,28 @@ def initialize_cache(self, num_gpu_blocks: int, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - def execute_model(self, *args, **kwargs) -> List[SamplerOutput]: - all_outputs = self._run_workers("execute_model", - driver_args=args, - driver_kwargs=kwargs) + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) # Only the driver worker returns the sampling results. - return all_outputs[0] + return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) def add_lora(self, lora_request: LoRARequest) -> bool: assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." @@ -88,39 +114,84 @@ def save_sharded_state( pattern=pattern, max_size=max_size) + @abstractmethod + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + raise NotImplementedError + @abstractmethod def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, ) -> Any: - """Runs the given method on all workers.""" + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ + raise NotImplementedError + + @abstractmethod + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" raise NotImplementedError class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase): + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. 
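To make the async executor flow around this hunk concrete, here is a minimal, framework-free asyncio sketch. It is only an illustration of the pattern (start the worker loops once, keep answering requests from the driver, stop by sending an empty payload and awaiting the gathered tasks); an `asyncio.Queue` stands in for vLLM's broadcast channel and nothing here is the project's real API.

```python
# Toy sketch of the async executor pattern: worker loops run in the background,
# the driver keeps serving steps, and "stop" means an empty payload followed by
# awaiting the gathered worker tasks so any worker-side error surfaces.
import asyncio

async def worker_loop(rank: int, chan: asyncio.Queue) -> None:
    while True:
        payload = await chan.get()       # stand-in for the driver's broadcast
        if not payload:                  # empty payload releases the worker
            return
        await asyncio.sleep(0)           # pretend to execute one model step

async def main() -> None:
    num_workers = 2
    chan: asyncio.Queue = asyncio.Queue()
    # Like _start_worker_execution_loop: all worker loops become one awaitable.
    parallel_worker_tasks = asyncio.gather(
        *(worker_loop(rank, chan) for rank in range(num_workers)))

    for step in range(3):                # driver-side execute_model_async steps
        for _ in range(num_workers):
            await chan.put({"step": step})

    for _ in range(num_workers):         # stop_remote_worker_execution_loop_async
        await chan.put({})
    await parallel_worker_tasks          # ensure workers exited the loop cleanly
    print("all workers released")

asyncio.run(main())
```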
+ return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + @abstractmethod - async def _run_workers_async( + async def _driver_execute_model_async( self, - method: str, - *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - raise NotImplementedError + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Execute the model asynchronously in the driver worker. - async def execute_model_async(self, *args, - **kwargs) -> List[SamplerOutput]: - all_outputs = await self._run_workers_async("execute_model", - driver_args=args, - driver_kwargs=kwargs) + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + raise NotImplementedError - # Only the driver worker returns the sampling results. - return all_outputs[0] + @abstractmethod + async def _start_worker_execution_loop(self): + """Run execution loop on all workers. It guarantees all workers run + the loop or None of them is running the loop. Loop can be stopped by + `stop_remote_worker_execution_loop`. + The API is idempotent (guarantee only 1 loop run at any moment).""" + raise NotImplementedError diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 08aa58999b1ec..4d01939c2e38b 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -74,6 +74,10 @@ def execute_model( """Executes at least one model step on the given sequences.""" raise NotImplementedError + def stop_remote_worker_execution_loop(self) -> None: + """Releases parallel workers from model loop.""" + return + @abstractmethod def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError @@ -109,6 +113,10 @@ async def execute_model_async( """Executes one model step on the given sequences.""" raise NotImplementedError + async def stop_remote_worker_execution_loop_async(self) -> None: + """Releases parallel workers from model loop.""" + return + async def check_health_async(self) -> None: """Checks if the executor is healthy. 
If not, it should raise an exception.""" diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 2a7b99c9dcbe1..8fa54454907b5 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,13 +1,14 @@ import asyncio import os from functools import partial -from typing import Any, Dict, Optional, Tuple +from typing import Any, List, Optional from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) @@ -71,16 +72,34 @@ def shutdown(self): None)) is not None: worker_monitor.close() + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_model( + execute_model_req=execute_model_req) + def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, ) -> Any: - """Runs the given method on all workers.""" + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ if max_concurrent_workers: raise NotImplementedError( @@ -92,15 +111,12 @@ def _run_workers( for worker in self.workers ] - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs + if async_run_remote_workers_only: + # Just return futures + return worker_outputs - # Start the driver worker after all the ray workers. driver_worker_method = getattr(self.driver_worker, method) - driver_worker_output = driver_worker_method(*driver_args, - **driver_kwargs) + driver_worker_output = driver_worker_method(*args, **kwargs) # Get the results of the workers. 
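The `async_run_remote_workers_only=True` path in `_run_workers` below returns futures immediately instead of blocking, and `_wait_for_tasks_completion` later joins them with `.get()`. The following toy, which uses `concurrent.futures.ThreadPoolExecutor` as a stand-in for vLLM's process-based workers, only illustrates that "return futures now, join them later" shape; `.result()` plays the role of `.get()` and surfaces any exception raised inside a worker loop.

```python
# Toy version of the "return futures, join them later" worker-dispatch pattern.
from concurrent.futures import Future, ThreadPoolExecutor
from typing import List

def start_worker_execution_loop(rank: int) -> str:
    # A real worker would loop on broadcasts here; we just return immediately.
    return f"worker {rank} exited cleanly"

pool = ThreadPoolExecutor(max_workers=4)

def run_workers(method, async_run_remote_workers_only: bool = False):
    futures: List[Future] = [pool.submit(method, rank) for rank in range(1, 5)]
    if async_run_remote_workers_only:
        return futures                    # do not block; the driver keeps going
    return [f.result() for f in futures]

parallel_worker_tasks = run_workers(start_worker_execution_loop,
                                    async_run_remote_workers_only=True)
# ... the driver would execute model steps here ...
for task in parallel_worker_tasks:        # analogue of _wait_for_tasks_completion
    print(task.result())
pool.shutdown()
```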
return [driver_worker_output @@ -111,30 +127,29 @@ def check_health(self) -> None: if not self.worker_monitor.is_alive(): raise RuntimeError("Worker processes are not running") + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() + class MultiprocessingGPUExecutorAsync(MultiprocessingGPUExecutor, DistributedGPUExecutorAsync): - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_model = make_async(self.driver_worker.execute_model) - driver_executor = make_async(getattr(self.driver_worker, method)) + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_model(execute_model_req) - # Run all the workers asynchronously. - coros = [driver_executor(*driver_args, **driver_kwargs)] + [ - worker.execute_method_async(method, *args, **kwargs) + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method_async("start_worker_execution_loop") for worker in self.workers ] - return await asyncio.gather(*coros) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index dd3ee60682d30..bed356d1b6e58 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -42,6 +42,8 @@ def _init_executor(self) -> None: self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() + self.extra_execute_model_run_workers_kwargs[ + "use_ray_compiled_dag"] = True def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: @@ -171,23 +173,23 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={"execute_model_req": execute_model_req}, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. - # Only the driver worker returns the sampling results. - return all_outputs[0] + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -198,9 +200,11 @@ def _run_workers( """Runs the given method on all workers. Can be used in the following ways: + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. 
It will also be + run asynchronously and return a list of futures rather than blocking + on the results. - args/kwargs: All workers share the same args/kwargs - - args/kwargs and driver_args/driver_kwargs: Driver worker has - different args - all_args/all_kwargs: args/kwargs for each worker are specified individually """ @@ -209,11 +213,6 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - if driver_args is None: - driver_args = args if all_args is None else all_args[0] - if driver_kwargs is None: - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - count = len(self.workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) @@ -225,6 +224,7 @@ def _run_workers( # input. TODO(sang): Fix it. assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) + ray_worker_outputs = [] else: # Start the ray workers first. ray_worker_outputs = [ @@ -234,6 +234,13 @@ def _run_workers( ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + # Start the driver worker after all the ray workers. if not use_dummy_driver: driver_worker_output = self.driver_worker.execute_method( @@ -260,6 +267,11 @@ def _run_workers( return [driver_worker_output] + ray_worker_outputs + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + def _compiled_ray_dag(self): import pkg_resources required_version = "2.9" @@ -303,30 +315,18 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_executor = make_async(self.driver_worker.execute_method) + self.driver_exec_method = make_async(self.driver_worker.execute_method) - async def _run_workers_async( + async def _driver_execute_model_async( self, - method: str, - *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - coros.append( - self.driver_executor(method, *driver_args, **driver_kwargs)) - - # Run the ray workers asynchronously. 
- for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 9628f7af5315a..c2b22f2acd7b4 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -47,7 +47,9 @@ def set_include_gpu_probs_tensor(self): # NGram don't need gpu sampler pass - def execute_model(self, execute_model_req: ExecuteModelRequest) -> None: + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None) -> None: """NGram doesn't depend on model execution, just pass this function""" pass diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index ef17b8c1e2cc0..3462a876c3e90 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -231,35 +231,6 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - def _broadcast_control_flow_decision( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - disable_all_speculation: bool = False) -> Tuple[int, bool]: - """Broadcast how many lookahead slots are scheduled for this step, and - whether all speculation is disabled, to all non-driver workers. - - This is required as if the number of draft model runs changes - dynamically, the non-driver workers won't know unless we perform a - communication to inform then. - - Returns the broadcasted num_lookahead_slots and disable_all_speculation. - """ - - if self.rank == self._driver_rank: - assert execute_model_req is not None - - broadcast_dict = dict( - num_lookahead_slots=execute_model_req.num_lookahead_slots, - disable_all_speculation=disable_all_speculation, - ) - broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) - else: - assert execute_model_req is None - broadcast_dict = broadcast_tensor_dict(src=self._driver_rank) - - return (broadcast_dict["num_lookahead_slots"], - broadcast_dict["disable_all_speculation"]) - @torch.inference_mode() def execute_model( self, @@ -267,39 +238,58 @@ def execute_model( ) -> List[SamplerOutput]: """Perform speculative decoding on the input batch. """ + if self.rank != self._driver_rank: + self._run_non_driver_rank() + return [] - disable_all_speculation = False - if self.rank == self._driver_rank: - disable_all_speculation = self._should_disable_all_speculation( - execute_model_req) - - (num_lookahead_slots, - disable_all_speculation) = self._broadcast_control_flow_decision( - execute_model_req, disable_all_speculation) - - if self.rank == self._driver_rank: - assert execute_model_req is not None - assert execute_model_req.seq_group_metadata_list is not None, ( - "speculative decoding requires non-None seq_group_metadata_list" - ) - - self._maybe_disable_speculative_tokens( - disable_all_speculation, - execute_model_req.seq_group_metadata_list) - - # If no spec tokens, call the proposer and scorer workers normally. - # Used for prefill. 
- if num_lookahead_slots == 0 or len( - execute_model_req.seq_group_metadata_list) == 0: - return self._run_no_spec(execute_model_req, - skip_proposer=disable_all_speculation) - - return self._run_speculative_decoding_step(execute_model_req, - num_lookahead_slots) - else: - self._run_non_driver_rank(num_lookahead_slots) + if execute_model_req is None: + # This signals that there's no more requests to process for now. + # All workers are running infinite loop with broadcast_tensor_dict, + # and it stops the loop when the driver broadcasts an empty input. + # Send an empty input to notify all other workers to stop their + # execution loop. + broadcast_tensor_dict({}, src=0) return [] + disable_all_speculation = self._should_disable_all_speculation( + execute_model_req) + num_lookahead_slots = execute_model_req.num_lookahead_slots + + # Broadcast how many lookahead slots are scheduled for this step, and + # whether all speculation is disabled, to all non-driver workers. + + # This is required as if the number of draft model runs changes + # dynamically, the non-driver workers won't know unless we perform a + # communication to inform then. + broadcast_dict = dict( + num_lookahead_slots=num_lookahead_slots, + disable_all_speculation=disable_all_speculation, + ) + broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) + + assert execute_model_req.seq_group_metadata_list is not None, ( + "speculative decoding requires non-None seq_group_metadata_list") + + self._maybe_disable_speculative_tokens( + disable_all_speculation, execute_model_req.seq_group_metadata_list) + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. + if num_lookahead_slots == 0 or len( + execute_model_req.seq_group_metadata_list) == 0: + return self._run_no_spec(execute_model_req, + skip_proposer=disable_all_speculation) + + return self._run_speculative_decoding_step(execute_model_req, + num_lookahead_slots) + + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop to perform speculative decoding + in parallel worker.""" + while self._run_non_driver_rank(): + pass + def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding @@ -346,13 +336,19 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, sampler_output.logprobs = None return [sampler_output] - def _run_non_driver_rank(self, num_lookahead_slots: int) -> None: + def _run_non_driver_rank(self) -> bool: """Run proposer and verifier model in non-driver workers. This is used for both speculation cases (num_lookahead_slots>0) and non-speculation cases (e.g. prefill). + + Returns True iff there are remaining sequences to process. """ - # In non-driver workers the input is None - execute_model_req = None + assert self.rank != self._driver_rank + + data = broadcast_tensor_dict(src=self._driver_rank) + if not data: + return False + num_lookahead_slots = data["num_lookahead_slots"] # Even if num_lookahead_slots is zero, we want to run the proposer model # as it may have KV. @@ -360,9 +356,10 @@ def _run_non_driver_rank(self, num_lookahead_slots: int) -> None: # We run the proposer once per lookahead slot. In the future we should # delegate how many times it runs to the proposer. 
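Before the proposer/scorer loop resumes just below, here is a small pure-Python simulation of the non-driver control flow this hunk establishes: the driver's broadcast payload tells each follower how many proposer runs to perform, and an empty payload ends the loop. Plain dicts stand in for `broadcast_tensor_dict`; the payload values are made up for the example.

```python
# Simulation of _run_non_driver_rank / start_worker_execution_loop control flow.
from typing import Iterable, Optional

def run_non_driver_rank(data: Optional[dict], proposer_calls: list,
                        scorer_calls: list) -> bool:
    if not data:
        return False                      # empty broadcast: leave the loop
    num_lookahead_slots = data["num_lookahead_slots"]
    # Even with zero lookahead slots the proposer runs once (it may own KV cache).
    for _ in range(max(num_lookahead_slots, 1)):
        proposer_calls.append("proposer")
    scorer_calls.append("scorer")
    return True

def start_worker_execution_loop(broadcasts: Iterable[Optional[dict]]) -> None:
    proposer_calls, scorer_calls = [], []
    stream = iter(broadcasts)
    while run_non_driver_rank(next(stream), proposer_calls, scorer_calls):
        pass
    print(len(proposer_calls), "proposer runs,", len(scorer_calls), "scorer runs")

start_worker_execution_loop([
    {"num_lookahead_slots": 0, "disable_all_speculation": True},   # prefill-style
    {"num_lookahead_slots": 4, "disable_all_speculation": False},  # speculative step
    {},                                                            # stop sentinel
])
```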
for _ in range(max(num_lookahead_slots, 1)): - self.proposer_worker.execute_model(execute_model_req) + self.proposer_worker.execute_model() - self.scorer_worker.execute_model(execute_model_req) + self.scorer_worker.execute_model() + return True @nvtx_range("spec_decode_worker._run_speculative_decoding_step") def _run_speculative_decoding_step( diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 91f30978ead87..ef02de95fc54e 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -47,7 +47,7 @@ def __init__( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: (input_tokens, input_positions, attn_metadata, pooling_metadata, @@ -84,10 +84,11 @@ def execute_model( def prepare_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: + assert seq_group_metadata_list is not None # Prepare input tensors. ( input_tokens, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9720363ac300e..87d5f5c1b9d67 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -609,10 +609,11 @@ def _prepare_model_input( def prepare_input_tensors( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: + assert seq_group_metadata_list is not None # Prepare input tensors. ( input_tokens, @@ -676,7 +677,7 @@ def prepare_input_tensors( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 97b3873b2a9f6..10411a2bf7a10 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -226,48 +226,42 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[Union[SamplerOutput, PoolerOutput]]: + if not self.is_driver_worker: + self._execute_model_non_driver() + return [] if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list + # This signals that there's no more requests to process for now. + # All workers are running infinite loop with broadcast_tensor_dict, + # and it stops the loop when the driver broadcasts an empty input. + # Send an empty input to notify all other workers to stop their + # execution loop. + broadcast_tensor_dict({}, src=0) + return [] - blocks_to_swap_in: torch.Tensor - blocks_to_swap_out: torch.Tensor - blocks_to_copy: torch.Tensor - if self.is_driver_worker: - assert seq_group_metadata_list is not None - assert execute_model_req is not None - num_seq_groups = len(seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. 
- blocks_to_swap_in = torch.tensor( - execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor( - execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + num_seq_groups = len(seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", dtype=torch.int64).view(-1, 2) - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - blocks_to_copy = data["blocks_to_copy"] + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + data: Dict[str, Any] = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) @@ -282,6 +276,39 @@ def execute_model( # to conform to interface. return [output] + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. + + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. + """ + while self._execute_model_non_driver(): + pass + + def _execute_model_non_driver(self) -> bool: + """Execute model in parallel worker. + + Returns True iff there are remaining sequences to process. + """ + assert not self.is_driver_worker + data = broadcast_tensor_dict(src=0) + if not data: + return False + + num_seq_groups = data.get("num_seq_groups", 0) + blocks_to_swap_in = data.get("blocks_to_swap_in") + blocks_to_swap_out = data.get("blocks_to_swap_out") + blocks_to_copy = data.get("blocks_to_copy") + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. 
+ if num_seq_groups == 0: + return False + + self.model_runner.execute_model(None, self.gpu_cache) + return True + def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1f04f821eb0f0..dbac1b5ba339b 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,7 +1,7 @@ import importlib import os from abc import ABC, abstractmethod -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -48,8 +48,9 @@ def initialize_cache(self, num_gpu_blocks: int, @abstractmethod def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: """Executes at least one model step on the given sequences, unless no sequences are provided.""" raise NotImplementedError From 847ca88984cb6ec4960b70aaf0fe7a4799a65979 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 22 May 2024 15:26:56 -0700 Subject: [PATCH 024/154] [Minor] Fix small typo in llama.py: QKVParallelLinear -> QuantizationConfig (#4991) --- vllm/model_executor/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 23141124e69e1..f43a40a0bfd34 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,7 +57,7 @@ def __init__( hidden_size: int, intermediate_size: int, hidden_act: str, - quant_config: Optional[QKVParallelLinear] = None, + quant_config: Optional[QuantizationConfig] = None, bias: bool = False, ) -> None: super().__init__() From c60384c2cef161d5dbfc81fe3e48927455907b52 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 22 May 2024 15:55:56 -0700 Subject: [PATCH 025/154] [Misc] Take user preference in attention selector (#4960) --- tests/kernels/test_attention_selector.py | 84 +++++++++++++ vllm/attention/backends/flashinfer.py | 1 + vllm/attention/selector.py | 145 +++++++++++++---------- 3 files changed, 169 insertions(+), 61 deletions(-) create mode 100644 tests/kernels/test_attention_selector.py diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py new file mode 100644 index 0000000000000..f439afa9b7d2b --- /dev/null +++ b/tests/kernels/test_attention_selector.py @@ -0,0 +1,84 @@ +import os +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.selector import which_attn_to_use + + +@pytest.mark.parametrize( + "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) +@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +def test_env(name: str, device: str): + """Test that the attention selector can be set via environment variable. + Note that we do not test FlashAttn because it is the default backend. 
+ """ + name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None) + os.environ["VLLM_ATTENTION_BACKEND"] = name + + if device == "cpu": + with patch("vllm.attention.selector.is_cpu", return_value=True): + backend = which_attn_to_use(8, 16, 8, None, torch.float16, + torch.float16, 16) + assert backend.name == "TORCH_SDPA" + elif device == "hip": + with patch("vllm.attention.selector.is_hip", return_value=True): + backend = which_attn_to_use(8, 16, 8, None, torch.float16, + torch.float16, 16) + assert backend.name == "ROCM_FLASH" + else: + backend = which_attn_to_use(8, 16, 8, None, torch.float16, + torch.float16, 16) + assert backend.name == name + + if name_backup is not None: + os.environ["VLLM_ATTENTION_BACKEND"] = name_backup + + +def test_flash_attn(): + """Test FlashAttn validation.""" + name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None) + os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN" + + # Unsupported CUDA arch + with patch("torch.cuda.get_device_capability", return_value=[7, 5]): + backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + assert backend.name != "FLASH_ATTN" + + # Unsupported data type + backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16) + assert backend.name != "FLASH_ATTN" + + # Unsupported kv cache data type + backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16) + assert backend.name != "FLASH_ATTN" + + # Unsupported block size + backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8) + assert backend.name != "FLASH_ATTN" + + # Unsupported sliding window + backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16) + assert backend.name != "FLASH_ATTN" + + # flash-attn is not installed + with patch.dict('sys.modules', {'vllm_flash_attn': None}): + backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + assert backend.name != "FLASH_ATTN" + + # Unsupported head size + backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) + assert backend.name != "FLASH_ATTN" + + if name_backup is not None: + os.environ["VLLM_ATTENTION_BACKEND"] = name_backup + + +def test_invalid_env(): + """Throw an exception if the backend name is invalid.""" + name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None) + os.environ["VLLM_ATTENTION_BACKEND"] = "INVALID" + with pytest.raises(ValueError): + which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + os.environ["VLLM_ATTENTION_BACKEND"] = name_backup diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7210fefbd8162..7b7959d257fac 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -218,6 +218,7 @@ def forward( ) if prefill_meta := attn_metadata.prefill_metadata: + # Prompt run. assert prefill_meta.block_tables is not None if kv_cache is None or prefill_meta.block_tables.numel() == 0: output = flash_attn_varlen_func( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 51c25a81b4130..f191461dcd3b7 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -30,24 +30,16 @@ def get_attn_backend( kv_cache_dtype: Optional[str], block_size: int, ) -> Type[AttentionBackend]: - backend = _which_attn_to_use(num_heads, head_size, num_kv_heads, - sliding_window, dtype, kv_cache_dtype, - block_size) + """Determine which attention backend to use and only import + the selected backend module. 
+ """ + backend = which_attn_to_use(num_heads, head_size, num_kv_heads, + sliding_window, dtype, kv_cache_dtype, + block_size) if backend == _Backend.FLASH_ATTN: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) - - # We check it here not in _which_attn_to_use because we cannot know - # the head size until we import FlashAttentionBackend. - supported_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size in supported_head_sizes: - logger.info("Using FlashAttention-2 backend.") - return FlashAttentionBackend - logger.info( - "Cannot use FlashAttention-2 backend for head size %d. " - "Using XFormers backend instead.", head_size) - backend = _Backend.XFORMERS - + return FlashAttentionBackend if backend == _Backend.XFORMERS: logger.info("Using XFormers backend.") from vllm.attention.backends.xformers import ( # noqa: F401 @@ -64,14 +56,15 @@ def get_attn_backend( return TorchSDPABackend elif backend == _Backend.FLASHINFER: logger.info("Using Flashinfer backend.") - logger.warning("Eager mode is enforced for the Flashinfer backend.") + logger.warning("Eager mode is required for the Flashinfer backend. " + "Please make sure --enforce-eager is set.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend else: raise ValueError("Invalid attention backend.") -def _which_attn_to_use( +def which_attn_to_use( num_heads: int, head_size: int, num_kv_heads: int, @@ -81,54 +74,84 @@ def _which_attn_to_use( block_size: int, ) -> _Backend: """Returns which flash attention backend to use.""" + + # Default case. + selected_backend = _Backend.FLASH_ATTN + + # Check the environment variable and override if specified + backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + backend_members = _Backend.__members__ + if backend_by_env_var not in backend_members: + raise ValueError( + f"Invalid attention backend '{backend_by_env_var}'. " + f"Available backends: {', '.join(backend_members)} " + "(case-sensitive).") + selected_backend = _Backend[backend_by_env_var] + if is_cpu(): + if selected_backend != _Backend.TORCH_SDPA: + logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA if is_hip(): # AMD GPUs. - if torch.cuda.get_device_capability()[0] != 9: - # not Instinct series GPUs. - logger.info("flash_atten is not supported on NAVI GPUs.") + selected_backend = (_Backend.ROCM_FLASH if selected_backend + == _Backend.FLASH_ATTN else selected_backend) + if selected_backend == _Backend.ROCM_FLASH: + if torch.cuda.get_device_capability()[0] != 9: + # not Instinct series GPUs. + logger.info("flash_attn is not supported on NAVI GPUs.") + else: + logger.info("%s is not supported in AMD GPUs.", selected_backend) return _Backend.ROCM_FLASH - # NVIDIA GPUs. - if torch.cuda.get_device_capability()[0] < 8: - # Volta and Turing NVIDIA GPUs. 
- logger.info("Cannot use FlashAttention-2 backend for Volta and Turing " - "GPUs.") - return _Backend.XFORMERS - - if dtype not in (torch.float16, torch.bfloat16): - logger.info("Cannot use FlashAttention-2 backend for dtype other than " - "torch.float16 or torch.bfloat16.") - return _Backend.XFORMERS - - if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"): - logger.info("Cannot use FlashAttention-2 backend for FP8 KV cache.") - return _Backend.XFORMERS - - if block_size % 16 != 0: - logger.info("Cannot use FlashAttention-2 backend for block size not " - "divisible by 16.") - return _Backend.XFORMERS - - if sliding_window is not None: - logger.info( - "Cannot use FlashAttention-2 backend due to sliding window.") - return _Backend.XFORMERS - - try: - import vllm_flash_attn # noqa: F401 - except ImportError: - logger.info( - "Cannot use FlashAttention-2 backend because the vllm_flash_attn " - "package is not found. `pip install vllm-flash-attn` for better " - "performance.") - return _Backend.XFORMERS - - backend_by_env_var = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - return _Backend[backend_by_env_var] - - # Default case. - return _Backend.FLASH_ATTN + # FlashAttn in NVIDIA GPUs. + if selected_backend == _Backend.FLASH_ATTN: + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info( + "Cannot use FlashAttention-2 backend for Volta and Turing " + "GPUs.") + selected_backend = _Backend.XFORMERS + elif dtype not in (torch.float16, torch.bfloat16): + logger.info( + "Cannot use FlashAttention-2 backend for dtype other than " + "torch.float16 or torch.bfloat16.") + selected_backend = _Backend.XFORMERS + elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"): + logger.info( + "Cannot use FlashAttention-2 backend for FP8 KV cache.") + selected_backend = _Backend.XFORMERS + elif block_size % 16 != 0: + logger.info( + "Cannot use FlashAttention-2 backend for block size not " + "divisible by 16.") + selected_backend = _Backend.XFORMERS + elif sliding_window is not None: + logger.info( + "Cannot use FlashAttention-2 backend due to sliding window.") + selected_backend = _Backend.XFORMERS + + # FlashAttn is valid for the model, checking if the package is installed. + if selected_backend == _Backend.FLASH_ATTN: + try: + import vllm_flash_attn # noqa: F401 + + from vllm.attention.backends.flash_attn import ( # noqa: F401 + FlashAttentionBackend) + + supported_sizes = FlashAttentionBackend.get_supported_head_sizes() + if head_size not in supported_sizes: + logger.info( + "Cannot use FlashAttention-2 backend for head size %d.", + head_size) + selected_backend = _Backend.XFORMERS + except ImportError: + logger.info( + "Cannot use FlashAttention-2 backend because the " + "vllm_flash_attn package is not found. 
" + "`pip install vllm-flash-attn` for better performance.") + selected_backend = _Backend.XFORMERS + + return selected_backend From dae5aafe00dc35b11d8121368121d1c71d22197e Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 23 May 2024 02:39:27 -0400 Subject: [PATCH 026/154] Marlin 24 prefill performance improvement (about 25% better on average) (#4983) --- benchmarks/kernels/benchmark_marlin.py | 74 ++++++++++++++++--- .../marlin/sparse/marlin_24_cuda_kernel.cu | 55 ++++++++++---- tests/kernels/test_marlin_gemm.py | 2 +- .../layers/quantization/gptq_marlin_24.py | 8 +- 4 files changed, 107 insertions(+), 32 deletions(-) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 5dcffc284f3d4..b771911781574 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -6,9 +6,13 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - MarlinWorkspace, marlin_quantize) + MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) @@ -44,6 +48,10 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, marlin_rand_perm, ) = marlin_quantize(b, num_bits, group_size, act_order) + # Marlin_24 quant + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, + marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) + # GPTQ quant (w_ref, q_w, s, g_idx, rand_perm) = quantize_weights(b, num_bits, group_size, act_order) @@ -56,28 +64,43 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) # Prepare - marlin_workspace = MarlinWorkspace(size_n) + marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_MAX_PARALLEL) globals = { + # Gen params + "num_bits": num_bits, + "group_size": group_size, + "size_m": size_m, + "size_n": size_n, + "size_k": size_k, + "a": a, + "a_tmp": a_tmp, + # Marlin params "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, "marlin_rand_perm": marlin_rand_perm, + "marlin_workspace": marlin_workspace, + "is_k_full": is_k_full, + # Marlin_24 params + "marlin_24_w_ref": marlin_24_w_ref, + "marlin_24_q_w_comp": marlin_24_q_w_comp, + "marlin_24_meta": marlin_24_meta, + "marlin_24_s": marlin_24_s, + "marlin_24_workspace": marlin_24_workspace, + # GPTQ params "q_w_gptq": q_w_gptq, "repack_sort_indices": repack_sort_indices, - "num_bits": num_bits, - "group_size": group_size, - "size_m": size_m, - "size_n": size_n, - "size_k": size_k, - "is_k_full": is_k_full, - "a": a, - "a_tmp": a_tmp, + # Kernels "gptq_marlin_gemm": ops.gptq_marlin_gemm, + "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, "gptq_marlin_repack": ops.gptq_marlin_repack, - "marlin_workspace": 
marlin_workspace, } min_run_time = 1 @@ -105,6 +128,18 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, description="gptq_marlin_gemm", ).blocked_autorange(min_run_time=min_run_time)) + if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES): + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_24_gemm", + ).blocked_autorange(min_run_time=min_run_time)) + results.append( benchmark.Timer( stmt= @@ -135,8 +170,20 @@ def main(args): continue for act_order in ACT_ORDER_OPTS: + if len(args.limit_act_order + ) > 0 and act_order not in args.limit_act_order: + continue + for is_k_full in K_FULL_OPTS: + if len(args.limit_k_full + ) > 0 and is_k_full not in args.limit_k_full: + continue + for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: + if len(args.limit_num_bits + ) > 0 and num_bits not in args.limit_num_bits: + continue + for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: if len( args.limit_group_size @@ -159,7 +206,7 @@ def main(args): # For quick benchmarking use: -# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 # noqa E501 +# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -178,6 +225,9 @@ def main(args): parser.add_argument("--limit-k", nargs="+", type=int, default=[]) parser.add_argument("--limit-n", nargs="+", type=int, default=[]) parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) + parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) + parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) args = parser.parse_args() main(args) diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 54ad27676e207..686dd7851e6af 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -48,12 +48,12 @@ namespace marlin_24 { // than 1 warp per schedule allows some more latency hiding. At the same time, // we want relatively few warps to have many registers per warp and small tiles. 
static constexpr int THREADS = 256; -static constexpr int STAGES = 4; // 4 pipeline stages fit into shared memory +static constexpr int STAGES = 4; static constexpr int min_thread_n = 128; static constexpr int tile_size = 16; -static constexpr int max_par = 16; +static constexpr int max_par = 64; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 @@ -736,10 +736,10 @@ __global__ void Marlin_24( for (int pipe = 0; pipe < stages;) { fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages); + matmul(pipe); wait_for_stage(); fetch_to_registers(pipe + 1, (pipe + 1) % stages); - matmul(pipe); pipe++; slice_iters--; @@ -899,9 +899,12 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, // than better compute utilization thread_k = 128; thread_m = 128; - } else { + } else if (prob_n <= 256) { thread_k = 64; thread_m = 256; + } else { + thread_k = 32; + thread_m = 512; } } @@ -928,19 +931,21 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, int4* C_ptr = (int4*)C; const int4* s_ptr = (const int4*)s; + constexpr int max_m_blocks = 4; + int* locks = (int*)workspace; - for (int i = 0; i < tot_n_blocks; i += 4) { + for (int i = 0; i < tot_n_blocks; i += max_m_blocks) { int thread_n_blocks = tot_n_blocks - i; prob_n = tot_n - 16 * i; int par = 1; - if (thread_n_blocks > 4) { + if (thread_n_blocks > max_m_blocks) { // Note that parallel > 1 currently only works for inputs without any // padding - par = (16 * thread_n_blocks - pad) / 64; + par = (16 * thread_n_blocks - pad) / (max_m_blocks * 16); if (par > max_par) par = max_par; - prob_n = 64 * par; - i += 4 * (par - 1); - thread_n_blocks = 4; + prob_n = (max_m_blocks * 16) * par; + i += max_m_blocks * (par - 1); + thread_n_blocks = max_m_blocks; } // For compilation speed, we only define the kernel configurations that have @@ -951,8 +956,9 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, if (false) { } // BMxBNxBK, group // 4-bit - CALL_IF_2_4(4, 8, 1, 4, -1) // e.g., 16x128x128 - CALL_IF_2_4(4, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(4, 8, 1, 4, -1) // e.g., 16x128x128 + CALL_IF_2_4(4, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(4, 16, 1, 2, -1) // e.g., 16x256x64 CALL_IF_2_4(4, 16, 1, 2, 4) // e.g., 16x256x64, 64 CALL_IF_2_4(4, 16, 2, 2, -1) // e.g.. 32x256x64 @@ -962,9 +968,19 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, CALL_IF_2_4(4, 16, 4, 2, -1) CALL_IF_2_4(4, 16, 4, 2, 4) + CALL_IF_2_4(4, 32, 1, 1, -1) // e.g., 16x256x64 + CALL_IF_2_4(4, 32, 1, 1, 4) // e.g., 16x256x64, 64 + CALL_IF_2_4(4, 32, 2, 1, -1) // e.g.. 32x256x64 + CALL_IF_2_4(4, 32, 2, 1, 4) + CALL_IF_2_4(4, 32, 3, 1, -1) + CALL_IF_2_4(4, 32, 3, 1, 4) + CALL_IF_2_4(4, 32, 4, 1, -1) + CALL_IF_2_4(4, 32, 4, 1, 4) + // 8-bit - CALL_IF_2_4(8, 8, 1, 4, -1) // e.g., 16x128x128 - CALL_IF_2_4(8, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(8, 8, 1, 4, -1) // e.g., 16x128x128 + CALL_IF_2_4(8, 8, 1, 4, 4) // e.g., 16x128x128, 64 + CALL_IF_2_4(8, 16, 1, 2, -1) // e.g., 16x256x64 CALL_IF_2_4(8, 16, 1, 2, 4) // e.g., 16x256x64, 64 CALL_IF_2_4(8, 16, 2, 2, -1) // e.g.. 32x256x64 @@ -973,6 +989,15 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, CALL_IF_2_4(8, 16, 3, 2, 4) CALL_IF_2_4(8, 16, 4, 2, -1) CALL_IF_2_4(8, 16, 4, 2, 4) + + CALL_IF_2_4(8, 32, 1, 1, -1) // e.g., 16x256x64 + CALL_IF_2_4(8, 32, 1, 1, 4) // e.g., 16x256x64, 64 + CALL_IF_2_4(8, 32, 2, 1, -1) // e.g.. 
32x256x64 + CALL_IF_2_4(8, 32, 2, 1, 4) + CALL_IF_2_4(8, 32, 3, 1, -1) + CALL_IF_2_4(8, 32, 3, 1, 4) + CALL_IF_2_4(8, 32, 4, 1, -1) + CALL_IF_2_4(8, 32, 4, 1, 4) else { throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]" + @@ -1062,7 +1087,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, int thread_k = -1; int thread_m = -1; int sms = -1; - int max_par = 16; + int max_par = marlin_24::max_par; int groupsize = -1; if (b_scales.size(0) > 1) { diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 587fc3901eb7c..1f8d94bad26d9 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -27,7 +27,7 @@ MARLIN_N_CHUNKS = [64, 128, 256] MARLIN_24_K_CHUNKS = [128] -MARLIN_24_N_CHUNKS = [256] +MARLIN_24_N_CHUNKS = [512] MNK_FACTORS = [ (1, 1, 1), diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index f5345c0443029..6bcfc405afe71 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -15,7 +15,7 @@ GPTQ_MARLIN_24_TILE = 16 GPTQ_MARLIN_24_MIN_THREAD_N = 128 GPTQ_MARLIN_24_MIN_THREAD_K = 128 -GPTQ_MARLIN_24_MAX_PARALLEL = 16 +GPTQ_MARLIN_24_MAX_PARALLEL = 64 GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8] GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128] @@ -53,14 +53,14 @@ def __init__( self.tile_size = 16 # Min out_features dim - self.min_n_threads = 128 + self.min_n_threads = GPTQ_MARLIN_24_MIN_THREAD_N # Min in_features dim - self.min_k_threads = 128 + self.min_k_threads = GPTQ_MARLIN_24_MIN_THREAD_K # Max parallel problems to solve at once (improves large # batch performance) - self.max_parallel = 16 + self.max_parallel = GPTQ_MARLIN_24_MAX_PARALLEL # Permutation length used by the marlin kernels. 
self.perm_len = 1024 From 05a4f64160f56045e17632dd9be2286e2ec3b98d Mon Sep 17 00:00:00 2001 From: Letian Li Date: Thu, 23 May 2024 17:08:58 +0100 Subject: [PATCH 027/154] [Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is not defined (#5009) --- .buildkite/run-cpu-test.sh | 2 +- Dockerfile.cpu | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f187d1f181724..414045fe163e5 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -11,4 +11,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py +docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 4251fddd6cc3b..aec79824213f3 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -17,4 +17,6 @@ RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.py RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install +WORKDIR /workspace/ + CMD ["/bin/bash"] From bf4c411d5e9dbad935b6305296d9a057241d5482 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Thu, 23 May 2024 09:54:48 -0700 Subject: [PATCH 028/154] [Core][1/N] Support send/recv in PyNCCL Groups (#4988) Signed-off-by: Muralidhar Andoorveedu --- tests/distributed/test_pynccl.py | 75 +++++++++++++++++-- vllm/distributed/communication_op.py | 18 +++-- .../device_communicators/pynccl.py | 34 +++++++++ .../device_communicators/pynccl_wrapper.py | 26 +++++++ vllm/distributed/parallel_state.py | 34 +++++++-- 5 files changed, 170 insertions(+), 17 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 529e75fb2c9e3..0218295a3e3f9 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -3,6 +3,7 @@ import pytest import torch +import torch.distributed from vllm.distributed.communication_op import ( # noqa graph_capture, tensor_model_parallel_all_reduce) @@ -68,7 +69,7 @@ def test_pynccl(): @worker_fn_wrapper -def multiple_tp_worker_fn(): +def multiple_allreduce_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") groups = [ torch.distributed.new_group(ranks=[0, 1], backend="gloo"), @@ -92,14 +93,14 @@ def multiple_tp_worker_fn(): @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") -def test_pynccl_multiple_tp(): +def test_pynccl_multiple_allreduce(): # this tests pynccl for multiple tp groups, in a standalone way # i.e. call `pynccl_comm.all_reduce` directly - distributed_run(multiple_tp_worker_fn, 4) + distributed_run(multiple_allreduce_worker_fn, 4) @worker_fn_wrapper -def multiple_tp_with_vllm_worker_fn(): +def multiple_allreduce_with_vllm_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") ensure_model_parallel_initialized(2, 2) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) @@ -118,10 +119,10 @@ def multiple_tp_with_vllm_worker_fn(): @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") -def test_pynccl_multiple_tp_with_vllm(): +def test_pynccl_multiple_allreduce_with_vllm(): # this tests pynccl for multiple tp groups, together with vllm # i.e. 
call `tensor_model_parallel_all_reduce` - distributed_run(multiple_tp_with_vllm_worker_fn, 4) + distributed_run(multiple_allreduce_with_vllm_worker_fn, 4) @worker_fn_wrapper @@ -151,6 +152,68 @@ def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) +@worker_fn_wrapper +def send_recv_worker_fn(): + pynccl_comm = PyNcclCommunicator() + if pynccl_comm.rank == 0: + tensor = torch.ones(16, 1024, 1024, + dtype=torch.float32).cuda(pynccl_comm.rank) + else: + tensor = torch.empty(16, 1024, 1024, + dtype=torch.float32).cuda(pynccl_comm.rank) + with pynccl_comm.change_state(enable=True): + if pynccl_comm.rank == 0: + pynccl_comm.send(tensor) + else: + pynccl_comm.recv(tensor) + result = tensor.mean().cpu().item() + assert result == 1 + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_send_recv(): + distributed_run(send_recv_worker_fn, 2) + + +@worker_fn_wrapper +def multiple_send_recv_worker_fn(): + device = torch.device(f"cuda:{torch.distributed.get_rank()}") + groups = [ + torch.distributed.new_group(ranks=[0, 2], backend="gloo"), + torch.distributed.new_group(ranks=[1, 3], backend="gloo") + ] + group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1] + pynccl_comm = PyNcclCommunicator(group=group, device=device) + if torch.distributed.get_rank() == 0: + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) + elif torch.distributed.get_rank() == 1: + tensor = 2 * torch.ones( + 16, 1024, 1024, dtype=torch.float32, device=device) + else: + tensor = torch.empty(16, + 1024, + 1024, + dtype=torch.float32, + device=device) + with pynccl_comm.change_state(enable=True): + if torch.distributed.get_rank() in [0, 1]: + pynccl_comm.send(tensor) + else: + pynccl_comm.recv(tensor) + result = tensor.mean().cpu().item() + if torch.distributed.get_rank() in [0, 2]: + assert result == 1 + else: + assert result == 2 + + +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +def test_pynccl_multiple_send_recv(): + distributed_run(multiple_send_recv_worker_fn, 4) + + def test_ncclGetUniqueId(): lib = NCCLLibrary() unique_id = lib.ncclGetUniqueId() diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 937fd4d392713..2b38ec472de66 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -6,7 +6,7 @@ import torch from torch.distributed import ProcessGroup -from .parallel_state import (get_cpu_world_group, +from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator, get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -54,13 +54,19 @@ def graph_capture(): # graph, we use either custom all-reduce kernel or PyTorch NCCL. # We always prioritize using custom all-reduce kernel but fall back # to PyTorch or pynccl if it is disabled or not supported. 
- pynccl_comm = get_tp_pynccl_communicator() - if pynccl_comm is None: - maybe_pynccl_context = nullcontext() + tp_pynccl_comm = get_tp_pynccl_communicator() + pp_pynccl_comm = get_pp_pynccl_communicator() + if not tp_pynccl_comm: + maybe_tp_pynccl_context = nullcontext() else: - maybe_pynccl_context = pynccl_comm.change_state( + maybe_tp_pynccl_context = tp_pynccl_comm.change_state( enable=True, stream=torch.cuda.current_stream()) - with maybe_pynccl_context: + if not pp_pynccl_comm: + maybe_pp_pynccl_context = nullcontext() + else: + maybe_pp_pynccl_context = pp_pynccl_comm.change_state( + enable=True, stream=torch.cuda.current_stream()) + with maybe_tp_pynccl_context, maybe_pp_pynccl_context: yield graph_capture_context diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 092a0910329ad..f5f1de0c71615 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -126,6 +126,40 @@ def all_reduce(self, ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) + def send(self, + tensor: torch.Tensor, + dst: Optional[int] = None, + stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = self.stream + if dst is None: + dst = (self.rank + 1) % self.world_size + self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), dst, + self.comm, cudaStream_t(stream.cuda_stream)) + + def recv(self, + tensor: torch.Tensor, + src: Optional[int] = None, + stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = self.stream + if src is None: + src = (self.rank - 1) % self.world_size + self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + self.comm, cudaStream_t(stream.cuda_stream)) + @contextmanager def change_state(self, enable: Optional[bool] = None, diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 43d85674b23d0..3aa3744d0d827 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -151,6 +151,22 @@ class NCCLLibrary: ncclRedOp_t, ncclComm_t, cudaStream_t ]), + # ncclResult_t ncclSend( + # const void* sendbuff, size_t count, ncclDataType_t datatype, + # int dest, ncclComm_t comm, cudaStream_t stream); + Function("ncclSend", ncclResult_t, [ + buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int, + ncclComm_t, cudaStream_t + ]), + + # ncclResult_t ncclRecv( + # void* recvbuff, size_t count, ncclDataType_t datatype, + # int src, ncclComm_t comm, cudaStream_t stream); + Function("ncclRecv", ncclResult_t, [ + buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int, + ncclComm_t, cudaStream_t + ]), + # be cautious! this is a collective call, it will block until all # processes in the communicator have called this function. 
# because Python object destruction can happen in random order, @@ -248,6 +264,16 @@ def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type, datatype, op, comm, stream)) + def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int, + dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None: + self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype, + dest, comm, stream)) + + def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int, + src: int, comm: ncclComm_t, stream: cudaStream_t) -> None: + self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src, + comm, stream)) + def ncclCommDestroy(self, comm: ncclComm_t) -> None: self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d24104e3ed276..0ebd7a15eab9b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -22,6 +22,8 @@ _TP_CA_COMMUNICATOR = None # Pipeline model parallel group that the current rank belongs to. _PP_DEVICE_GROUP: Optional[ProcessGroup] = None +_PP_CPU_GROUP: Optional[ProcessGroup] = None +_PP_PYNCCL_COMMUNICATOR = None # when people blindly call `torch.distributed.all_reduce` etc, # it will use this group. It is initialized with the `backend` @@ -55,6 +57,11 @@ def set_custom_all_reduce(enable: bool): _ENABLE_CUSTOM_ALL_REDUCE = enable +def get_pp_pynccl_communicator(): + global _PP_PYNCCL_COMMUNICATOR + return _PP_PYNCCL_COMMUNICATOR + + def get_tp_pynccl_communicator(): global _TP_PYNCCL_COMMUNICATOR return _TP_PYNCCL_COMMUNICATOR @@ -180,10 +187,11 @@ def initialize_model_parallel( _TP_CPU_GROUP = cpu_group from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - _TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( - group=_TP_CPU_GROUP, - device=_LOCAL_RANK, - ) + if tensor_model_parallel_size > 1: + _TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( + group=_TP_CPU_GROUP, + device=_LOCAL_RANK, + ) # Initialize a custom fast all-reduce implementation. if _ENABLE_CUSTOM_ALL_REDUCE: @@ -195,17 +203,26 @@ def initialize_model_parallel( ) # Build the pipeline model-parallel groups. 
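
The point-to-point helpers added here default their peer to the adjacent rank: send targets (rank + 1) % world_size and recv reads from (rank - 1) % world_size, which is the pattern the pipeline-parallel group built below needs. A rough usage sketch mirroring the new send/recv test (it assumes torch.distributed is already initialized with two ranks, one GPU per rank):

    import torch
    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator

    def p2p_worker() -> None:
        comm = PyNcclCommunicator()  # default group, as in the test above
        if comm.rank == 0:
            t = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
        else:
            t = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
        with comm.change_state(enable=True):
            if comm.rank == 0:
                comm.send(t)  # dst defaults to (rank + 1) % world_size
            else:
                comm.recv(t)  # src defaults to (rank - 1) % world_size
        assert t.mean().item() == 1
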
- global _PP_DEVICE_GROUP + global _PP_DEVICE_GROUP, _PP_CPU_GROUP + global _PP_PYNCCL_COMMUNICATOR global _PP_GLOBAL_RANKS assert _PP_DEVICE_GROUP is None, ( "pipeline model parallel group is already initialized") for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group = torch.distributed.new_group(ranks, backend=backend) + cpu_group = torch.distributed.new_group(ranks, backend="gloo") if rank in ranks: _PP_DEVICE_GROUP = group + _PP_CPU_GROUP = cpu_group _PP_GLOBAL_RANKS = ranks + if pipeline_model_parallel_size > 1: + _PP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( + group=_PP_CPU_GROUP, + device=_LOCAL_RANK, + ) + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, @@ -267,6 +284,13 @@ def get_pipeline_model_parallel_group(): return _PP_DEVICE_GROUP +def get_pipeline_model_parallel_cpu_group(): + """Get the pipeline model parallel cpu group the caller rank belongs to.""" + assert _PP_CPU_GROUP is not None, ( + "pipeline model parallel cpu group is not initialized") + return _PP_CPU_GROUP + + def get_tensor_model_parallel_world_size(): """Return world size for the tensor model parallel group.""" return torch.distributed.get_world_size( From c62366374fb9342ec7dacde54c39b08d12bc21c0 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 23 May 2024 17:29:18 -0400 Subject: [PATCH 029/154] [Kernel] Initial Activation Quantization Support (#4525) Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- CMakeLists.txt | 1 + csrc/ops.h | 3 + csrc/pybind.cpp | 3 + .../compressed_tensors/int8_quant_kernels.cu | 59 +++++ tests/kernels/test_int8_quant.py | 31 +++ tests/quantization/test_compressed_tensors.py | 36 +++ vllm/_custom_ops.py | 18 ++ vllm/model_executor/layers/linear.py | 244 ++++++++++++------ .../layers/quantization/__init__.py | 3 + .../compressed_tensors/__init__.py | 0 .../compressed_tensors/compressed_tensors.py | 151 +++++++++++ .../compressed_tensors/schemes/__init__.py | 5 + .../schemes/compressed_tensors_scheme.py | 33 +++ .../schemes/compressed_tensors_unquantized.py | 39 +++ .../compressed_tensors_w8a8_statictensor.py | 119 +++++++++ .../model_loader/weight_utils.py | 7 + vllm/model_executor/models/llama.py | 25 +- 17 files changed, 683 insertions(+), 94 deletions(-) create mode 100644 csrc/quantization/compressed_tensors/int8_quant_kernels.cu create mode 100644 tests/kernels/test_int8_quant.py create mode 100644 tests/quantization/test_compressed_tensors.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/__init__.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 35846fd1cfa99..b668cbc97de15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,6 +167,7 @@ set(VLLM_EXT_SRC "csrc/layernorm_kernels.cu" "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" + "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" 
"csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu" diff --git a/csrc/ops.h b/csrc/ops.h index f5e0e423bb65d..b839eaf0d26c8 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,6 +93,9 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, #endif +void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input, + float scale); + void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor lookup_table); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index cba07f0ae9f2a..cdbec4a34d77f 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -67,6 +67,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "Aligning the number of tokens to be processed by each expert such " "that it is divisible by the block size."); + ops.def("static_scaled_int8_quant", &static_scaled_int8_quant, + "Compute int8 quantized tensor for given scaling factor"); + // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); cache_ops.def("swap_blocks", &swap_blocks, diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu new file mode 100644 index 0000000000000..4902e4c23434c --- /dev/null +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -0,0 +1,59 @@ +#include +#include +#include + +#include "../../dispatch_utils.h" + +static inline __device__ int8_t float_to_int8_rn(float x) { +#ifdef USE_ROCM + static const float i8_min = + static_cast(std::numeric_limits::min()); + static const float i8_max = + static_cast(std::numeric_limits::max()); + // round + float dst = std::nearbyint(x); + // saturate + dst = std::clamp(dst, i8_min, i8_max); + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); + return reinterpret_cast(dst); +#endif +} + +namespace vllm { + +template +__global__ void static_scaled_int8_quant_kernel( + const scalar_t* __restrict__ input, int8_t* __restrict__ out, + scale_type scale, const int hidden_size) { + const int tid = threadIdx.x; + const int token_idx = blockIdx.x; + + for (int i = tid; i < hidden_size; i += blockDim.x) { + out[token_idx * hidden_size + i] = + float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale); + } +} +} // namespace vllm + +void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + float scale) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.is_contiguous()); + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { + vllm::static_scaled_int8_quant_kernel + <<>>(input.data_ptr(), + out.data_ptr(), scale, + hidden_size); + }); +} diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py new file mode 100644 index 0000000000000..b9aa00ce13f56 --- /dev/null +++ b/tests/kernels/test_int8_quant.py @@ -0,0 +1,31 @@ +import pytest +import torch + +from vllm._C import ops + +DTYPES = [torch.half, torch.bfloat16, torch.float] +HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing +NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing +SEEDS = [0] +SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] + + 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("scale", SCALE) +@torch.inference_mode() +def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, + seed: int, scale: float) -> None: + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 + + out1 = (x / scale).round().clamp( + torch.iinfo(torch.int8).min, + torch.iinfo(torch.int8).max).to(torch.int8) + out2 = torch.empty_like(x, dtype=torch.int8) + ops.static_scaled_int8_quant(out2, x, scale) + assert torch.allclose(out1, out2, + atol=1) # big atol to account for rounding errors diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py new file mode 100644 index 0000000000000..b83286992da3d --- /dev/null +++ b/tests/quantization/test_compressed_tensors.py @@ -0,0 +1,36 @@ +"""Test model set-up and weight loading for sparseml-quantized models. + +Run `pytest tests/quantization/test_compressed_tensors.py`. +""" + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor) + + +def test_compressed_tensors_w8a8_static_setup(vllm_runner): + model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" + llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True) + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod) + + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + + assert qkv_proj.weight.dtype is torch.int8 + assert o_proj.weight.dtype is torch.int8 + assert gate_up_proj.weight.dtype is torch.int8 + + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None + assert qkv_proj.input_scale.dtype is torch.float32 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 9e7d0d96bf004..f0fab4d8aa26d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -251,6 +251,24 @@ def scaled_fp8_quant( return output, scale +# int8 +def static_scaled_int8_quant(input: torch.Tensor, + scale: float) -> torch.Tensor: + """ + Quantize the input tensor to int8 and return the quantized tensor. + + Args: + input: The input tensor to be quantized to int8. + scale: Scaling factor for the int8 quantization. + + Returns: + torch.Tensor: Output tensor in int8. 
+ """ + q = torch.empty_like(input, dtype=torch.int8) + vllm_ops.static_scaled_int8_quant(q, input, scale) + return q + + # moe def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, block_size: int, sorted_token_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4fcc7eee09cde..0a26cadf90bb4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -58,7 +58,6 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: """Apply the weights in layer to the input tensor. - Expects create_weights to have been called before on the layer.""" raise NotImplementedError @@ -79,8 +78,7 @@ def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - output_size_per_partition = sum(output_partition_sizes) - weight = Parameter(torch.empty(output_size_per_partition, + weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, dtype=params_dtype), requires_grad=False) @@ -151,15 +149,13 @@ class ReplicatedLinear(LinearBase): quant_config: Quantization configure. """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -212,17 +208,15 @@ class ColumnParallelLinear(LinearBase): the list would be size 3. """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[List[int]] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -230,18 +224,26 @@ def __init__( # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, tp_size) + assert self.quant_method is not None + self.output_size_per_partition = divide(self.output_size, tp_size) + self.output_partition_sizes = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. + if hasattr(self, "output_sizes"): + self.output_partition_sizes = [ + divide(output_size, tp_size) + for output_size in self.output_sizes + ] + if output_sizes is None: output_sizes = [output_size] - # All the linear layer supports quant method. 
- assert self.quant_method is not None - self.quant_method.create_weights(self, - self.input_size, - [x // tp_size for x in output_sizes], - self.input_size, - self.output_size, - self.params_dtype, - weight_loader=self.weight_loader) + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=self.weight_loader) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -323,24 +325,26 @@ class MergedColumnParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. """ - def __init__( - self, - input_size: int, - output_sizes: List[int], - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + input_size: int, + output_sizes: List[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None): self.output_sizes = output_sizes # UPSTREAM SYNC: needed for LazyCompressedParameter self.loaded_shards: Set[int] = set() tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, quant_config, - self.output_sizes) + super().__init__(input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config) def weight_loader(self, param: Parameter, @@ -351,6 +355,26 @@ def weight_loader(self, output_dim = getattr(param, "output_dim", None) # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) + + param_shard_splitter = getattr(param, "shard_splitter", None) + + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + # If a parameter has defined a shard_splitter to be used for + # the weight, it should be applied before the weight is + # loaded/copied to the parameter. The shard_splitter applies + # logic by using the loaded_shard_id to ensure that the loaded + # param is loaded to the correct location + # within the parameter defined by the linear method. + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + # Special case for Fp8 scales. fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", None) @@ -411,6 +435,13 @@ def weight_loader(self, shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(0, shard_offset, shard_size) + + # If a param_shard_splitter is defined by the LinearMethod, use it. + elif param_shard_splitter is not None: + logical_widths = getattr(param, "logical_widths", None) + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id, logical_widths) + # Special case for Fp8 scales. 
elif fp8_scales_shard_indexer is not None: param_data, loaded_weight = fp8_scales_shard_indexer( @@ -424,6 +455,14 @@ def weight_loader(self, "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") + if fp8_scales_shard_indexer is None: + if len(param_data.shape) == 0: + param_data = param_data.reshape(1) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + # UPSTREAM SYNC: needed for LazyCompressedParameter self.loaded_shards.add(loaded_shard_id) assert param_data.shape == loaded_weight.shape @@ -463,17 +502,15 @@ class QKVParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. """ - def __init__( - self, - hidden_size: int, - head_size: int, - total_num_heads: int, - total_num_kv_heads: Optional[int] = None, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + hidden_size: int, + head_size: int, + total_num_heads: int, + total_num_kv_heads: Optional[int] = None, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None): self.hidden_size = hidden_size self.head_size = head_size self.total_num_heads = total_num_heads @@ -495,14 +532,19 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - output_sizes = [ - self.num_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size, - self.num_kv_heads * tp_size * self.head_size + self.output_sizes = [ + self.num_heads * self.head_size * tp_size, # q_proj + self.num_kv_heads * self.head_size * tp_size, # k_proj + self.num_kv_heads * self.head_size * tp_size, # v_proj ] - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, quant_config, output_sizes) + super().__init__(input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config) def weight_loader(self, param: Parameter, @@ -512,6 +554,26 @@ def weight_loader(self, output_dim = getattr(param, "output_dim", None) # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) + + param_shard_splitter = getattr(param, "shard_splitter", None) + + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + # If a parameter has defined a shard_splitter to be used for + # the weight, it should be applied before the weight is + # loaded/copied to the parameter. The shard_splitter applies + # logic by using the loaded_shard_id to ensure that the loaded + # param is loaded to the correct location + # within the parameter defined by the linear method. + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + # Special case for Fp8 scales. fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", None) @@ -550,6 +612,8 @@ def weight_loader(self, tp_rank = get_tensor_model_parallel_rank() assert loaded_shard_id in ["q", "k", "v"] + + # If output dim is defined, use the default loading process. 
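To make the fused q/k/v widths above concrete, here is a small arithmetic sketch; the head counts, head size, and tensor-parallel size are illustrative only and not taken from any particular model config:

# Hypothetical grouped-query-attention shape: 32 query heads, 8 KV heads,
# head_size 128, tensor-parallel size 2.
total_num_heads, total_num_kv_heads, head_size, tp_size = 32, 8, 128, 2

num_heads = total_num_heads // tp_size        # query heads per rank -> 16
num_kv_heads = total_num_kv_heads // tp_size  # KV heads per rank    -> 4

# Full (unpartitioned) widths of the fused projection, as in output_sizes:
output_sizes = [
    num_heads * head_size * tp_size,     # q_proj -> 4096
    num_kv_heads * head_size * tp_size,  # k_proj -> 1024
    num_kv_heads * head_size * tp_size,  # v_proj -> 1024
]
print(output_sizes, sum(output_sizes))  # [4096, 1024, 1024] 6144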
if output_dim is not None: if loaded_shard_id == "q": shard_offset = 0 @@ -589,6 +653,12 @@ def weight_loader(self, shard_index = ["q", "k", "v"].index(loaded_shard_id) param_data = param_data.narrow(0, shard_index * shard_size, shard_size) + # If a param_shard_splitter is defined by the LinearMethod, use it. + elif param_shard_splitter is not None: + logical_widths = getattr(param, "logical_widths", None) + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id, logical_widths) + # Special case for Fp8 scales. elif fp8_scales_shard_indexer is not None: param_data, loaded_weight = fp8_scales_shard_indexer( @@ -600,6 +670,13 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " "for all partitions.") + + if len(param_data.shape) == 0: + param_data = param_data.reshape(1) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -639,17 +716,15 @@ class RowParallelLinear(LinearBase): quant_config: Quantization configure. """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - input_is_parallel: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - reduce_results: bool = True, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = True, + quant_config: Optional[QuantizationConfig] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -659,16 +734,15 @@ def __init__( # Divide the weight matrix along the last dimension. self.tp_size = get_tensor_model_parallel_world_size() self.input_size_per_partition = divide(input_size, self.tp_size) - # All the linear layer supports quant method. assert self.quant_method is not None - self.quant_method.create_weights(self, - self.input_size_per_partition, - [self.output_size], - self.input_size, - self.output_size, - self.params_dtype, - weight_loader=self.weight_loader) - + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=[self.output_size], + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=self.weight_loader) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") @@ -696,12 +770,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + # Special case for Fp8 scales. 
elif fp8_scales_shard_indexer is not None: param_data, loaded_weight = fp8_scales_shard_indexer(param_data, loaded_weight, shard_id=0) + if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index f938e7d37ec5f..7b9abe1b629a1 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsConfig) from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig) from vllm.model_executor.layers.quantization.fp8 import Fp8Config @@ -27,6 +29,7 @@ "gptq_marlin": GPTQMarlinConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, + "sparseml": CompressedTensorsConfig, } diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py new file mode 100644 index 0000000000000..19e464bd64325 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -0,0 +1,151 @@ +from typing import Any, Dict, List, Optional + +import torch + +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, CompressedTensorsW8A8StaticTensor) + + +class CompressedTensorsConfig(QuantizationConfig): + + def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str]): + self.ignore = ignore + self.layer_quant_details = layer_quant_details + + def get_linear_method(self) -> "CompressedTensorsLinearMethod": + return CompressedTensorsLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16] + + # Need to figure it out + def get_min_capability(self) -> int: + return 60 + + def get_name(self) -> str: + return "compressed_tensors" + + def get_quant_method( + self, layer: torch.nn.Module + ) -> Optional["CompressedTensorsLinearMethod"]: + if isinstance(layer, LinearBase): + return CompressedTensorsLinearMethod(self) + return None + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": + layer_quant_details: Dict[str, Any] = dict() + ignore: List[str] = config.get("ignore", None) + + for key, quant_config in config["config_groups"].items(): + targets = quant_config.get("targets") + for target in targets: + layer_quant_details[target] = {} + layer_quant_details[target]["weight"] = quant_config.get( + "weights") + layer_quant_details[target]["input"] = quant_config.get( + "input_activations") + + return cls(layer_quant_details=layer_quant_details, ignore=ignore) + + 
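As a rough illustration of the flattening that from_config performs above, the sketch below runs the same loop over a hypothetical config dict; the field values are invented for illustration and do not come from a real checkpoint:

example_config = {
    "ignore": ["lm_head"],
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"num_bits": 8, "strategy": "tensor", "symmetric": True},
            "input_activations": {"num_bits": 8, "strategy": "tensor", "symmetric": True},
        }
    },
}

layer_quant_details = {}
for quant_config in example_config["config_groups"].values():
    for target in quant_config.get("targets"):
        layer_quant_details[target] = {
            "weight": quant_config.get("weights"),
            "input": quant_config.get("input_activations"),
        }

print(layer_quant_details["Linear"]["weight"]["num_bits"])  # 8
print(example_config.get("ignore"))                         # ['lm_head']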
@classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + def _get_schema(self, weight_quant: Dict, input_quant: Dict): + # TODO: Refactor as additional cases are supported + + weight_bit = weight_quant.get("num_bits") + input_bit = input_quant.get("num_bits") + + weight_strategy = weight_quant.get("strategy") + input_strategy = input_quant.get("strategy") + + weight_symmetric = weight_quant.get("symmetric") + input_symmetric = input_quant.get("symmetric") + + is_8_bits = weight_bit == input_bit == 8 + is_tensor = weight_strategy == input_strategy == "tensor" + is_symmetric = weight_symmetric and input_symmetric + + if is_8_bits and is_tensor and is_symmetric and \ + torch.cuda.is_available(): + # CompressedTensorsW8A8StaticTensor only supports CUDA path for + # now. + return CompressedTensorsW8A8StaticTensor() + raise NotImplementedError( + "Scheme not supported. Only CUDA, 8-bit static symmtetric " + "per tensor quantization is currently supported") + + def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": + + # TODO: update with matching function from `compressed_tensors` + layer_type_name = None + layer_name_class = type(layer).__name__.lower() + for target in self.layer_quant_details: + if target.lower() in layer_name_class: + layer_type_name = target + break + if layer_type_name is None: + raise ValueError(f"Could not matching target for layer {layer}") + + layer_quant_details: Dict[str, Any] = self.layer_quant_details.get( + layer_type_name, None) + if layer_quant_details is None: + raise ValueError( + f"Could not find quantization details for {layer}.") + + return self._get_schema(weight_quant=layer_quant_details["weight"], + input_quant=layer_quant_details["input"]) + + +class CompressedTensorsLinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: CompressedTensorsConfig): + self.quantization_config = quantization_config + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. + """ + weight_loader = extra_weight_attrs.get("weight_loader") + + scheme = self.quantization_config.get_scheme(layer=layer) + scheme.create_weights( + layer=layer, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader) + + layer.scheme = scheme + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. 
+ """ + + if bias is not None: + raise ValueError("bias is not supported for this linear method") + + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py new file mode 100644 index 0000000000000..831905b63e2c9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -0,0 +1,5 @@ +from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 +from .compressed_tensors_unquantized import ( # noqa: F401 + CompressedTensorsUnquantized) +from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 + CompressedTensorsW8A8StaticTensor) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py new file mode 100644 index 0000000000000..3a5904208656e --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -0,0 +1,33 @@ +from abc import ABC, abstractmethod + +import torch + +__all__ = ["CompressedTensorsScheme"] + + +class CompressedTensorsScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by CompressedTensors. + """ + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + """ + Run the forward pass for the particular scheme. This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: toch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + + """ + raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py new file mode 100644 index 0000000000000..0cfac13d1ca25 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -0,0 +1,39 @@ +from typing import Callable, List + +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsUnquantized"] + + +class CompressedTensorsUnquantized(CompressedTensorsScheme): + """ + Implements the scheme for all layers which are ignored + in the CompressedTensors config. The input and loaded weight are used + in a linear transformation. 
+ """ + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + device="cuda", + dtype=params_dtype), + requires_grad=False) + + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, {"weight_loader": weight_loader}) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + weight = layer.weight + return F.linear(x, weight) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py new file mode 100644 index 0000000000000..d16e570d12202 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -0,0 +1,119 @@ +from typing import Callable, List, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as custom_ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A8StaticTensor"] + + +class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + qkv_idxs = {"q": 0, "k": 1, "v": 2} + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + def scales_shard_splitter( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self._shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. + loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + # TODO: remove zero_point parameters once the configs given remove them + + # Note on input/weight scales and zero_points + # + # When the scales have a single value, it is required that they be + # on the CPU for 2 reasons, + # 1. Performance: + # When the scales (input_scale/weight_scales) have only a single + # value, we perform a scalar broadcast of that value during the + # quant/dequant operations. The "quant" and the "gemm+dequant" + # kernels accept the Scalar by-value. These tensors are allocated + # on the CPU in order to avoid the GPU-to-CPU copy when passing + # by-value. + # + # 2. CUDA Graphs: + # CUDA Graphs don't support GPU-to-CPU copy operations during + # stream capture. + # + # TODO: zero-points are not supported yet. But we expect a similar + # pattern. 
+ + is_tensor_partitioned = len(output_partition_sizes) != 1 + weight_scale_dim = sum( + output_partition_sizes) if is_tensor_partitioned else 1 + weight_scale_device = "cpu" if weight_scale_dim == 1 else "cuda" + + input_scale = Parameter(torch.empty(1, + device="cpu", + dtype=torch.float32), + requires_grad=False) + input_zero_point = Parameter(torch.empty(1, + device="cpu", + dtype=torch.int8), + requires_grad=False) + + weight_scale = Parameter(torch.empty(weight_scale_dim, + device=weight_scale_device, + dtype=torch.float32), + requires_grad=False) + weight_zero_point = Parameter(torch.empty(1, + device="cpu", + dtype=torch.int8), + requires_grad=False) + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + + layer.register_parameter("weight", weight) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + + set_weight_attrs(weight, {"weight_loader": weight_loader}) + + layer.register_parameter("input_scale", input_scale) + set_weight_attrs(input_scale, {"weight_loader": weight_loader}) + layer.register_parameter("input_zero_point", input_zero_point) + set_weight_attrs(input_zero_point, {"weight_loader": weight_loader}) + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) + set_weight_attrs( + weight_scale, { + "shard_splitter": self.scales_shard_splitter, + "logical_widths": output_partition_sizes + }) + layer.register_parameter("weight_zero_point", weight_zero_point) + set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader}) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + weight = layer.weight + weight_scale = layer.weight_scale + act_scale = layer.input_scale + + # Input quantize + x_q = custom_ops.static_scaled_int8_quant(x, act_scale[0].item()) + + return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale, + weight_scale, x.dtype) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 4e826256bdba7..ecad5041099d8 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,6 +135,13 @@ def get_quant_config(model_config: ModelConfig, # Read the quantization config from the HF model config, if available. 
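As a rough pure-PyTorch reference for what the quantize-then-GEMM path above computes, assuming symmetric, static, per-tensor scales and round-to-nearest with saturation; this is a numerical sketch for intuition, not the actual CUTLASS kernel:

import torch

def static_int8_linear_reference(x, weight_int8, weight_scale, act_scale):
    # Quantize activations with the static per-tensor activation scale.
    x_q = torch.clamp(torch.round(x.float() / act_scale), -128, 127).to(torch.int8)
    # Integer GEMM followed by dequantization with both scales
    # (done in float here purely for readability).
    acc = x_q.float() @ weight_int8.float().t()
    return (acc * act_scale * weight_scale).to(x.dtype)

x = torch.randn(4, 64, dtype=torch.float16)
w_q = torch.randint(-128, 128, (32, 64), dtype=torch.int8)  # [out, in]
y = static_int8_linear_reference(x, w_q, weight_scale=0.01, act_scale=0.05)
print(y.shape)  # torch.Size([4, 32])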
hf_quant_config = getattr(model_config.hf_config, "quantization_config", None) + if hf_quant_config is None: + compression_config = getattr(model_config.hf_config, + "compression_config", None) + if compression_config is not None: + hf_quant_config = compression_config.get("quantization_config", + None) + if hf_quant_config is not None: return quant_cls.from_config(hf_quant_config) model_name_or_path = model_config.model diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f43a40a0bfd34..086f9294c4f1c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -62,11 +62,12 @@ def __init__( ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, bias=bias, quant_config=quant_config) if hidden_act != "silu": @@ -120,16 +121,16 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, ) self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, bias=bias, quant_config=quant_config, ) @@ -263,8 +264,10 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, cache_config, quant_config) - for _ in range(config.num_hidden_layers) + LlamaDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config) + for idx in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) From a9ca32d16d0fbd33c75f65fc629895a28227c794 Mon Sep 17 00:00:00 2001 From: Elisei Smirnov <61423871+kezouke@users.noreply.github.com> Date: Fri, 24 May 2024 01:04:24 +0300 Subject: [PATCH 030/154] [Core]: Option To Use Prompt Token Ids Inside Logits Processor (#4985) Co-authored-by: Elisei Smirnov --- vllm/model_executor/layers/logits_processor.py | 17 ++++++++++++++--- vllm/sampling_params.py | 15 ++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 91eb96998c3cf..d450c46455d49 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,4 +1,5 @@ """A layer that compute logits from hidden_stats.""" +import inspect from typing import Optional import torch @@ -95,15 +96,25 @@ def _apply_logits_processors( seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params logits_processors = sampling_params.logits_processors - if logits_processors: found_logits_processors = True + for seq_id, logits_row_idx in zip(seq_ids, seq_group.sample_indices): logits_row = logits[logits_row_idx] - token_ids = seq_group.seq_data[seq_id].output_token_ids + past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids + prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids + for logits_processor in logits_processors: - 
logits_row = logits_processor(token_ids, logits_row) + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, + past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, + logits_row) + logits[logits_row_idx] = logits_row logits_processed += len(seq_group.sample_indices) + len( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5fa94eb149ffb..9d8a361353e26 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -18,10 +18,14 @@ class SamplingType(IntEnum): BEAM = 3 -LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor] -"""LogitsProcessor is a function that takes a list of previously generated -tokens and a tensor of the logits for the next token, and returns a modified -tensor of logits to sample from.""" +LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], + Callable[[List[int], List[int], torch.Tensor], + torch.Tensor]] +"""LogitsProcessor is a function that takes a list +of previously generated tokens, the logits tensor +for the next token and, optionally, prompt tokens as a +first argument, and returns a modified tensor of logits +to sample from.""" class SamplingParams: @@ -95,7 +99,8 @@ class SamplingParams: spaces_between_special_tokens: Whether to add spaces between special tokens in the output. Defaults to True. logits_processors: List of functions that modify logits based on - previously generated tokens. + previously generated tokens, and optionally prompt tokens as + a first argument. truncate_prompt_tokens: If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None (i.e., no truncation). From 0eb33b13c33ff1b0a2af38c27dd2339ac23c8830 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 23 May 2024 16:21:54 -0700 Subject: [PATCH 031/154] [Doc] add ccache guide in doc (#5012) Co-authored-by: Michael Goin --- docs/source/getting_started/installation.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 0c81f7ec6d2a9..ba23e7468dcc1 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -56,6 +56,10 @@ You can also build and install vLLM from source: $ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability $ pip install -e . # This may take 5-10 minutes. +.. tip:: + + Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache `_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster. + .. tip:: To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. 
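A user-side sketch of the new three-argument processor form accepted above; the banned-token logic is purely illustrative:

from typing import List
import torch

def no_prompt_echo(prompt_token_ids: List[int],
                   past_token_ids: List[int],
                   logits: torch.Tensor) -> torch.Tensor:
    # Because this callable takes three parameters, the engine passes the
    # prompt token ids as the first argument; two-parameter processors keep
    # receiving only the previously generated tokens.
    logits[list(set(prompt_token_ids))] = -float("inf")
    return logits

# Quick standalone check of the masking behaviour:
logits = torch.zeros(10)
print(no_prompt_echo([1, 2, 2], [], logits)[:4])  # tensor([0., -inf, -inf, 0.])

# In practice the processor is attached via the documented field, e.g.
# SamplingParams(logits_processors=[no_prompt_echo]).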
For example: From acf362ca3fc385b77172b559da86c6f13ec5596c Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 24 May 2024 14:28:27 +0200 Subject: [PATCH 032/154] [Kernel] Initial Activation Quantization Support (#4525) Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/models/test_mistral.py | 1 + vllm/model_executor/model_loader/loader.py | 17 ++++- .../model_loader/weight_utils.py | 64 ++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index b600ff454491f..290d68501bc5c 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -8,6 +8,7 @@ MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", + "mistralai/Mistral-7B-Instruct-v0.3", ] diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 4b02fb7fcf984..fa9866abf7d28 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -24,7 +24,8 @@ set_default_torch_dtype) # UPSTREAM SYNC: needed for sparsity from vllm.model_executor.model_loader.weight_utils import ( - download_weights_from_hf, filter_files_not_needed_for_inference, + download_safetensors_index_file_from_hf, download_weights_from_hf, + filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, get_quant_config, get_sparse_config, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) @@ -209,7 +210,19 @@ def _prepare_weights(self, model_name_or_path: str, use_safetensors = True break - if not use_safetensors: + if use_safetensors: + # For models like Mistral-7B-Instruct-v0.3 + # there are both sharded safetensors files and a consolidated + # safetensors file. Using both breaks. + # Here, we download the `model.safetensors.index.json` and filter + # any files not found in the index. + if not is_local: + download_safetensors_index_file_from_hf( + model_name_or_path, self.load_config.download_dir, + revision) + hf_weights_files = filter_duplicate_safetensors_files( + hf_weights_files, hf_folder) + else: hf_weights_files = filter_files_not_needed_for_inference( hf_weights_files) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index ecad5041099d8..a251828b45eba 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,9 +12,10 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import HfFileSystem, snapshot_download +from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import LoadConfig, ModelConfig from vllm.logger import init_logger @@ -233,6 +234,67 @@ def download_weights_from_hf( return hf_folder +def download_safetensors_index_file_from_hf( + model_name_or_path: str, + cache_dir: Optional[str], + revision: Optional[str] = None, +) -> None: + """Download hf safetensors index file from Hugging Face Hub. + + Args: + model_name_or_path (str): The model name or path. + cache_dir (Optional[str]): The cache directory to store the model + weights. If None, will use HF defaults. + revision (Optional[str]): The revision of the model. 
+ """ + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model_name_or_path, cache_dir): + try: + # Download the safetensors index file. + hf_hub_download( + repo_id=model_name_or_path, + filename=SAFE_WEIGHTS_INDEX_NAME, + cache_dir=cache_dir, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ) + # If file not found on remote or locally, we should not fail since + # only some models will have SAFE_WEIGHTS_INDEX_NAME. + except huggingface_hub.utils.EntryNotFoundError: + logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME) + except huggingface_hub.utils.LocalEntryNotFoundError: + logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME) + + +# For models like Mistral-7B-v0.3, there are both sharded +# safetensors files and a consolidated safetensors file. +# Passing both of these to the weight loader functionality breaks. +# So, we use the SAFE_WEIGHTS_INDEX_NAME to +# look up which safetensors files should be used. +def filter_duplicate_safetensors_files(hf_weights_files: List[str], + hf_folder: str) -> List[str]: + # model.safetensors.index.json is a mapping from keys in the + # torch state_dict to safetensors file holding that weight. + index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME) + if not os.path.isfile(index_file_name): + return hf_weights_files + + # Iterate through the weight_map (weight_name: safetensors files) + # to identify weights that we should use. + with open(index_file_name) as index_file: + weight_map = json.load(index_file)["weight_map"] + weight_files_in_index = set() + for weight_name in weight_map: + weight_files_in_index.add( + os.path.join(hf_folder, weight_map[weight_name])) + # Filter out any fields that are not found in the index file. + hf_weights_files = [ + f for f in hf_weights_files if f in weight_files_in_index + ] + return hf_weights_files + + def filter_files_not_needed_for_inference( hf_weights_files: List[str]) -> List[str]: """ From 1226d5d977a9d9613e0225576fa98c3373f926e2 Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Sat, 25 May 2024 01:07:09 +0800 Subject: [PATCH 033/154] [Core][Bugfix]: fix prefix caching for blockv2 (#4764) Co-authored-by: Lei Wen --- tests/core/block/test_prefix_caching_block.py | 117 ++++++++++++++++++ vllm/core/block/prefix_caching_block.py | 41 +++--- 2 files changed, 141 insertions(+), 17 deletions(-) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index c4c680e109a84..bcf08cda09f46 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -410,6 +410,123 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int, assert (len(res) == zero_point_blocks) + # Test case that assume those prompted block after first immutable would + # be freed into hashless allocator, while first immutable block get ref + # increased. 
+ @staticmethod + @pytest.mark.parametrize("num_blocks", [3]) + @pytest.mark.parametrize("block_size", [16]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_alloc_promotion(num_blocks: int, block_size: int, seed: int): + random.seed(seed) + + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(block_size)) + + block = allocator.allocate_immutable(prev_block=None, + token_ids=token_ids) + + assert allocator._refcounter.get(block.block_id) == 1 + m = allocator.allocate_mutable(prev_block=None) + + block_id = m.block_id + for i in range(block_size): + m.append_token_ids([i]) + # After block get promoted to immutable from mutable, if there is + # already same content hash block, then it shall be released into + # hashless_allocator + # And first immutable block's ref get increased by 1 + assert m.block_id == block.block_id + assert block_id in allocator._hashless_allocator._free_block_indices + assert allocator._refcounter.get(block.block_id) == 2 + + # Test case when eviction and allocation are mixed, + # make sure they work as expected + @staticmethod + @pytest.mark.parametrize("num_blocks", [3]) + @pytest.mark.parametrize("block_size", [16]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): + random.seed(seed) + + all_blocks_list = [i for i in range(num_blocks)] + zero_ref = {i: 0 for i in range(num_blocks)} + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(num_blocks * block_size)) + + # now we have num_blocks free blocks in hashless allocator + # with internal tracking list _blocks _cached_blocks and evictor + # empty and block's ref shall be 0 + assert list(allocator._hashless_allocator._free_block_indices + ) == all_blocks_list + assert len(allocator._blocks.keys()) == 0 + assert len(allocator._cached_blocks.values()) == 0 + assert len(allocator.evictor.free_table.keys()) == 0 + assert allocator._refcounter._refcounts == zero_ref + + # Allocate immutable chains with only one block residuled in + new_block = [] + for i in range(num_blocks): + block = allocator.allocate_immutable( + prev_block=None, + token_ids=token_ids[block_size * i:block_size * (i + 1)]) + new_block.append(block) + + # Free all blocks, and now all blocks shall be in the evictor + # there shall be no tracking data left in _blocks + # all blocks shall be tracked in _cached_blocks + # all blocks' ref shall be zero + for block in new_block: + allocator.free(block) + + assert len(allocator._blocks.keys()) == 0 + assert len(allocator._hashless_allocator._free_block_indices) == 0 + assert list(allocator._cached_blocks.values()) == all_blocks_list + assert list(allocator.evictor.free_table.keys()) == all_blocks_list + assert allocator._refcounter._refcounts == zero_ref + + # Allocate a mutable block, and the first block shall be evicted + # and set its content hash into None, ref to 1 + mutable = allocator.allocate_mutable(prev_block=None) + + assert mutable.block_id == 0 + assert mutable.content_hash is None + assert 0 in allocator._blocks + assert allocator._refcounter.get(0) == 1 + assert 0 not in allocator._cached_blocks + assert 0 not in allocator.evictor + + # Since this mutable block has no hash yet, it shall be released into + # hashless allocator + allocator.free(mutable) + + assert len(allocator._blocks.keys()) == 0 + assert allocator._refcounter._refcounts == zero_ref + assert 0 not in 
allocator._cached_blocks + assert 0 not in allocator.evictor + assert 0 in allocator._hashless_allocator._free_block_indices + + # when allocate immutable with first block_size tokens, we + # shall get free block from hashless allocator, thus no block left + # in hashless + block = allocator.allocate_immutable(prev_block=None, + token_ids=token_ids[:block_size]) + + assert block.block_id == 0 + assert len(allocator._hashless_allocator._free_block_indices) == 0 + assert 0 in allocator._blocks + assert 0 in allocator._cached_blocks.values() + assert allocator._refcounter.get(0) == 1 + assert 0 not in allocator.evictor + + # allocate mutable block again, it shall be popped from evictor + mutable = allocator.allocate_mutable(prev_block=None) + assert len(allocator._hashless_allocator._free_block_indices) == 0 + assert mutable.block_id not in allocator.evictor.free_table + assert allocator._refcounter.get(mutable.block_id) == 1 + # Test case where two last accessed times are equal @staticmethod @pytest.mark.parametrize("num_blocks", [1024]) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 882f301c1f697..4eb32f145b05b 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -160,21 +160,17 @@ def allocate_mutable(self, # If the evictor has blocks available for eviction, evict a block # and return it. if self.evictor.num_blocks > 0: + # here we get an evicted block, which is only added + # into evictor if its ref counter is 0 + # and since its content would be changed, we need + # to remove it from _cached_blocks's tracking list block_id, content_hash_to_evict = self.evictor.evict() - # Here we may have scenario that several blocks have - # the same content hash, but due to the latter coming block - # is coming from mutable to immutable path, their physical - # block is added into evictor. - # However in this case, we shall not pop the _cached_blocks, - # as the same content is still used by others, which means - # we need to check ref before decide to pop the list. - _block_id = self._cached_blocks[content_hash_to_evict] - refcount = self._refcounter.get(_block_id) - if refcount == 1: - self._cached_blocks.pop(content_hash_to_evict) - assert _block_id == block_id + assert self._refcounter.get(_block_id) == 0 + assert _block_id == block_id + + self._cached_blocks.pop(content_hash_to_evict) self._refcounter.incr(block_id) @@ -199,7 +195,11 @@ def allocate_mutable(self, def _incr_refcount_cached_block(self, block: Block, block_id: BlockId) -> None: - # since block is already computed, mark it + # now _incr_refcount_cached_block comes from two place + # allocate_immutable/promote_to_immutable_block where hit + # _cached_blocks hash key. + # In both cases, it means that already exists a already + # computed block which shared with block now block.computed = True refcount = self._refcounter.incr(block_id) @@ -228,13 +228,19 @@ def _free_block_id_for_block(self, block_id: BlockId, block: Block) -> None: assert isinstance(block, PrefixCachingBlock) - if block.content_hash is None: + # if we comes from promote_to_immutable_block, it means that + # block.content_hash is never None. + # However we need to release the same content block, so that + # physical block could get reused. 
+ if block.block_id != block_id or block.content_hash is None: refcount = self._refcounter.get(block_id) # We have fork case where block would get more than one ref, # so we cannot free it from tracking if ref cnt large than 1 - if refcount <= 1: - assert block.block_id is not None + assert block.block_id is not None + refcount = self._refcounter.get(block.block_id) + if refcount == 1: del self._blocks[block.block_id] + return self._hashless_allocator.free(block) refcount = self._refcounter.decr(block_id) @@ -317,7 +323,8 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: if block.content_hash not in self._cached_blocks: self._cached_blocks[block.content_hash] = block.block_id else: - self._free_block_id_for_block(block.block_id, block) + self._free_block_id_for_block( + self._cached_blocks[block.content_hash], block) self._incr_refcount_cached_block( block, self._cached_blocks[block.content_hash]) From 29a20983f688830d6832923d6c5e9990bbedb0aa Mon Sep 17 00:00:00 2001 From: Eric Xihui Lin Date: Sat, 25 May 2024 01:00:52 -0400 Subject: [PATCH 034/154] [Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3-Small model (#4799) Co-authored-by: beagleski Co-authored-by: bapatra Co-authored-by: Barun Patra Co-authored-by: Michael Goin --- csrc/attention/attention_kernels.cu | 185 ++++++-- csrc/cpu/attention.cpp | 37 +- csrc/ops.h | 35 +- docs/source/models/supported_models.rst | 4 + tests/kernels/test_blocksparse_attention.py | 442 +++++++++++++++++ vllm/_custom_ops.py | 30 +- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 410 ++++++++++++++++ vllm/attention/backends/flash_attn.py | 5 +- vllm/attention/backends/rocm_flash_attn.py | 5 +- vllm/attention/backends/torch_sdpa.py | 5 +- vllm/attention/backends/xformers.py | 5 +- vllm/attention/layer.py | 10 +- .../ops/blocksparse_attention/__init__.py | 0 .../blocksparse_attention_kernel.py | 423 +++++++++++++++++ .../ops/blocksparse_attention/interface.py | 238 ++++++++++ .../ops/blocksparse_attention/utils.py | 216 +++++++++ vllm/attention/ops/paged_attn.py | 25 +- vllm/attention/selector.py | 7 + vllm/entrypoints/openai/serving_engine.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/phi3_small.py | 447 ++++++++++++++++++ vllm/transformers_utils/config.py | 2 +- 23 files changed, 2446 insertions(+), 88 deletions(-) create mode 100644 tests/kernels/test_blocksparse_attention.py create mode 100644 vllm/attention/backends/blocksparse_attn.py create mode 100644 vllm/attention/ops/blocksparse_attention/__init__.py create mode 100644 vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py create mode 100644 vllm/attention/ops/blocksparse_attention/interface.py create mode 100644 vllm/attention/ops/blocksparse_attention/utils.py create mode 100644 vllm/model_executor/models/phi3_small.py diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index d6203174e7275..45edc3252380c 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -85,6 +85,7 @@ inline __device__ float block_sum(float* red_smem, float sum) { // Grid: (num_heads, num_seqs, max_num_partitions). template // Zero means no partitioning. 
__device__ void paged_attention_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -104,7 +105,9 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale) { + const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -202,11 +205,55 @@ __device__ void paged_attention_kernel( // Each thread group in a warp fetches a key from the block, and computes // dot product with the query. const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; + + // blocksparse specific vars + int bs_block_offset; + int q_bs_block_id; + if constexpr (IS_BLOCK_SPARSE) { + // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len, + // blocksparse_block_size); + q_bs_block_id = (seq_len - 1) / blocksparse_block_size; + if (blocksparse_head_sliding_step >= 0) + // sliding on q heads + bs_block_offset = + (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1; + else + // sliding on kv heads + bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) * + (-blocksparse_head_sliding_step) + + 1; + } + for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) { // NOTE(woosuk): The block number is stored in int32. However, we cast it to // int64 because int32 can lead to overflow when this variable is multiplied // by large numbers (e.g., kv_block_stride). + // For blocksparse attention: skip computation on blocks that are not + // attended + if constexpr (IS_BLOCK_SPARSE) { + const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; + const bool is_remote = + ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0); + const bool is_local = + (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks); + if (!is_remote && !is_local) { + for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { + const int physical_block_offset = + (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; + const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; + + if (thread_group_offset == 0) { + // NOTE(linxihui): assign very large number to skipped tokens to + // avoid contribution to the sumexp softmax normalizer. This will + // not be used at computing sum(softmax*v) as the blocks will be + // skipped. + logits[token_idx - start_token_idx] = -FLT_MAX; + } + } + continue; + } + } const int64_t physical_block_number = static_cast(block_table[block_idx]); @@ -335,6 +382,15 @@ __device__ void paged_attention_kernel( // NOTE(woosuk): The block number is stored in int32. However, we cast it to // int64 because int32 can lead to overflow when this variable is multiplied // by large numbers (e.g., kv_block_stride). 
+ // For blocksparse attention: skip computation on blocks that are not + // attended + if constexpr (IS_BLOCK_SPARSE) { + int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; + if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) && + !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) { + continue; + } + } const int64_t physical_block_number = static_cast(block_table[block_idx]); const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; @@ -441,8 +497,8 @@ __device__ void paged_attention_kernel( // Grid: (num_heads, num_seqs, 1). template + int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE, + bool IS_BLOCK_SPARSE> __global__ void paged_attention_v1_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -457,18 +513,23 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale) { + const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { paged_attention_kernel( + KV_DTYPE, IS_BLOCK_SPARSE>( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, - kv_head_stride, kv_scale); + kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks, + blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step); } // Grid: (num_heads, num_seqs, max_num_partitions). template __global__ void paged_attention_v2_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -488,12 +549,16 @@ __global__ void paged_attention_v2_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale) { + const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { paged_attention_kernel( + KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, - kv_block_stride, kv_head_stride, kv_scale); + kv_block_stride, kv_head_stride, kv_scale, tp_rank, + blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step); } // Grid: (num_heads, num_seqs). 
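To make the is_remote / is_local test above easier to picture, here is a host-side Python sketch that reproduces the same per-block arithmetic for one query position. It works directly in blocksparse-block units (the kernel first converts paged KV block indices), and all parameter values below are made up rather than taken from a real sparse-attention config:

def attended_blocks(seq_len, head_idx, num_heads, tp_rank,
                    local_blocks, vert_stride, block_size, head_sliding_step):
    # Same arithmetic as the kernel, evaluated per blocksparse block,
    # assuming head_sliding_step >= 0 (sliding on query heads).
    q_block = (seq_len - 1) // block_size
    offset = (tp_rank * num_heads + head_idx) * head_sliding_step + 1
    kept = []
    for k_block in range(q_block + 1):
        is_remote = (k_block + offset) % vert_stride == 0
        is_local = k_block > q_block - local_blocks
        if is_remote or is_local:
            kept.append(k_block)
    return kept

# Hypothetical settings: 2048 tokens, 64-token blocks, 16 local blocks,
# vertical stride 8, per-head sliding of the vertical pattern.
for head in (0, 1):
    print(head, attended_blocks(2048, head, num_heads=32, tp_rank=0,
                                local_blocks=16, vert_stride=8,
                                block_size=64, head_sliding_step=1))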
@@ -607,25 +672,32 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel< \ - T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE>), \ + ((void*)vllm::paged_attention_v1_kernel), \ shared_mem_size); \ vllm::paged_attention_v1_kernel \ + NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \ <<>>( \ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - kv_scale); + kv_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); // TODO(woosuk): Tune NUM_THREADS. template + vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, + int NUM_THREADS = 128> void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float kv_scale) { + const c10::optional& alibi_slopes, float kv_scale, + const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -691,23 +763,36 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v1_launcher( \ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, kv_scale); + seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, KV_DTYPE); \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, KV_DTYPE); \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, KV_DTYPE); \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -727,18 +812,26 @@ void paged_attention_v1( torch::Tensor& seq_lens, // [num_seqs] int block_size, int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale){ + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE)} #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, \ + PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, kv_scale); \ + kv_block_stride, kv_head_stride, kv_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel \ <<>>( \ @@ -746,14 +839,17 @@ void paged_attention_v1( max_num_partitions); template + vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, + int NUM_THREADS = 128, int PARTITION_SIZE = 512> void paged_attention_v2_launcher( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float kv_scale) { + const c10::optional& alibi_slopes, float kv_scale, + const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -824,24 +920,36 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - kv_scale); + kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ 
+ } // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_DTYPE); \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_DTYPE); \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_DTYPE); \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -865,7 +973,10 @@ void paged_attention_v2( torch::Tensor& seq_lens, // [num_seqs] int block_size, int max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale) { + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, CALL_V2_LAUNCHER_BLOCK_SIZE) } @@ -873,4 +984,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 54df69b7379d6..438e9bdb19f50 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -415,14 +415,17 @@ void paged_attention_v1_impl_launcher( } } // namespace -void paged_attention_v1(torch::Tensor& out, torch::Tensor& query, - torch::Tensor& key_cache, torch::Tensor& value_cache, - int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, - int block_size, int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale) { +void paged_attention_v1( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(blocksparse_vert_stride <= 1, + "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", [&] { CPU_KERNEL_GUARD_IN(paged_attention_v1_impl) @@ -726,16 +729,18 @@ void paged_attention_v2_impl_launcher( } } // namespace -void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums, - torch::Tensor& max_logits, torch::Tensor& tmp_out, - torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, - float scale, torch::Tensor& block_tables, - torch::Tensor& seq_lens, int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale) { +void paged_attention_v2( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const 
c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(blocksparse_vert_stride <= 1, + "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", [&] { CPU_KERNEL_GUARD_IN(paged_attention_v2_impl) diff --git a/csrc/ops.h b/csrc/ops.h index b839eaf0d26c8..567d9fae4bd2a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -2,23 +2,24 @@ #include -void paged_attention_v1(torch::Tensor& out, torch::Tensor& query, - torch::Tensor& key_cache, torch::Tensor& value_cache, - int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, - int block_size, int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale); - -void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums, - torch::Tensor& max_logits, torch::Tensor& tmp_out, - torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, - float scale, torch::Tensor& block_tables, - torch::Tensor& seq_lens, int block_size, - int max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, float kv_scale); +void paged_attention_v1( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step); + +void paged_attention_v2( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, + int max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, float kv_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step); void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, float epsilon); diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 31d4b53bd4409..e4bae80343a2c 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -123,6 +123,10 @@ Alongside each architecture, we include some popular models that use it. - Phi-3 - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc. - + * - :code:`Phi3SmallForCausalLM` + - Phi-3-Small + - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 
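[Editor's note] The file added below, tests/kernels/test_blocksparse_attention.py, exercises the new local + vertically-strided blocksparse attention path. As a reading aid, here is a minimal standalone sketch of the block-level mask rule that both the test's reference implementation and get_sparse_attn_mask in vllm/attention/ops/blocksparse_attention/utils.py follow: a query block attends, causally, to its `local_blocks` most recent key blocks plus every `vert_stride`-th key block. The sketch assumes the homogeneous-head case (homo_head=True, so no per-head or tensor-parallel offset); the helper name toy_blocksparse_block_mask is illustrative and not part of this patch.

import torch

def toy_blocksparse_block_mask(num_blocks: int,
                               local_blocks: int,
                               vert_stride: int) -> torch.Tensor:
    # Boolean (num_blocks, num_blocks) mask over *blocks* of tokens:
    # causal (q_block >= k_block), keeping the `local_blocks` most recent
    # key blocks and every `vert_stride`-th key block (1-indexed columns).
    q_pos = torch.arange(num_blocks)[:, None]
    k_pos = torch.arange(num_blocks)[None, :]
    vert = (torch.arange(num_blocks) + 1) % vert_stride == 0
    return (q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | vert)

# Example: with 8 blocks, local_blocks=2 and vert_stride=4, each row keeps its
# two most recent key blocks plus key blocks 4 and 8 (the vertical stripes).
print(toy_blocksparse_block_mask(8, local_blocks=2, vert_stride=4).int())

In the non-homogeneous case handled by the patch, the set of strided columns is additionally shifted per head by head_sliding_step (negative values slide with kv heads instead of q heads), which is what the blocksparse_head_sliding_step parameter threaded through the kernels below controls.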
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py new file mode 100644 index 0000000000000..9da13ca6e2310 --- /dev/null +++ b/tests/kernels/test_blocksparse_attention.py @@ -0,0 +1,442 @@ +import random +from typing import List, Optional, Tuple + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.attention.ops.blocksparse_attention.interface import ( + LocalStridedBlockSparseAttn) +from vllm.utils import get_max_shared_memory_bytes, is_hip + +from .allclose_default import get_default_atol, get_default_rtol + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 +# This will change depending on the compute capability. +# - 512 as a buffer +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 +# MAX_SEQ_LEN = 2771 + +# There may not be enough gpu memory due to large NUM_BLOCKS. +# Reduce NUM_BLOCKS when it happens. +NUM_BLOCKS = 4321 # Arbitrary values for testing +PARTITION_SIZE = 512 +DTYPES = [torch.half, torch.bfloat16] +NUM_GEN_SEQS = [3] # Arbitrary values for testing +NUM_PREFILL_SEQS = [3] # Arbitrary values for testing +NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing + +HEAD_SIZES = [64, 112] +BLOCK_SIZES = [16, 32] +USE_ALIBI = [False, True] +KV_CACHE_DTYPE = ["auto", "fp8"] +SEEDS = [0] +CUDA_DEVICES = ['cuda:0'] +BLOCKSPARSE_LOCAL_BLOCKS = [16] +BLOCKSPARSE_VERT_STRIDES = [8] + +BLOCKSPARSE_BLOCK_SIZES = [64] +BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HOMO_HEADS = [True, False] + + +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + if attn_mask is not None: + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out + + +def ref_single_query_cached_kv_attention( + output: torch.Tensor, + query: torch.Tensor, + num_queries_per_kv: int, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + scale: float, + alibi_slopes: Optional[torch.Tensor], + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 1, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +) -> None: + num_query_heads = query.shape[1] + num_kv_heads = value_cache.shape[1] + head_size = value_cache.shape[2] + block_size = value_cache.shape[3] + num_seqs = query.shape[0] + + block_tables = block_tables.cpu().tolist() + seq_lens = seq_lens.cpu().tolist() + for i in range(num_seqs): + q = query[i].unsqueeze(0) + block_table = block_tables[i] + seq_len = int(seq_lens[i]) + + keys = [] + values = [] + for j in range(seq_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, :, :, block_offset, :] + k = k.reshape(num_kv_heads, head_size) + keys.append(k) + + v = value_cache[block_number, :, :, block_offset] + values.append(v) + keys = torch.stack(keys, dim=0) + values = torch.stack(values, dim=0) + if num_queries_per_kv > 1: + # Handle MQA and GQA + keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) + values = torch.repeat_interleave(values, num_queries_per_kv, dim=1) + + alibi_bias = None + if alibi_slopes is not None: + # Create the ALiBi bias used in the paged attention kernel. 
+ position_ids = torch.arange(seq_len).int() + alibi_bias = (position_ids - seq_len + 1).float() + alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( + 1, 1, -1) + + if blocksparse_vert_stride >= 1: + bsize = blocksparse_block_size + hsliding = blocksparse_head_sliding_step + vert = blocksparse_vert_stride + locals = blocksparse_local_blocks + qb = (seq_len - 1) // bsize + attn_mask = q.new_zeros( + (num_query_heads, 1, seq_len)).float() - torch.inf + for h in range(num_query_heads): + if hsliding >= 0: # slide with q heads + bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1 + else: # slide with kv heads + bs_offset = (tp_rank * num_kv_heads + + h // num_queries_per_kv) * (-hsliding) + 1 + for kb in range(qb + 1): + kj = kb * bsize + if (qb - kb) < locals or \ + (kb + bs_offset) % vert == 0: + attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0 + if alibi_bias is not None: + attn_mask += alibi_bias + else: + attn_mask = alibi_bias + + out = ref_masked_attention(q, keys, values, scale, attn_mask=attn_mask) + out = out.view(num_query_heads, head_size) + output[i].copy_(out, non_blocking=True) + + +@pytest.mark.parametrize("version", ["v1", "v2"]) +@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("use_alibi", USE_ALIBI) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS) +@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES) +@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES) +@pytest.mark.parametrize("blocksparse_head_sliding_step", + BLOCKSPARSE_HEADS_SLIDINGS) +def test_paged_attention( + kv_cache_factory, + version: str, + num_seqs: int, + num_heads: Tuple[int, int], + head_size: int, + use_alibi: bool, + block_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + seed: int, + device: str, + blocksparse_local_blocks: int, + blocksparse_vert_stride: int, + blocksparse_block_size: int, + blocksparse_head_sliding_step: int, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + scale = float(1.0 / (head_size**0.5)) + num_query_heads, num_kv_heads = num_heads + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) + query.uniform_(-scale, scale) + + assert num_query_heads % num_kv_heads == 0 + num_queries_per_kv = num_query_heads // num_kv_heads + alibi_slopes = None + if use_alibi: + alibi_slopes = torch.rand(num_query_heads, dtype=torch.float) + + seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] + seq_lens[-1] = MAX_SEQ_LEN + max_seq_len = max(seq_lens) + seq_lens = torch.tensor(seq_lens, dtype=torch.int) + + # Create the block tables. + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = [] + for _ in range(num_seqs): + block_table = [ + random.randint(0, NUM_BLOCKS - 1) + for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + block_tables = torch.tensor(block_tables, dtype=torch.int) + + # Create the KV caches. 
+ key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, + num_kv_heads, head_size, + kv_cache_dtype, dtype, seed, + device) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Using default kv_scale + kv_scale = 1.0 + tp_rank = 0 + + # Call the paged attention kernel. + output = torch.empty_like(query) + if version == "v1": + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + kv_scale, + tp_rank=tp_rank, + blocksparse_local_blocks=blocksparse_local_blocks, + blocksparse_vert_stride=blocksparse_vert_stride, + blocksparse_block_size=blocksparse_block_size, + blocksparse_head_sliding_step=blocksparse_head_sliding_step, + ) + elif version == "v2": + num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) + assert PARTITION_SIZE % block_size == 0 + num_seqs, num_heads, head_size = output.shape + tmp_output = torch.empty( + size=(num_seqs, num_heads, num_partitions, head_size), + dtype=output.dtype, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, num_partitions), + dtype=torch.float32, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + kv_scale, + tp_rank=tp_rank, + blocksparse_local_blocks=blocksparse_local_blocks, + blocksparse_vert_stride=blocksparse_vert_stride, + blocksparse_block_size=blocksparse_block_size, + blocksparse_head_sliding_step=blocksparse_head_sliding_step, + ) + else: + raise AssertionError(f"Unknown version: {version}") + + # Run the reference implementation. + if kv_cache_dtype == "fp8": + # Convert cache data back to dtype. + x = 16 // torch.tensor([], dtype=dtype).element_size() + key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, + block_size, x) + dequantized_key_cache = torch.empty(size=key_cache_shape, + dtype=dtype, + device=device) + ops.convert_fp8(dequantized_key_cache, key_cache) + key_cache = dequantized_key_cache + + value_cache_shape = value_cache.shape + dequantized_value_cache = torch.empty(size=value_cache_shape, + dtype=dtype, + device=device) + ops.convert_fp8(dequantized_value_cache, value_cache) + value_cache = dequantized_value_cache + + ref_output = torch.empty_like(query) + ref_single_query_cached_kv_attention( + ref_output, + query, + num_queries_per_kv, + key_cache, + value_cache, + block_tables, + seq_lens, + scale, + alibi_slopes, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + + # NOTE(woosuk): Due to the kernel-level differences in the two + # implementations, there is a small numerical difference in the two + # outputs. Thus, we use a relaxed tolerance for the test. + atol = get_default_atol(output) if is_hip() else 1e-3 + rtol = get_default_rtol(output) if is_hip() else 1e-5 + + # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, + # so we use a relaxed tolerance for the test. 
+ atol, rtol = 1e-3, 1e-5 + if kv_cache_dtype == "fp8": + atol, rtol = 1e-2, 1e-5 + assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) + + +def ref_multi_query_kv_attention( + cu_seq_lens: List[int], + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + dtype: torch.dtype, +) -> torch.Tensor: + num_seqs = len(cu_seq_lens) - 1 + ref_outputs = [] + for i in range(num_seqs): + start_idx = cu_seq_lens[i] + end_idx = cu_seq_lens[i + 1] + seq_len = end_idx - start_idx + + # Create attention mask. + attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), + diagonal=1) + attn_mask = attn_mask * torch.finfo(dtype).min + attn_mask = attn_mask.to(dtype=dtype) + + ref_output = ref_masked_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + scale, + attn_mask=attn_mask, + ) + ref_outputs.append(ref_output) + ref_output = torch.cat(ref_outputs, dim=0) + return ref_output + + +@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS) +@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES) +@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES) +@pytest.mark.parametrize("blocksparse_homo_heads", BLOCKSPARSE_HOMO_HEADS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_varlen_blocksparse_attention_prefill( + num_seqs: int, + num_heads: Tuple[int, int], + head_size: int, + blocksparse_local_blocks: int, + blocksparse_vert_stride: int, + blocksparse_block_size: int, + blocksparse_homo_heads: bool, + dtype: torch.dtype, + seed: int, + device: str, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. + # As the xformers library is already tested with its own tests, we can use + # a smaller MAX_SEQ_LEN here. 
+ max_len = min(MAX_SEQ_LEN, 4096) + seq_lens = random.sample(range(1, max_len), num_seqs) + cu_seq_lens = torch.cumsum(torch.tensor([0] + seq_lens), dim=0) + num_tokens = sum(seq_lens) + + scale = float(1.0 / (head_size**0.5)) + num_query_heads, num_kv_heads = num_heads + assert num_query_heads % num_kv_heads == 0 + num_queries_per_kv = num_query_heads // num_kv_heads + + qkv = torch.empty(num_tokens, + num_query_heads + 2 * num_kv_heads, + head_size, + dtype=dtype) + qkv.uniform_(-scale, scale) + query, key, value = qkv.split( + [num_query_heads, num_kv_heads, num_kv_heads], dim=1) + + bs_attn_op = LocalStridedBlockSparseAttn( + num_query_heads, + max_len, + local_blocks=blocksparse_local_blocks, + vert_stride=blocksparse_vert_stride, + block_size=blocksparse_block_size, + device=device, + dtype=dtype, + homo_head=blocksparse_homo_heads) + + output = bs_attn_op(query, + key, + value, + cu_seq_lens.to(device), + sm_scale=scale) + + if num_queries_per_kv > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) + value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) + + ref_output = ref_multi_query_kv_attention( + cu_seq_lens, + query, + key, + value, + scale, + dtype, + ) + assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index f0fab4d8aa26d..22cf5a44e341f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -45,11 +45,17 @@ def paged_attention_v1( alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, ) -> None: - vllm_ops.paged_attention_v1(out, query, key_cache, value_cache, - num_kv_heads, scale, block_tables, seq_lens, - block_size, max_seq_len, alibi_slopes, - kv_cache_dtype, kv_scale) + vllm_ops.paged_attention_v1( + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, + seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, + kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, + blocksparse_block_size, blocksparse_head_sliding_step) def paged_attention_v2( @@ -69,12 +75,18 @@ def paged_attention_v2( alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, ) -> None: - vllm_ops.paged_attention_v2(out, exp_sum, max_logits, tmp_out, query, - key_cache, value_cache, num_kv_heads, scale, - block_tables, seq_lens, block_size, - max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale) + vllm_ops.paged_attention_v2( + out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, + num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, + alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, + blocksparse_local_blocks, blocksparse_vert_stride, + blocksparse_block_size, blocksparse_head_sliding_step) # pos encoding ops diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 94ab64de30a94..6396103bf5efa 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -111,6 +111,7 @@ def __init__( alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: 
raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py new file mode 100644 index 0000000000000..dce2b83615b7a --- /dev/null +++ b/vllm/attention/backends/blocksparse_attn.py @@ -0,0 +1,410 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.blocksparse_attention.interface import ( + LocalStridedBlockSparseAttn, get_head_sliding_step) +from vllm.attention.ops.paged_attn import PagedAttention +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) + + +@dataclass +class BlocksparseParams: + max_seqlen: int + + # Num q heads per tensor-parallel rank/partition + num_heads: int # per TP partition + # Num kv heads per tensor-parallel rank/partition + num_kv_heads: int + + # block size used for blocksparse attention. + # This is the block_size used in `local_blocks`, `vert_stride`. + block_size: int + + # Number of blocks for local attention, i.e., number of + # local attended tokens / `sparse_block_size` + local_blocks: int + + # Attend to one block per every `vert_stride` blocks. + # Controlling the sparsity + vert_stride: int + """ + If to use the same vertical stride offset for all heads, + i.e., attend to the same block of tokens on all heads. + By default, it is False, i.e., attention on the non-local + blocks depends on the `head_idx`, that is on + blocks satisfying + `(block_idx + head_idx * head_sliding_step + 1) % vert_stride == 0` + where `head_sliding_step=max(1, int(vert_stride / num_total_heads))`, + `block_idx = position_id // sparse_block_size`. + See `..ops.blocksparse_attention.utils:get_sparse_attn_mask` + for more detail. + """ + homo_head: bool = False + + # If within a group, the kv offsets that each q attends is the same or no. 
+ homo_head_group: bool = False + + # Decided by homo_head and homo_head group + head_sliding_step: int = field(init=False) + + # range of q heads to for a TP rank + active_head_range: Tuple = field(init=False) + + def __post_init__(self): + assert self.block_size > 0 + assert self.local_blocks >= 0 + assert self.vert_stride >= 1 + assert self.num_heads % self.num_kv_heads == 0 + + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + total_heads = tp_size * self.num_heads + total_kv_heads = tp_size * self.num_kv_heads + + if self.homo_head: + self.head_sliding_step = 0 + elif self.homo_head_group: + head_sliding_step = get_head_sliding_step(total_kv_heads, + self.vert_stride) + # negative indicates sliding along kv heads, i.e., homo q group + self.head_sliding_step = -head_sliding_step + else: + self.head_sliding_step = get_head_sliding_step( + total_heads, self.vert_stride) + + self.active_head_range = ( + tp_rank * self.num_heads, + (tp_rank + 1) * self.num_heads, + ) + + +class BlocksparseFlashAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: + return BlocksparseFlashAttentionImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "BlocksparseFlashAttentionMetadata": + return BlocksparseFlashAttentionMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + PagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class BlocksparseFlashAttentionMetadata(AttentionMetadata): + """A copy of Metadata for FlashAttentionBackend, + to avoid having to install flash_attn. + + NOTE: Any python object stored here is not updated when it is + cuda-graph replayed. If you have values that need to be changed + dynamically, it should be stored in tensor. The tensor has to be + updated from `CUDAGraphRunner.forward` API. + """ + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ----------------------| + # |-- query_len ---| + + # Maximum query length in the batch. None for decoding. + max_query_len: Optional[int] + # Maximum sequence length among prefill batch. 0 if there are decoding + # requests only. + max_prefill_seq_len: int + # Maximum sequence length among decode batch. 0 if there are prefill + # requests only. + max_decode_seq_len: int + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. 
+ query_start_loc: Optional[torch.Tensor] + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] + + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + _cached_prefill_metadata: Optional[ + "BlocksparseFlashAttentionMetadata"] = None + _cached_decode_metadata: Optional[ + "BlocksparseFlashAttentionMetadata"] = None + + @property + def prefill_metadata( + self) -> Optional["BlocksparseFlashAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + return self._cached_prefill_metadata + + assert self.seq_lens is not None + assert self.seq_lens_tensor is not None + assert self.query_start_loc is not None + assert self.context_lens_tensor is not None + assert self.block_tables is not None + assert self.seq_start_loc is not None + + self._cached_prefill_metadata = BlocksparseFlashAttentionMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + seq_lens=self.seq_lens[:self.num_prefills], + seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_seq_len=0, + query_start_loc=self.query_start_loc[:self.num_prefills + 1], + seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], + context_lens_tensor=self.context_lens_tensor[:self.num_prefills], + block_tables=self.block_tables[:self.num_prefills], + use_cuda_graph=False, + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + return self._cached_decode_metadata + assert self.block_tables is not None + assert self.seq_lens_tensor is not None + + self._cached_decode_metadata = BlocksparseFlashAttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=self.num_decode_tokens, + slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + seq_lens=None, + seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + max_query_len=None, + max_prefill_seq_len=0, + max_decode_seq_len=self.max_decode_seq_len, + query_start_loc=None, + seq_start_loc=None, + context_lens_tensor=None, + block_tables=self.block_tables[self.num_prefills:], + use_cuda_graph=self.use_cuda_graph, + ) + return self._cached_decode_metadata + + +class BlocksparseFlashAttentionImpl(AttentionImpl): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prompt_tokens -------------->| + |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->| + + Otherwise, the layout 
is as follows: + |<------------------ num_generation_tokens (M) ----------------->| + |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + ) -> None: + assert blocksparse_params is not None + assert alibi_slopes is None, ValueError( + "Alibi not support for blocksparse flash attention.") + assert sliding_window is None, ValueError( + "sliding_window is invalid for blocksparse attention.") + + if "num_heads" not in blocksparse_params: + blocksparse_params["num_heads"] = num_heads + if "num_kv_heads" not in blocksparse_params: + blocksparse_params["num_kv_heads"] = num_kv_heads or num_heads + self.blocksparse_params = BlocksparseParams(**blocksparse_params) + self.kv_cache_dtype = kv_cache_dtype + + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.alibi_slopes = alibi_slopes + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + self.local_blocks = self.blocksparse_params.local_blocks + self.vert_stride = self.blocksparse_params.vert_stride + self.sparse_block_size = self.blocksparse_params.block_size + self.head_sliding_step = self.blocksparse_params.head_sliding_step + + suppored_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + total_num_heads = num_heads * self.tp_size + self.bs_attn = LocalStridedBlockSparseAttn( + total_num_heads, + self.blocksparse_params.max_seqlen, + self.blocksparse_params.local_blocks, + self.blocksparse_params.vert_stride, + self.blocksparse_params.block_size, + homo_head=self.blocksparse_params.homo_head, + active_head_range=self.blocksparse_params.active_head_range, + ) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: BlocksparseFlashAttentionMetadata, + kv_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with FlashAttention and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if kv_cache is not None: + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + kv_scale, + ) + + if prefill_meta := attn_metadata.prefill_metadata: + + # Prompt run. + # normal attention + # When block_tables are not filled, it means q and k are the + # prompt, and they have the same length. + + assert kv_cache is None \ + or prefill_meta.block_tables is None \ + or prefill_meta.block_tables.numel() == 0, \ + "Does not support prefix-enabled attention." + + output = self.bs_attn( + q=query, + k=key, + v=value, + cu_seqlens_q=prefill_meta.seq_start_loc, + cu_seqlens_k=prefill_meta.seq_start_loc, + sm_scale=self.scale, + ) + + if decode_meta := attn_metadata.decode_metadata: + # Decoding run. + output = PagedAttention.forward_decode( + query, + key_cache, + value_cache, + decode_meta.block_tables, + decode_meta.seq_lens_tensor, + self.blocksparse_params.max_seqlen, + self.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + kv_scale, + tp_rank=self.tp_rank, + blocksparse_local_blocks=self.local_blocks, + blocksparse_vert_stride=self.vert_stride, + blocksparse_block_size=self.sparse_block_size, + blocksparse_head_sliding_step=self.head_sliding_step, + ) + + # Reshape the output tensor. 
+ return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 0f4568070cfc4..0b9d6283493f2 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,6 +1,6 @@ """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache @@ -219,7 +219,10 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + assert blocksparse_params is None, ValueError( + "FlashAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 94f3f55636ed6..e92e6c5e2dc8d 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,6 +1,6 @@ """Attention layer ROCm GPUs.""" from dataclasses import dataclass -from typing import List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch @@ -201,7 +201,10 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + assert blocksparse_params is None, ValueError( + "ROCFlashAttention does not support blocksparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index a19c97e1e0e35..9b50adec5244d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -1,7 +1,7 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch from torch.nn.functional import scaled_dot_product_attention @@ -100,7 +100,10 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + assert blocksparse_params is None, ValueError( + "Torch SPDA does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 96169da6cf92c..99a3e88bc07b6 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,6 +1,6 @@ """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch from xformers import ops as xops @@ -212,7 +212,10 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + assert blocksparse_params is None, ValueError( + "XFormer does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git 
a/vllm/attention/layer.py b/vllm/attention/layer.py index dc7b3940bc9b7..b67f04c51d493 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,5 +1,5 @@ """Attention layer.""" -from typing import List, Optional +from typing import Any, Dict, List, Optional import torch import torch.nn as nn @@ -33,6 +33,7 @@ def __init__( sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() if cache_config is not None: @@ -69,10 +70,12 @@ def __init__( dtype = torch.get_default_dtype() attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads, sliding_window, dtype, kv_cache_dtype, - block_size) + block_size, blocksparse_params + is not None) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype) + alibi_slopes, sliding_window, kv_cache_dtype, + blocksparse_params) def forward( self, @@ -90,4 +93,5 @@ def extra_repr(self) -> str: s += f", num_heads={self.impl.num_heads}" # type: ignore s += f", num_kv_heads={self.impl.num_kv_heads}" # type: ignore s += f", scale={self.impl.scale}" # type: ignore + s += f", backend={self.impl.__class__.__name__}" return s diff --git a/vllm/attention/ops/blocksparse_attention/__init__.py b/vllm/attention/ops/blocksparse_attention/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py new file mode 100644 index 0000000000000..ec1c37c5bcb0e --- /dev/null +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -0,0 +1,423 @@ +import torch +import triton +import triton.language as tl + + +def blocksparse_flash_attn_varlen_fwd( + q, + k, + v, # (#tokens, n_heads, head_size) + cu_seqlens_k, + cu_seqlens_q, + sm_scale, + sparse_layout, + *, + block_size=64, + q_block_size=None, + max_seqlen=None): + # split q to blocks + + assert isinstance(sparse_layout, (list, tuple)) + + _, n_heads, head_size = q.shape + batch_size = cu_seqlens_k.size(0) - 1 + q_block_size = q_block_size or block_size + + assert q.dim() == k.dim() == v.dim() == 3 + assert q.size(1) % k.size(1) == 0 + assert q.size(2) == k.size(2) + # TODO(linxihui): allow k, v to have different head_size + assert k.shape == v.shape + assert cu_seqlens_k.dim() == 1 + + q_k_ratio = q.size(1) // k.size(1) + + if cu_seqlens_q is None: + if q.size(0) == batch_size: # decoding only + cu_seqlens_q = torch.arange( + 0, + batch_size + 1, + dtype=cu_seqlens_k.dtype, + device=cu_seqlens_k.device, + ) + elif q.size(0) == k.size(0): + cu_seqlens_q = cu_seqlens_k + else: + raise ValueError("cu_seqlens_q must be specified\ + if it mix of prefilling and decoding.") + else: + assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0) + + # switch to use cpu to avoid too many kernel launches when iterated over + q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu() + k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu() + + assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), ( + "length of q should either be 1 (decoding) or same as k (prefilling).") + + if max_seqlen: + assert k_lens.max() <= max_seqlen + + n_blocks = (q_lens + q_block_size - 1) // q_block_size + + q_batch_ids = torch.tensor( + [i for i, n in enumerate(n_blocks) for _ in range(n)], + 
dtype=cu_seqlens_q.dtype, + device=cu_seqlens_q.device, + ) + q_start_sids = torch.tensor( + [i * q_block_size for n in n_blocks for i in range(n)], + dtype=cu_seqlens_q.dtype, + device=cu_seqlens_q.device, + ) + + out = q.new_empty(q.shape) + cu_seqlens_q = cu_seqlens_q.contiguous() + cu_seqlens_k = cu_seqlens_k.contiguous() + + layout_crow_indices, layout_col_indices = sparse_layout + block_d = triton.next_power_of_2(head_size) + + decoding_only = (q_lens == 1).all().item() + grid = (len(q_start_sids), n_heads, 1) + + _fwd_kernel_batch_inference[grid]( + q, + k, + v, + out, + sm_scale, + cu_seqlens_q[:-1], + cu_seqlens_q[1:], + cu_seqlens_k[:-1], + cu_seqlens_k[1:], + q_batch_ids, + q_start_sids, + 0, + *q.stride(), + 0, + *k.stride(), + 0, + *v.stride(), + 0, + *out.stride(), + layout_crow_indices, + layout_col_indices, + *layout_crow_indices.stride(), + *layout_col_indices.stride(), + q_k_ratio, + HAS_BATCH_DIM=False, + D_HEAD=head_size, + BLOCK_M=q_block_size, + BLOCK_N=block_size, + BLOCK_D=block_d, + BLOCK_M_LOADING=(16 if decoding_only else + q_block_size), # smaller for decoding + EVEN_D=block_d == head_size, + num_warps=1 if decoding_only else 4, + num_stages=3) + + return out + + +@triton.jit +def _fwd_kernel_inner( + acc, + l_i, + m_i, + q, + Q, + k_block_col_idx, + layout_col_ptr, + layout_col_stride_h, + layout_col_stride_m, + k_ptrs, + v_ptrs, + off_h, + offs_m, + offs_n, + offs_d, + stride_kt, + stride_vt, + sm_scale, + k_seqlen, + past_len, + LAST_K_BLOCK: tl.constexpr, + BLOCK_M_LOADING: tl.constexpr, + BLOCK_N: tl.constexpr, + D_HEAD: tl.constexpr, + EVEN_D: tl.constexpr, + M_LT_N: tl.constexpr, +): + k_block_id = tl.load(layout_col_ptr + off_h * layout_col_stride_h + + k_block_col_idx * layout_col_stride_m).to(tl.int32) + start_n = k_block_id * BLOCK_N + if LAST_K_BLOCK: + if EVEN_D: + k = tl.load( + k_ptrs + start_n * stride_kt, + mask=offs_n[None, :] + start_n < k_seqlen, + ) + else: + k = tl.load( + k_ptrs + start_n * stride_kt, + mask=(offs_n[None, :] + start_n < k_seqlen) & + (offs_d[:, None] < D_HEAD), + ) + else: + if EVEN_D: + k = tl.load(k_ptrs + start_n * stride_kt) + else: + k = tl.load(k_ptrs + start_n * stride_kt, + mask=offs_d[:, None] < D_HEAD) + + qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale + + # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N + if LAST_K_BLOCK | M_LT_N: + qk += tl.where( + offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), + 0, + float("-inf"), + ) + + # flash-attn2 + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + p = tl.math.exp2(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + # update m_i + m_i = m_ij + l_i = l_i * alpha + l_ij + + p = p.to(Q.dtype.element_ty) + # update acc + if LAST_K_BLOCK: + if EVEN_D: + v = tl.load( + v_ptrs + start_n * stride_vt, + mask=offs_n[:, None] + start_n < k_seqlen, + ) + else: + v = tl.load( + v_ptrs + start_n * stride_vt, + mask=(offs_n[:, None] + start_n < k_seqlen) & + (offs_d[None, :] < D_HEAD), + ) + else: + if EVEN_D: + v = tl.load(v_ptrs + start_n * stride_vt) + else: + v = tl.load(v_ptrs + start_n * stride_vt, + mask=offs_d[None, :] < D_HEAD) + + acc += tl.dot(p, v) + + return acc, l_i, m_i + + +@triton.heuristics({ + "M_LT_N": + lambda kwargs: kwargs["BLOCK_M"] < kwargs["BLOCK_N"], +}) +@triton.jit +def _fwd_kernel_batch_inference( + Q, + K, + V, + Out, + sm_scale, + q_batch_starts, + q_batch_ends, + k_batch_starts, + k_batch_ends, + q_batch_ids, + q_start_sids, 
+ stride_qb, + stride_qt, + stride_qh, + stride_qd, + stride_kb, + stride_kt, + stride_kh, + stride_kd, + stride_vb, + stride_vt, + stride_vh, + stride_vd, + stride_ob, + stride_ot, + stride_oh, + stride_od, + layout_crow_ptr, + layout_col_ptr, + layout_crow_stride_h, + layout_crow_stride_m, + layout_col_stride_h, + layout_col_stride_m, + q_k_ratio, + HAS_BATCH_DIM: tl.constexpr, + D_HEAD: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_M_LOADING: tl.constexpr, + EVEN_D: tl.constexpr, + M_LT_N: tl.constexpr, +): + """ + NOTATION: + pid: position id + sid: storage id + sbid: storage block id + pbid: position block id + offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col) + + TODO(linxihui): + Optimize grouped-attn + """ + off_zm = tl.program_id(0) + off_h = tl.program_id(1) + + off_h_for_kv = off_h // q_k_ratio + + if HAS_BATCH_DIM: + off_z = tl.program_id(2) + Q += off_z * stride_qb + K += off_z * stride_kb + V += off_z * stride_vb + Out += off_z * stride_ob + start_m = off_zm + q_start_sid = start_m * BLOCK_M # always 0 for decoding + else: + off_z = tl.load(q_batch_ids + off_zm).to(tl.int32) # [0, 0, 0, 1] + q_start_sid = tl.load(q_start_sids + off_zm) + start_m = q_start_sid // BLOCK_M # q_sbid + + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_D) + + q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32) + q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start + k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32) + k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start + past_len = k_seqlen - q_seqlen + + Q += q_cu_start * stride_qt + off_h * stride_qh + K += k_cu_start * stride_kt + off_h_for_kv * stride_kh + V += k_cu_start * stride_vt + off_h_for_kv * stride_vh + Out += q_cu_start * stride_ot + off_h * stride_oh + + q_pbid = (past_len + q_start_sid) // BLOCK_M + + if EVEN_D: + q = tl.load( + Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, + mask=offs_m[:, None] < q_seqlen, + ) + else: + q = tl.load( + Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, + mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD), + other=0, + ) + + sparse_crow_ptr = (layout_crow_ptr + off_h * layout_crow_stride_h + + q_pbid * layout_crow_stride_m) + + # TODO(linxihui): load at once, with any Triton version + # that supports `tl.split`, e.g., Triton 3.0 + k_block_start = tl.load(sparse_crow_ptr).to(tl.int32) + k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32) + + m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) + acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32) + + k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd + v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd + + sm_scale *= ( + 1.44269504 # 1/log2 as we use base2 for exponential and logarithm + ) + + for k_block_col_idx in range(k_block_start, k_block_end - 1): + acc, l_i, m_i = _fwd_kernel_inner( + acc, + l_i, + m_i, + q, + Q, + k_block_col_idx, + layout_col_ptr, + layout_col_stride_h, + layout_col_stride_m, + k_ptrs, + v_ptrs, + off_h, + offs_m, + offs_n, + offs_d, + stride_kt, + stride_vt, + sm_scale, + k_seqlen, + past_len, + False, + BLOCK_M_LOADING, + BLOCK_N, + D_HEAD, + EVEN_D, + M_LT_N, + ) + + acc, l_i, m_i = _fwd_kernel_inner( + acc, + l_i, + m_i, + q, + Q, + k_block_end - 1, + layout_col_ptr, + 
layout_col_stride_h, + layout_col_stride_m, + k_ptrs, + v_ptrs, + off_h, + offs_m, + offs_n, + offs_d, + stride_kt, + stride_vt, + sm_scale, + k_seqlen, + past_len, + True, + BLOCK_M_LOADING, + BLOCK_N, + D_HEAD, + EVEN_D, + M_LT_N, + ) + + # flash-attn 2 + m_i += tl.math.log2(l_i) + acc = acc / l_i[:, None] + + # write output + if EVEN_D: + tl.store( + Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, + acc, + mask=offs_m[:, None] < q_seqlen, + ) + else: + tl.store( + Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, + acc, + mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD), + ) diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py new file mode 100644 index 0000000000000..300211e70bb79 --- /dev/null +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -0,0 +1,238 @@ +import math + +import torch + +from vllm.utils import is_cpu, is_hip + +from .utils import (dense_to_crow_col, get_head_sliding_step, + get_sparse_attn_mask) + +IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available() + and torch.cuda.get_device_capability()[0] >= 8) + +if IS_COMPUTE_8_OR_ABOVE: + from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd + + +class LocalStridedBlockSparseAttn(torch.nn.Module): + + def __init__( + self, + n_heads, + max_seqlen, + local_blocks, + vert_stride, + block_size, + device=None, + dtype=None, + homo_head=False, + active_head_range=None, + q_block_size=None, + use_spda=None, + ): + super().__init__() + if use_spda is None: + use_spda = is_hip() or is_cpu() or not \ + IS_COMPUTE_8_OR_ABOVE + device = device or (torch.cuda.current_device() + if torch.cuda.is_available() else "cpu") + device = torch.device(device) + # NOTE: vllm CPU backend support BF16 instead of FP16. + dtype = dtype or (torch.bfloat16 if IS_COMPUTE_8_OR_ABOVE + or device.type == "cpu" else torch.half) + + self.n_heads = n_heads + self.max_seqlen = max_seqlen + self.local_blocks = local_blocks + self.vert_stride = vert_stride + self.use_spda = use_spda + self.dtype = dtype + self.device = device + self.block_size = block_size + self.q_block_size = q_block_size + self.homo_head = homo_head + self.active_head_range = active_head_range + self.head_sliding_step = get_head_sliding_step(n_heads, vert_stride, + homo_head) + + sparse_layout, sparse_pattern, self.dense_attn_mask = ( + self.get_attn_pattern(dtype, device)) + + if q_block_size is not None and q_block_size != block_size: + if q_block_size > block_size: + assert q_block_size % block_size == 0 + blocks_to_merge = q_block_size // block_size + shape = sparse_pattern.shape + sparse_pattern = sparse_pattern.view(shape[0], -1, + blocks_to_merge, + shape[-1]) + sparse_pattern = sparse_pattern.sum(2) + sparse_layout = dense_to_crow_col(sparse_pattern) + else: + raise ValueError( + "Does not support smaller q_block_size. It will be slower." 
+ ) + + self.sparse_layout = sparse_layout + + def get_attn_pattern(self, dtype, device): + sparse_layout, sparse_pattern, dense_attn_mask = get_sparse_attn_mask( + self.n_heads, + self.max_seqlen, + self.max_seqlen, + dtype, + device, + block_size=self.block_size, + local_blocks=self.local_blocks, + vert_stride=self.vert_stride, + homo_head=self.homo_head, + return_dense=self.use_spda, + dense_mask_type="bias", + ) + if (not self.homo_head) and (self.active_head_range is not None): + assert isinstance(self.active_head_range, tuple) + assert (len(self.active_head_range) == 2) + h_start, h_end = self.active_head_range + sparse_layout = tuple(x[h_start:h_end] for x in sparse_layout) + if self.use_spda: + dense_attn_mask = dense_attn_mask[h_start:h_end] + return sparse_layout, sparse_pattern, dense_attn_mask + + def varlen_attn(self, + q, + k, + v, + cu_seqlens_k, + cu_seqlens_q=None, + sm_scale=None): + """ + q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). + Support grouped attention, with `q[:, i*r:(i*r + r)]` + is correspondent to `k[:, i]`, where `r` is the q/k ratio. + cu_seqlens_k: shape=(batch_size + 1,), + indicating segment of samples, + e.g., `k[cu_seqlen[i]:cu_seqlne[i+1]]` is q of sample i + cu_seqlens_q: shape=(batch_size + 1, ). + Default None: same as cu_seqlens_k for prefilling or + [0, 1, .., batch_size] for decoding. + The only case you need to specify is when q is a mix of + prefilling and decoding. + sm_scale: softmax scale, default to 1/sqrt(head_size). + + return: tensor of shape as q. + """ + assert ( + IS_COMPUTE_8_OR_ABOVE + ), "Requires compute capability of 8 or above (Ampere or newer) to use \ + Triton kernel." + + sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1)) + + return blocksparse_flash_attn_varlen_fwd( + q, + k, + v, + cu_seqlens_k, + cu_seqlens_q, + sm_scale, + self.sparse_layout, + block_size=self.block_size, + q_block_size=self.q_block_size, + max_seqlen=self.max_seqlen, + ) + + @staticmethod + def transpose_and_pad(x, cu_seqlens, maxlen, head_repeats=1): + """ + :param x: (total_tokens, n_heads, head_size) + :return: (batch, n_heads, length, head_size) + """ + x_padded = x.new_empty( + len(cu_seqlens) - 1, x.size(1), head_repeats, maxlen, x.size(2)) + cu_seqlens = cu_seqlens.cpu() + for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])): + x_padded[i, :, :, :e - s].copy_(x[s:e].transpose(0, + 1).unsqueeze(1)) + return x_padded.flatten(1, 2) + + @staticmethod + def transpose_and_unpad(x_padded, cu_seqlens): + """ + :param x_padded: (batch, n_heads, length, head_size) + :return: (total_tokens, n_heads, head_size) + """ + cu_seqlens = cu_seqlens.cpu() + total_n_tokens = cu_seqlens[-1] + x = x_padded.new_empty(total_n_tokens, x_padded.size(1), + x_padded.size(3)) + for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])): + x[s:e].copy_(x_padded[i, :, :e - s].transpose(0, 1)) + return x + + def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): + """For CPU, V100 or other older GPUs. + NOTE: torch SPDA supports nested tensor, + but seems extremely slow. Choose to pad instead. + """ + assert (cu_seqlens_q is None or + (cu_seqlens_q + == cu_seqlens_k).all()), "Can only handle prompt with SPDA." + assert q.size(0) == k.size(0), "can only handle prompt with SPDA." 
+ + assert q.size(1) % k.size(1) == 0 + q_k_ratio = q.size(1) // k.size(1) + sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1)) + cu_seqlens = cu_seqlens_k.cpu() + maxlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + + if (self.dense_attn_mask.dtype != q.dtype + or self.dense_attn_mask.device != q.device): + _, _, self.dense_attn_mask = self.get_attn_pattern( + q.dtype, q.device) + attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] + + q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) + k2, v2 = [ + self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) + for x in [k, v] + ] + spda_output = torch.nn.functional.scaled_dot_product_attention( + q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) + return self.transpose_and_unpad(spda_output, cu_seqlens) + + def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): + """Dispatch to `varlen_attn` (Ampere or newer) or + `self.spda`(cpu, Volta, Turing or older)based on + the type of device used and cuda compute capability. + + q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). + Support grouped attention, with `q[:, i*r:(i*r + r)]` + is correspondent to `k[:, i]`, where `r` is the q/k ratio. + cu_seqlens_k: shape=(batch_size + 1,), indicating segment of samples, + e.g., `k[cu_seqlen[i]:cu_seqlne[i+1]]` is q of sample i + cu_seqlens_q: shape=(batch_size + 1, ). + Default None: same as cu_seqlens_k for prefilling or + [0, 1, .., batch_size] for decoding. + The only case you need to specify + is when q is a mix of prefilling + and decoding. + sm_scale: softmax scale, default to 1/sqrt(head_size). + + return: tensor of shape as q. + """ + assert k.dim() == 3 + if self.use_spda: + return self.spda( + q, + k, + v, + cu_seqlens_k, + cu_seqlens_q=cu_seqlens_q, + sm_scale=sm_scale, + ) + return self.varlen_attn(q, + k, + v, + cu_seqlens_k, + cu_seqlens_q=cu_seqlens_q, + sm_scale=sm_scale) \ No newline at end of file diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py new file mode 100644 index 0000000000000..0d90dd971e156 --- /dev/null +++ b/vllm/attention/ops/blocksparse_attention/utils.py @@ -0,0 +1,216 @@ +# Helper functions for 3D sparse pattern +# These function are not optimized and very inefficient. +# Avoid calling them too frequent or use a cache mechanism. + +from functools import lru_cache + +import torch +import triton +from scipy import sparse + + +def dense_to_crow_col(x: torch.Tensor): + """Turning a 2D/3D torch tensor (x) to CSR rows/cols indexing. 
+ NOTE: col_indices padded -1 + """ + device = x.device + pad = -1 + dim = x.dim() + assert x.dim() in (2, 3) + if x.dim() == 2: + x = x[None] + x = [sparse.csr_matrix(xi.bool().cpu().numpy()) for xi in x] + crows = torch.vstack([torch.from_numpy(xi.indptr) for xi in x]) + cols = [torch.from_numpy(xi.indices) for xi in x] + max_cols = max(len(xi) for xi in cols) + cols = [ + torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])]) + for xi in cols + ] + cols = torch.vstack(cols) + if dim == 2: + crows = crows[0] + cols = cols[0] + return crows.to(device), cols.to(device) + + +def crow_col_to_dense(crows: torch.Tensor, + cols: torch.Tensor, + dtype: torch.dtype = torch.float16): + dim = crows.dim() + if dim == 1: + crows = crows[None] + cols = cols[None] + device = crows.device + crows, cols = crows.cpu(), cols.cpu() # faster in cpu + shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1) + x = torch.zeros(shape, dtype=dtype) + for i in range(shape[0]): + for j in range(shape[1]): + x[i, j, cols[i, crows[i, j]:crows[i, j + 1]]] = 1 + if dim == 1: + x = x[0] + return x.to(device) + + +def dense_to_ccol_row(x: torch.Tensor): + """Similar, but to CSC format""" + x = x.transpose(-2, -1) + return dense_to_crow_col(x) + + +def ccol_row_to_dense(ccol: torch.Tensor, + rows: torch.Tensor, + dtype: torch.dtype = torch.float16): + return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous() + + +def _get_sparse_attn_mask_homo_head( + q_len: int, + max_seqlen: int, + dtype: torch.dtype, + device: torch.device, + block_size: int = 128, + local_blocks: int = 4, + vert_stride: int = 4, + return_dense: bool = False, +): + """ + :return: a tuple of 3: + - tuple of crow_indices, col_indices representation + of CSR format. + - block dense mask + - all token dense mask (be aware that it can be + OOM if it is too big) if `return_dense==True`, + otherwise, None + """ + with torch.no_grad(): + num_blocks = triton.cdiv(max_seqlen, block_size) + q_pos = torch.arange(num_blocks)[:, None] + k_pos = torch.arange(num_blocks)[None] + mask_vert_strided = (torch.arange(num_blocks) + 1) % vert_stride == 0 + block_mask_dense = (((q_pos >= k_pos) + & ((q_pos - k_pos < local_blocks) + | mask_vert_strided)).to(device).to(dtype)) + num_blocks_q = triton.cdiv(q_len, block_size) + block_mask_dense_output = (dense_to_crow_col( + block_mask_dense[-num_blocks_q:].contiguous())) + if return_dense: + mask_dense = torch.kron( + block_mask_dense, + block_mask_dense.new_ones((block_size, block_size)), + ) + causal_mask = torch.tril(torch.ones( + max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:] + mask_dense = mask_dense[-q_len:, :max_seqlen] * causal_mask + return ( + block_mask_dense_output, + block_mask_dense, + mask_dense, + ) + else: + return ( + block_mask_dense_output, + block_mask_dense, + None, + ) + + +def binary_mask_to_bias(mask_dense: torch.Tensor): + mask_dense = 1 - mask_dense + mask_dense.masked_fill_(mask_dense.bool(), -torch.inf) + return mask_dense + + +def get_head_sliding_step(n_heads: int, + vert_stride: int, + homo_head: bool = False): + if homo_head: + return 0 + return max(1, int(vert_stride / n_heads)) + + +@lru_cache +def get_sparse_attn_mask( + n_heads: int, + q_len: int, + max_seqlen: int, + dtype: torch.dtype, + device: torch.device, + block_size: int = 64, + local_blocks: int = 4, + vert_stride: int = 4, + homo_head: bool = True, + return_dense: bool = False, + dense_mask_type: str = "binary", +): + """ + :param dense_mask_type: "binary" (0 for skip token, 1 for others) + or "bias" 
(-inf for skip token, 0 or others) + :return: a tuple of 3: + - tuple of crow_indices, col_indices representation + of CSR format. + - block dense mask + - all token dense mask (be aware that it can be OOM if it + is too big) if `return_dense==True`, otherwise, None + """ + assert dense_mask_type in ("binary", "bias") + if homo_head: + with torch.no_grad(): + (crow, col), block_mask_dense, mask_dense = ( + _get_sparse_attn_mask_homo_head( + q_len, + max_seqlen, + dtype, + device, + block_size, + local_blocks, + vert_stride, + return_dense, + )) + crow = crow[None].expand(n_heads, crow.shape[0]) + col = col[None].expand(n_heads, col.shape[0]) + if return_dense: + mask_dense = mask_dense[None].expand(n_heads, + *mask_dense.shape) + if dense_mask_type == "bias": + mask_dense = binary_mask_to_bias(mask_dense) + return (crow, col), block_mask_dense, mask_dense + + with torch.no_grad(): + num_blocks = triton.cdiv(max_seqlen, block_size) + q_pos = torch.arange(num_blocks)[None, :, None] + k_pos = torch.arange(num_blocks)[None, None] + head_sliding_step = get_head_sliding_step(n_heads, vert_stride) + mask_vert_strided = [ + (torch.arange(num_blocks) + h * head_sliding_step + 1) % + vert_stride == 0 for h in range(n_heads) + ] + mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1) + block_mask_dense = (((q_pos >= k_pos) + & ((q_pos - k_pos < local_blocks) + | mask_vert_strided)).to(device).to(dtype)) + num_blocks_q = triton.cdiv(q_len, block_size) + block_mask_dense_output = block_mask_dense[:, -num_blocks_q:] + if return_dense: + mask_dense = torch.kron( + block_mask_dense, + block_mask_dense.new_ones((block_size, block_size)), + ) + causal_mask = torch.tril(torch.ones( + max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:] + mask_dense = mask_dense[..., -q_len:, :max_seqlen] * causal_mask[None] + if dense_mask_type == "bias": + mask_dense = binary_mask_to_bias(mask_dense) + + return ( + dense_to_crow_col(block_mask_dense_output), + block_mask_dense, + mask_dense, + ) + else: + return ( + dense_to_crow_col(block_mask_dense_output), + block_mask_dense, + None, + ) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 30feaa4da254d..e119fdcf11113 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -91,9 +91,21 @@ def forward_decode( scale: float, alibi_slopes: Optional[torch.Tensor], kv_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, ) -> torch.Tensor: - output = torch.empty_like(query) + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: + # use blocksparse paged attention + block_size = value_cache.size(-1) + assert (blocksparse_block_size > 0 and + blocksparse_block_size % block_size == 0), \ + (f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables.") + output = torch.empty_like(query) block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // @@ -107,6 +119,7 @@ def forward_decode( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = (max_seq_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)) + if use_v1: # Run PagedAttention V1. 
ops.paged_attention_v1( @@ -123,6 +136,11 @@ def forward_decode( alibi_slopes, kv_cache_dtype, kv_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, ) else: # Run PagedAttention V2. @@ -155,6 +173,11 @@ def forward_decode( alibi_slopes, kv_cache_dtype, kv_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, ) return output diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index f191461dcd3b7..9ceda3431b898 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -29,7 +29,14 @@ def get_attn_backend( dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, + is_blocksparse: bool = False, ) -> Type[AttentionBackend]: + + if is_blocksparse: + logger.info("Using BlocksparseFlashAttention backend.") + from vllm.attention.backends.blocksparse_attn import ( + BlocksparseFlashAttentionBackend) + return BlocksparseFlashAttentionBackend """Determine which attention backend to use and only import the selected backend module. """ diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index db3fc85decd70..0df0223b9dbb2 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -100,6 +100,7 @@ def _create_logprobs( token_logprob = step_top_logprobs[token_id].logprob token = step_top_logprobs[token_id].decoded_token logprobs.tokens.append(token) + token_logprob = max(token_logprob, -9999.0) logprobs.token_logprobs.append(token_logprob) if num_output_top_logprobs: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 6aec104be8da4..a92abe6b5b8dc 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -56,6 +56,7 @@ "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), + "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), } _EMBEDDING_MODELS = { diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py new file mode 100644 index 0000000000000..0c5298eb6f100 --- /dev/null +++ b/vllm/model_executor/models/phi3_small.py @@ -0,0 +1,447 @@ +import math +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput + + +def 
load_column_parallel_weight(param: torch.nn.Parameter, + loaded_weight: torch.Tensor): + tp = get_tensor_model_parallel_world_size() + rk = get_tensor_model_parallel_rank() + assert param.size(0) * tp == loaded_weight.size(0) + s = rk * param.size(0) + e = (rk + 1) * param.size(0) + loaded_weight = loaded_weight[s:e] + assert param.shape == loaded_weight.shape + param.data.copy_(loaded_weight) + + +class HeadMajorQKVParallelLinear(QKVParallelLinear): + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor): + return load_column_parallel_weight(param, loaded_weight) + + +class HeadMajorColumnParallelLinear(MergedColumnParallelLinear): + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor): + return load_column_parallel_weight(param, loaded_weight) + + +@torch.jit.script +def quick_gelu(x): + return x * torch.sigmoid(1.702 * x) + + +@torch.jit.script +def gegelu(input, limit: Optional[float] = None): + a_gelu, a_linear = input[..., ::2], input[..., 1::2] + if limit is not None: + a_gelu = torch.where(torch.isinf(a_gelu), a_gelu, + a_gelu.clamp(min=None, max=limit)) + a_linear = torch.where( + torch.isinf(a_linear), + a_linear, + a_linear.clamp(min=-limit, max=limit), + ) + out_gelu = quick_gelu(a_gelu) + return out_gelu * (a_linear + 1) + + +class Phi3SmallMLP(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + assert (self.config.hidden_act == "gegelu" + ), "Only `gegelu` is supported for the 4.7 series of models .." + self.hidden_size = config.hidden_size + self.gegelu_limit = config.gegelu_limit + self.intermediate_size = config.intermediate_size + + self.up_proj = HeadMajorColumnParallelLinear( + self.hidden_size, + 2 * [self.intermediate_size], + bias=True, + quant_config=quant_config, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + ) + + def forward(self, x): + gate_up, _ = self.up_proj(x) + x = gegelu(gate_up) + x, _ = self.down_proj(x) + return x + + +class Phi3SmallSelfAttention(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.config = config + self.sparse_block_size = config.blocksparse_block_size + self.homo_heads = config.blocksparse_homo_head_pattern + self.local_blocks = config.blocksparse_num_local_blocks + self.vert_stride = config.blocksparse_vert_stride + + assert (config.blocksparse_block_size == + config.blocksparse_triton_kernel_block_size) + + self.hidden_size = config.hidden_size + # Number of Query Heads + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // self.num_heads + self.tp_size = get_tensor_model_parallel_world_size() + # Number of total Key Value Heads before tensor parallel + self.num_key_value_heads = config.num_key_value_heads + self.num_q_per_kv = self.num_heads // self.num_key_value_heads + if self.tp_size > 1: + assert self.num_key_value_heads % self.tp_size == 0 + self.num_kv_heads_per_partion = max( + 1, self.num_key_value_heads // self.tp_size) + self.num_heads_per_partition = self.num_heads // self.tp_size + + self.max_position_embeddings = config.max_position_embeddings + self.rope_embedding_base = config.rope_embedding_base + self.rope_position_scale 
= config.rope_position_scale + self.is_causal = True + + norm_factor = None + if config.mup_use_scaling: + norm_factor = self.head_dim / config.mup_attn_multiplier + else: + norm_factor = math.sqrt(self.head_dim) + self.scale = 1 / norm_factor + + self.query_key_value = HeadMajorQKVParallelLinear( + self.hidden_size, + self.head_dim, + self.num_heads, + self.num_key_value_heads, + bias=True, + quant_config=quant_config, + ) + + self.dense = RowParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config) + + if getattr(self.config, "rope_scaling", None) is not None: + rope_scaling = self.config.rope_scaling + for key in rope_scaling: + if isinstance(rope_scaling[key], list): + rope_scaling[key] = tuple(rope_scaling[key]) + + if "factor" not in rope_scaling: + rope_scaling["factor"] = self.rope_position_scale + else: + rope_scaling = { + "type": "linear", + "factor": self.rope_position_scale, + } + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_embedding_base, + rope_scaling=rope_scaling, + ) + + # blocksparse params + self.blocksparse_block_size = config.blocksparse_block_size + self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks + self.blocksparse_vert_stride = config.blocksparse_vert_stride + + use_dense_attn = (getattr(self.config, + "dense_attention_every_n_layers", None) + and (self.layer_idx + 1) % + self.config.dense_attention_every_n_layers == 0) + + bs_params = None + if not use_dense_attn: + bs_params = { + 'max_seqlen': self.max_position_embeddings, + 'num_heads': self.num_heads_per_partition, + "num_kv_heads": self.num_kv_heads_per_partion, + "block_size": self.sparse_block_size, + "local_blocks": self.local_blocks, + "vert_stride": self.vert_stride, + "homo_head": self.homo_heads + } + + self.attn = Attention( + self.num_heads_per_partition, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads_per_partion, + cache_config=cache_config, + quant_config=quant_config, + blocksparse_params=bs_params, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + qkv, _ = self.query_key_value(hidden_states) + + qkv = qkv.view(qkv.shape[:-1] + + (-1, (self.num_q_per_kv + 2), self.head_dim)) + q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2) + + # NOTE: this is required by RotaryEmbed, which indeed does not have to + # TODO: allow 3D QK for rotary forward + q = q.reshape(-1, self.head_dim * self.num_heads_per_partition) + k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) + v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata=attn_metadata) + output, _ = self.dense(attn_output) + + return output + + +class Phi3SmallDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Phi3SmallSelfAttention(config, + layer_idx, + cache_config=cache_config, + quant_config=quant_config) + self.mlp = Phi3SmallMLP(config, quant_config) + + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + 
self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_epsilon) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class Phi3SmallModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_tokens = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.mup_embedding_multiplier = config.mup_embedding_multiplier + self.layers = nn.ModuleList([ + Phi3SmallDecoderLayer(config, layer_idx, cache_config, + quant_config) + for layer_idx in range(config.num_hidden_layers) + ]) + + self.final_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.LongTensor, + positions: Optional[torch.LongTensor], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata = None, + ): + hidden_states = self.embed_tokens(input_ids) + if (self.mup_embedding_multiplier is not None + and self.mup_embedding_multiplier > 0.0): + hidden_states = hidden_states * self.mup_embedding_multiplier + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i], + attn_metadata, + ) + hidden_states = self.final_layernorm(hidden_states) + return hidden_states + + +class Phi3SmallForCausalLM(nn.Module): + _tied_weights_keys = ["lm_head.weight"] + + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ): + super().__init__() + self.config = config + self.quant_config = quant_config + self.model = Phi3SmallModel(config, cache_config, quant_config) + self.vocab_size = config.vocab_size + self.mup_width_multiplier = config.mup_width_multiplier + self.lm_head = ParallelLMHead( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + # tokens in tiktoken but not used + if hasattr(config, 'dummy_token_indices'): + device = self.lm_head.weight.device + self.register_buffer('dummy_token_indices', + torch.LongTensor( + config.dummy_token_indices).to(device), + persistent=False) + else: + self.dummy_token_indices = None + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, value): + self.lm_head = value + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model 
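    # [Illustrative sketch, not part of this patch] `compute_logits` below uses
    # the `dummy_token_indices` buffer registered in __init__ to suppress
    # tiktoken vocab slots the model never produces. In isolation, and with
    # hypothetical shapes, the masking step is simply:
    #
    #     logits = torch.randn(batch_size, vocab_size)
    #     logits.index_fill_(-1, dummy_token_indices, -torch.inf)
    #
    # index_fill_ along the last dimension sets those columns to -inf, so the
    # sampler assigns them zero probability.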
+ + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + if self.dummy_token_indices is not None and logits is not None: + logits.index_fill_(-1, self.dummy_token_indices, -torch.inf) + return logits + + def forward( + self, + input_ids: torch.LongTensor, + positions: Optional[torch.LongTensor], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + output_hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + output_hidden_states = output_hidden_states + return output_hidden_states + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + + next_tokens = self.sampler(logits / self.mup_width_multiplier, + sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + self.lm_head.weight.data.copy_(self.model.embed_tokens.weight.data) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f36d84dbdf7f9..044eec6410a54 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -63,4 +63,4 @@ def get_hf_text_config(config: PretrainedConfig): assert hasattr(config.text_config, "num_attention_heads") return config.text_config else: - return config + return config \ No newline at end of file From 3fe7e528506f7f6ed78a43431077ecefd77d3b41 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 24 May 2024 23:49:49 -0700 Subject: [PATCH 035/154] [Misc] add logging level env var (#5045) --- .github/ISSUE_TEMPLATE/400-bug report.yml | 2 ++ vllm/envs.py | 5 +++++ vllm/logger.py | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml index 08120ad8e5a60..ce980c3f4a01d 100644 --- a/.github/ISSUE_TEMPLATE/400-bug report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -59,6 +59,8 @@ body: Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues. + If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs. placeholder: | A clear and concise description of what the bug is. 
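This commit threads a new `VLLM_LOGGING_LEVEL` environment variable from `vllm/envs.py` into the default logging configuration (diffs below). A minimal, standalone sketch of the pattern it follows — not the actual vLLM code, and the handler/logger names here are illustrative only:

    import logging
    import os

    # Default to INFO, mirroring the lambda added to vllm/envs.py below.
    VLLM_LOGGING_LEVEL = os.getenv("VLLM_LOGGING_LEVEL", "INFO")

    # logging accepts level names as strings ("DEBUG", "INFO", ...).
    handler = logging.StreamHandler()
    handler.setLevel(VLLM_LOGGING_LEVEL)

    logger = logging.getLogger("vllm_sketch")
    logger.addHandler(handler)
    logger.setLevel(VLLM_LOGGING_LEVEL)
    logger.debug("only visible when VLLM_LOGGING_LEVEL=DEBUG is exported")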
diff --git a/vllm/envs.py b/vllm/envs.py index 56ff79e0cdea9..bef343d08429c 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -22,6 +22,7 @@ VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 + VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None @@ -178,6 +179,10 @@ "VLLM_LOGGING_CONFIG_PATH": lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"), + # this is used for configuring the default logging level + "VLLM_LOGGING_LEVEL": + lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"), + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging diff --git a/vllm/logger.py b/vllm/logger.py index 153cdfb373bb4..3c6bf0803a624 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -14,6 +14,7 @@ VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH +VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" _DATE_FORMAT = "%m-%d %H:%M:%S" @@ -30,7 +31,7 @@ "vllm": { "class": "logging.StreamHandler", "formatter": "vllm", - "level": "INFO", + "level": VLLM_LOGGING_LEVEL, "stream": "ext://sys.stdout", }, }, From 8768b3f9db66b4b420f57220d054519efcfad00d Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sat, 25 May 2024 10:00:14 -0700 Subject: [PATCH 036/154] [Dynamic Spec Decoding] Minor fix for disabling speculative decoding (#5000) --- .../spec_decode/e2e/test_ngram_correctness.py | 41 +++++++++++++++++++ tests/spec_decode/test_dynamic_spec_decode.py | 16 +++++--- vllm/spec_decode/spec_decode_worker.py | 17 +++++--- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index c2004ff061a1e..d475d37af6425 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -170,3 +170,44 @@ def test_ngram_different_k(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + "speculative_disable_by_batch_size": 4 + }]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_ngram_disable_queue(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that ngram speculative decoding produces exact equality + to without spec decode with many different values of k and + different ngram_prompt_lookup_max. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index 948a74b22f0ae..48fa862b2e41a 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest import torch @@ -13,9 +13,9 @@ from .utils import create_batch, mock_worker -@pytest.mark.parametrize('queue_size', [2, 4]) -@pytest.mark.parametrize('batch_size', [1, 2, 3, 6]) -@pytest.mark.parametrize('k', [1, 2, 5, 7, 10]) +@pytest.mark.parametrize('queue_size', [4]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('k', [1]) @torch.inference_mode() def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int): """Verify that speculative tokens are disabled when the batch size @@ -42,8 +42,12 @@ def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int): num_lookahead_slots=k, running_queue_size=queue_size) - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) + if queue_size > disable_by_batch_size: + with patch.object(worker, + '_run_no_spec', + side_effect=ValueError(exception_secret)), \ + pytest.raises(ValueError, match=exception_secret): + worker.execute_model(execute_model_req=execute_model_req) # When the batch size is larger than the threshold, # we expect no speculative tokens (0). diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3462a876c3e90..150e8db0c8aad 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -273,10 +273,17 @@ def execute_model( self._maybe_disable_speculative_tokens( disable_all_speculation, execute_model_req.seq_group_metadata_list) - # If no spec tokens, call the proposer and scorer workers normally. - # Used for prefill. + # Speculative decoding is disabled in the following cases: + # 1. Prefill phase: Speculative decoding is not + # used during the prefill phase. + # 2. Auto-disable enabled: The running queue size exceeds + # the specified threshold. + # 3. No request: There are no requests in the batch. + # In any of these cases, the proposer and scorer workers + # are called normally. if num_lookahead_slots == 0 or len( - execute_model_req.seq_group_metadata_list) == 0: + execute_model_req.seq_group_metadata_list + ) == 0 or disable_all_speculation: return self._run_no_spec(execute_model_req, skip_proposer=disable_all_speculation) @@ -316,8 +323,8 @@ def _maybe_disable_speculative_tokens( @nvtx_range("spec_decode_worker._run_no_spec") def _run_no_spec(self, execute_model_req: ExecuteModelRequest, skip_proposer: bool) -> List[SamplerOutput]: - """Run a prefill step, without any speculation. The input is sent to - the proposer and scorer model so that the KV cache is consistent + """Run a single generation step without any speculation. The input is + sent to the proposer and scorer model so that the KV cache is consistent between the two. When skip_proposer is True, the proposer model is not called, meaning that the kv-cache in proposer for requests is not updated, so they cannot enable spec decode in the rest decoding. 
From e7e376f0d3993977ba42d51bec96ca8401477b55 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 25 May 2024 10:28:16 -0700 Subject: [PATCH 037/154] [Misc] Make Serving Benchmark More User-friendly (#5044) --- benchmarks/backend_request_func.py | 6 ++++++ benchmarks/benchmark_serving.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 34c22d2d9879a..0141cbfb472a5 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -93,6 +93,9 @@ async def async_request_tgi( output.latency = most_recent_timestamp - st output.success = True output.generated_text = data["generated_text"] + else: + output.error = response.reason or "" + output.success = False except Exception: output.success = False exc_info = sys.exc_info() @@ -280,6 +283,9 @@ async def async_request_openai_completions( output.generated_text = generated_text output.success = True output.latency = latency + else: + output.error = response.reason or "" + output.success = False except Exception: output.success = False exc_info = sys.exc_info() diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0efb4dba06964..dc7288b8b7009 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -217,6 +217,11 @@ def calculate_metrics( else: actual_output_lens.append(0) + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -228,9 +233,9 @@ def calculate_metrics( 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, - mean_tpot_ms=np.mean(tpots) * 1000, - median_tpot_ms=np.median(tpots) * 1000, - p99_tpot_ms=np.percentile(tpots, 99) * 1000, + mean_tpot_ms=np.mean(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, ) return metrics, actual_output_lens @@ -252,6 +257,24 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + best_of=best_of, + use_beam_search=use_beam_search, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. 
Starting main benchmark run...") print(f"Traffic request rate: {request_rate}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) From 67ce9eab837089ee0bd4acde0c69d5c1b48de13b Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 27 May 2024 15:18:17 -0700 Subject: [PATCH 038/154] [Bugfix / Core] Prefix Caching Guards (merged with main) (#4846) Co-authored-by: rsnm2 Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- .../test_disable_sliding_window.py | 44 ++++++++++++ tests/test_config.py | 24 +++++++ vllm/attention/layer.py | 3 +- vllm/config.py | 67 ++++++++++++++++++- vllm/engine/arg_utils.py | 18 +++-- vllm/model_executor/models/llama.py | 4 -- vllm/model_executor/models/mixtral.py | 22 +++--- vllm/model_executor/models/mixtral_quant.py | 4 -- vllm/model_executor/models/qwen2.py | 25 ++++--- vllm/model_executor/models/starcoder2.py | 2 - vllm/model_executor/models/xverse.py | 4 -- 11 files changed, 170 insertions(+), 47 deletions(-) create mode 100644 tests/prefix_caching/test_disable_sliding_window.py diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py new file mode 100644 index 0000000000000..eeac6ab43c05f --- /dev/null +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -0,0 +1,44 @@ +"""Compare the with and without prefix caching. + +Run `pytest tests/prefix_caching/test_prefix_caching.py`. +""" +import pytest + +from tests.conftest import cleanup +from vllm import LLM + +MODEL_LEN_LEN = [ + # Example models with sliding window. + ("bigcode/starcoder2-3b", 4096, 16384), + # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI + + # Confirm model with sliding window works. + # config has "use_sliding_window": false + ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768), + # config has no sliding window attribute. 
+ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048), +] + + +@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) +def test_disable_sliding_window(model_len_len, ): + model, sliding_len, full_len = model_len_len + vllm_disabled_model = LLM(model, disable_sliding_window=True) + vllm_disabled_model.generate("Hi my name is") + model_config = vllm_disabled_model.llm_engine.model_config + assert model_config.max_model_len == sliding_len, ( + "Max len expected to equal sliding_len of %s, but got %s", sliding_len, + model_config.max_model_len) + + del vllm_disabled_model + cleanup() + + vllm_enabled_model = LLM(model, disable_sliding_window=False) + vllm_enabled_model.generate("Hi my name is") + model_config = vllm_enabled_model.llm_engine.model_config + assert model_config.max_model_len == full_len, ( + "Max len expected to equal full_len of %s, but got %s", full_len, + model_config.max_model_len) + + del vllm_enabled_model + cleanup() diff --git a/tests/test_config.py b/tests/test_config.py index 6bc51a53dc07c..7cbdaeca9c4d4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,5 +1,29 @@ +import pytest + from vllm.config import ModelConfig +MODEL_IDS_EXPECTED = [ + ("Qwen/Qwen1.5-7B", 32768), + ("mistralai/Mistral-7B-v0.1", 4096), + ("mistralai/Mistral-7B-Instruct-v0.2", 32768), +] + + +@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED) +def test_disable_sliding_window(model_id_expected): + model_id, expected = model_id_expected + model_config = ModelConfig( + model_id, + model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + disable_sliding_window=True, + ) + assert model_config.max_model_len == expected + def test_get_sliding_window(): TEST_SLIDING_WINDOW = 4096 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b67f04c51d493..db55a31476fed 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -30,7 +30,6 @@ def __init__( scale: float, num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, - sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, blocksparse_params: Optional[Dict[str, Any]] = None, @@ -39,9 +38,11 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size + sliding_window = cache_config.sliding_window else: kv_cache_dtype = "auto" block_size = 16 + sliding_window = None if num_kv_heads is None: num_kv_heads = num_heads diff --git a/vllm/config.py b/vllm/config.py index 33b49a0fb2284..be65660883bd5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -69,6 +69,10 @@ class ModelConfig: max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode + disable_sliding_window: Whether to disable sliding window. If True, + we will disable the sliding window functionality of the model. + If the model does not support sliding window, this argument is + ignored. skip_tokenizer_init: If true, skip initialization of tokenizer and detokenizer. 
served_model_name: The model name used in metrics tag `model_name`, @@ -98,6 +102,7 @@ def __init__( max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: Optional[int] = None, max_logprobs: int = 5, + disable_sliding_window: bool = False, skip_tokenizer_init: bool = False, served_model_name: Optional[Union[str, List[str]]] = None, ) -> None: @@ -122,14 +127,18 @@ def __init__( self.max_seq_len_to_capture = (max_seq_len_to_capture or max_context_len_to_capture) self.max_logprobs = max_logprobs + self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision, rope_scaling) self.hf_text_config = get_hf_text_config(self.hf_config) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_text_config, - max_model_len) + self.max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window_len=self.get_hf_config_sliding_window()) self.served_model_name = get_served_model_name(model, served_model_name) if not self.skip_tokenizer_init: @@ -252,7 +261,7 @@ def verify_with_parallel_config( "must be divisible by pipeline parallel size " f"({pipeline_parallel_size}).") - def get_sliding_window(self) -> Optional[int]: + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled. """ @@ -264,6 +273,15 @@ def get_sliding_window(self) -> Optional[int]: return None return getattr(self.hf_text_config, "sliding_window", None) + def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size, or None if disabled. + """ + # If user disables sliding window, return None. + if self.disable_sliding_window: + return None + # Otherwise get the value from the hf config. + return self.get_hf_config_sliding_window() + def get_vocab_size(self) -> int: return self.hf_text_config.vocab_size @@ -368,6 +386,7 @@ def __init__( self.enable_prefix_caching = enable_prefix_caching self._verify_args() self._verify_cache_dtype() + self._verify_prefix_caching() # Will be set after profiling. self.num_gpu_blocks = None @@ -396,6 +415,19 @@ def _verify_cache_dtype(self) -> None: else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") + def _verify_prefix_caching(self) -> None: + if not self.enable_prefix_caching: + return + + if self.sliding_window is not None: + raise NotImplementedError( + "Prefix caching is not supported with sliding window. " + "Run with --disable-sliding-window to use prefix caching.") + if self.cache_dtype == "fp8": + raise NotImplementedError( + "Prefix caching is not supported for fp8 cache_dtype. " + "Run with --kv-cache-dtype auto to use prefix caching.") + def verify_with_parallel_config( self, parallel_config: "ParallelConfig", @@ -1148,6 +1180,8 @@ def _get_and_verify_dtype( def _get_and_verify_max_len( hf_config: PretrainedConfig, max_model_len: Optional[int], + disable_sliding_window: bool, + sliding_window_len: Optional[int], ) -> int: """Get and verify the model's maximum length.""" derived_max_model_len = float("inf") @@ -1167,6 +1201,7 @@ def _get_and_verify_max_len( "max_seq_length", "seq_len", ] + # Choose the smallest "max_length" from the possible keys. 
max_len_key = None for key in possible_keys: max_len = getattr(hf_config, key, None) @@ -1174,6 +1209,16 @@ def _get_and_verify_max_len( max_len_key = key if max_len < derived_max_model_len \ else max_len_key derived_max_model_len = min(derived_max_model_len, max_len) + + # If sliding window is manually disabled, max_length should be less + # than the sliding window length in the model config. + if disable_sliding_window and sliding_window_len is not None: + max_len_key = "sliding_window" \ + if sliding_window_len < derived_max_model_len else max_len_key + derived_max_model_len = min(derived_max_model_len, sliding_window_len) + + # If none of the keys were found in the config, use a default and + # log a warning. if derived_max_model_len == float("inf"): if max_model_len is not None: # If max_model_len is specified, we use it. @@ -1189,6 +1234,13 @@ def _get_and_verify_max_len( rope_scaling = getattr(hf_config, "rope_scaling", None) if rope_scaling is not None and rope_scaling["type"] != "su": + if disable_sliding_window: + # TODO(robertgshaw): Find a model that supports rope_scaling + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "with rope_scaling. Please raise an issue so we can " + "investigate.") assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "yarn": @@ -1196,6 +1248,8 @@ def _get_and_verify_max_len( "original_max_position_embeddings"] derived_max_model_len *= scaling_factor + # If the user specified a max length, make sure it is smaller than the + # derived length from the HF model config. if max_model_len is None: max_model_len = int(derived_max_model_len) elif max_model_len > derived_max_model_len: @@ -1204,6 +1258,13 @@ def _get_and_verify_max_len( # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) if model_max_length is not None and max_model_len <= model_max_length: + if disable_sliding_window: + # TODO(robertgshaw): Find a model that has model_max_length + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "model_max_length in the config. 
Please raise an issue " + "so we can investigate.") pass else: raise ValueError( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b94f2619ba767..ec2dca8505e3f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -43,6 +43,7 @@ class EngineArgs: max_parallel_loading_workers: Optional[int] = None block_size: int = 16 enable_prefix_caching: bool = False + disable_sliding_window: bool = False use_v2_block_manager: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 @@ -271,6 +272,10 @@ def add_cli_args( parser.add_argument('--enable-prefix-caching', action='store_true', help='Enables automatic prefix caching.') + parser.add_argument('--disable-sliding-window', + action='store_true', + help='Disables sliding window, ' + 'capping to sliding window size') parser.add_argument('--use-v2-block-manager', action='store_true', help='Use BlockSpaceMangerV2.') @@ -569,11 +574,11 @@ def create_engine_config(self, ) -> EngineConfig: model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.dtype, self.seed, self.revision, - self.code_revision, self.rope_scaling, - self.tokenizer_revision, self.max_model_len, - self.quantization, self.quantization_param_path, - self.enforce_eager, self.max_context_len_to_capture, - self.max_seq_len_to_capture, self.max_logprobs, + self.code_revision, self.rope_scaling, self.tokenizer_revision, + self.max_model_len, self.quantization, self.quantization_param_path, + self.sparsity, self.enforce_eager, + self.max_context_len_to_capture, self.max_seq_len_to_capture, + self.max_logprobs, self.disable_sliding_window, self.skip_tokenizer_init, self.served_model_name) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, @@ -660,7 +665,8 @@ def create_engine_config(self, ) -> EngineConfig: if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled): raise ValueError( - "Chunked prefill is not supported with sliding window.") + "Chunked prefill is not supported with sliding window. 
" + "Set --disable-sliding-window to disable sliding window.") return EngineConfig(model_config=model_config, cache_config=cache_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 086f9294c4f1c..2ca55f9270fc7 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -94,7 +94,6 @@ def __init__( max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, - sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() @@ -146,7 +145,6 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window, cache_config=cache_config, quant_config=quant_config) @@ -183,7 +181,6 @@ def __init__( config.original_max_position_embeddings) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - sliding_window = getattr(config, "sliding_window", None) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -198,7 +195,6 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, - sliding_window=sliding_window, cache_config=cache_config, ) self.mlp = LlamaMLP( diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ea95cf7380d54..d6dd7fa1fe9e2 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -246,15 +246,16 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MixtralAttention(nn.Module): - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None) -> None: + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -276,7 +277,6 @@ def __init__(self, self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.sliding_window = sliding_window if isinstance( quant_config, @@ -312,7 +312,6 @@ def __init__(self, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, cache_config=cache_config, quant_config=quant_config) @@ -349,7 +348,6 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - sliding_window=config.sliding_window, cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE( diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 9b99ff729aadd..1894c05e167d6 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -166,7 +166,6 @@ def __init__( max_position: int = 4096 * 32, rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() @@ 
-190,7 +189,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.sliding_window = sliding_window self.qkv_proj = QKVParallelLinear( hidden_size, @@ -217,7 +215,6 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, cache_config=cache_config, quant_config=quant_config) @@ -254,7 +251,6 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - sliding_window=config.sliding_window, cache_config=cache_config, quant_config=quant_config) self.block_sparse_moe = MixtralMoE(config=config, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index ec203c3b9001a..9a4829a27873e 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -86,10 +86,8 @@ def __init__(self, num_kv_heads: int, max_position: int = 4096 * 32, rope_theta: float = 10000, - use_sliding_window: bool = False, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None, rope_scaling: Optional[Tuple] = None) -> None: super().__init__() self.hidden_size = hidden_size @@ -112,7 +110,6 @@ def __init__(self, self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.sliding_window = sliding_window if use_sliding_window else None self.qkv_proj = QKVParallelLinear( hidden_size, @@ -140,7 +137,6 @@ def __init__(self, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, cache_config=cache_config, quant_config=quant_config) @@ -164,7 +160,6 @@ class Qwen2DecoderLayer(nn.Module): def __init__( self, config: Qwen2Config, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -173,18 +168,14 @@ def __init__( # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) - use_sliding_window = (config.use_sliding_window - and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - use_sliding_window=use_sliding_window, cache_config=cache_config, quant_config=quant_config, - sliding_window=config.sliding_window, rope_scaling=rope_scaling) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, @@ -244,8 +235,8 @@ def __init__( config.hidden_size, ) self.layers = nn.ModuleList([ - Qwen2DecoderLayer(config, layer_idx, cache_config, quant_config) - for layer_idx in range(config.num_hidden_layers) + Qwen2DecoderLayer(config, cache_config, quant_config) + for _ in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -302,6 +293,18 @@ def __init__( lora_config: Optional[LoRAConfig] = None, ) -> None: del lora_config + # TODO (@robertgshaw2): see if this can be moved out + if (cache_config.sliding_window is not None + and hasattr(config, "max_window_layers")): + raise ValueError("Sliding window for some but all layers is not " + "supported. This model uses sliding window " + "but `max_window_layers` = %s is less than " + "`num_hidden_layers` = %s. Please open an issue " + "to discuss this feature." 
% ( + config.max_window_layers, + config.num_hidden_layers, + )) + super().__init__() self.config = config self.quant_config = quant_config diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 91ffd0861c39d..4324bf50d4ad1 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -74,7 +74,6 @@ def __init__(self, self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias - self.sliding_window = config.sliding_window self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -101,7 +100,6 @@ def __init__(self, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, cache_config=cache_config, quant_config=quant_config) diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index dda13d83f89a3..1e5280dde3ff9 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -88,7 +88,6 @@ def __init__( max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, - sliding_window: Optional[int] = None, cache_config: Optional[CacheConfig] = None, ) -> None: super().__init__() @@ -134,7 +133,6 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window, cache_config=cache_config, quant_config=quant_config) @@ -167,7 +165,6 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - sliding_window = getattr(config, "sliding_window", None) self.self_attn = XverseAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -178,7 +175,6 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=getattr(config, "bias", False), - sliding_window=sliding_window, cache_config=cache_config, ) self.mlp = XverseMLP( From 2c59c9158f2acee42c53911c80dfc63f7a9fd4d1 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Mon, 27 May 2024 22:26:14 +0000 Subject: [PATCH 039/154] [Core] Allow AQLM on Pascal (#5058) --- vllm/model_executor/layers/quantization/aqlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 83e24fadc1405..730595c3d36d1 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -192,7 +192,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod def get_min_capability(cls) -> int: - return 70 + return 60 @classmethod def get_config_filenames(cls) -> List[str]: From 9fb7b8282fa6f701745b4cef25fd16590fd5e0c0 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 28 May 2024 07:41:43 +0800 Subject: [PATCH 040/154] [Model] Add support for falcon-11B (#5069) --- vllm/model_executor/models/falcon.py | 55 ++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index ba707adb03dfe..9618652f70d23 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -41,7 +41,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( 
- VocabParallelEmbedding) + ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput @@ -246,18 +246,26 @@ def __init__( self.mlp = FalconMLP(config, quant_config) self.config = config - if config.new_decoder_architecture: - # The layer norm before self-attention - self.ln_attn = LayerNorm(hidden_size, - eps=config.layer_norm_epsilon) - # The layer norm before the MLP - self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - else: + if (config.num_ln_in_parallel_attn is None + and config.new_decoder_architecture): + config.num_ln_in_parallel_attn = 2 + + if not config.parallel_attn: + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=config.layer_norm_epsilon) self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - if not config.parallel_attn: - self.post_attention_layernorm = LayerNorm( - hidden_size, eps=config.layer_norm_epsilon) + else: + if config.num_ln_in_parallel_attn == 2: + # The layer norm before self-attention + self.ln_attn = LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) + # The layer norm before the MLP + self.ln_mlp = LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) + else: + self.input_layernorm = LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) self.reduce_row_parallel_results = not (config.new_decoder_architecture or config.parallel_attn) @@ -271,7 +279,7 @@ def forward( ) -> torch.Tensor: residual = hidden_states - if self.config.new_decoder_architecture: + if self.config.num_ln_in_parallel_attn == 2: attention_layernorm_out = self.ln_attn(hidden_states) mlp_layernorm_out = self.ln_mlp(hidden_states) else: @@ -294,6 +302,10 @@ def forward( residual += attention_output mlp_layernorm_out = self.post_attention_layernorm(residual) + if (self.config.new_decoder_architecture and self.config.parallel_attn + and self.config.num_ln_in_parallel_attn == 1): + mlp_layernorm_out = attention_layernorm_out + # MLP. mlp_output, mlp_bias = self.mlp(mlp_layernorm_out) if self.reduce_row_parallel_results and mlp_bias is not None: @@ -375,7 +387,20 @@ def __init__( self.config = config self.quant_config = quant_config self.transformer = FalconModel(config, cache_config, quant_config) - self.lm_head_weight = self.transformer.word_embeddings.weight + # only Falcon-11B doesn't share lm_head weight with word embeddings + # and previous Falcon model doesn't have tie_word_embeddings config + # so we set tie_word_embeddings to True by default + self.tie_word_embeddings = (config.tie_word_embeddings + if config.tie_word_embeddings is not None + else True) + if self.tie_word_embeddings: + self.lm_head_weight = self.transformer.word_embeddings.weight + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + ) + self.lm_head_weight = self.lm_head.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() @@ -419,8 +444,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: - if name == "lm_head.weight": - # Falcon uses tied embeddings. + if name == "lm_head.weight" and self.tie_word_embeddings: + # Falcon uses tied embeddings except Falcon-11b. continue # Skip loading extra bias for GPTQ models. 
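Condensed, the lm_head handling introduced by this Falcon-11B change follows the rule below (a sketch of the hunk above, not the literal vLLM loader; `config`, `transformer` and `lm_head` stand for the attributes set in __init__):

    # Falcon-11B is the only variant that does not tie lm_head to the word
    # embeddings; older Falcon configs omit the flag, which is treated as True.
    tie = (config.tie_word_embeddings
           if config.tie_word_embeddings is not None else True)
    if tie:
        lm_head_weight = transformer.word_embeddings.weight  # shared weight;
        # checkpoint tensors named "lm_head.weight" are skipped in load_weights
    else:
        lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
        lm_head_weight = lm_head.weight  # loaded separately from the checkpoint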
if name.endswith(".bias") and name not in params_dict: From 954c332a5c9b805e0577a750e9d800ec43ce44c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Moskal?= Date: Mon, 27 May 2024 19:07:07 -0700 Subject: [PATCH 041/154] [Core] Sliding window for block manager v2 (#4545) Co-authored-by: Ruth Evans --- tests/core/block/e2e/conftest.py | 26 +++ tests/core/block/e2e/test_correctness.py | 11 +- .../e2e/test_correctness_sliding_window.py | 168 ++++++++++++++++++ tests/core/block/test_block_manager_v2.py | 69 +++++++ vllm/attention/ops/prefix_prefill.py | 6 +- vllm/core/block/block_table.py | 34 +++- vllm/core/block/cpu_gpu_block_allocator.py | 74 ++++++++ vllm/core/block/interfaces.py | 9 + vllm/core/block_manager_v2.py | 24 ++- vllm/engine/arg_utils.py | 3 +- vllm/worker/cache_engine.py | 5 +- vllm/worker/model_runner.py | 73 +++++--- 12 files changed, 457 insertions(+), 45 deletions(-) create mode 100644 tests/core/block/e2e/test_correctness_sliding_window.py diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index b0d62c8993d3f..e870597b7a011 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,3 +1,5 @@ +from typing import Callable, Iterable, Optional + import pytest from vllm import LLM @@ -40,3 +42,27 @@ def generator_inner(): for llm in generator_inner(): yield llm del llm + + +def get_text_from_llm_generator(llm_generator: Iterable[LLM], + prompts, + sampling_params, + llm_cb: Optional[Callable[[LLM], + None]] = None): + for llm in llm_generator: + if llm_cb: + llm_cb(llm) + outputs = llm.generate(prompts, sampling_params, use_tqdm=True) + text = [output.outputs[0].text for output in outputs] + del llm + + return text + + +def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): + for llm in llm_generator: + outputs = llm.generate(prompts, sampling_params, use_tqdm=True) + token_ids = [output.outputs[0].token_ids for output in outputs] + del llm + + return token_ids diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index c3666da7542b5..3713ef2fed4d1 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -4,6 +4,8 @@ from vllm import SamplingParams +from .conftest import get_token_ids_from_llm_generator + @pytest.mark.parametrize( "common_llm_kwargs", @@ -444,12 +446,3 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids - - -def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): - for llm in llm_generator: - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - token_ids = [output.outputs[0].token_ids for output in outputs] - del llm - - return token_ids diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py new file mode 100644 index 0000000000000..e98292e807d73 --- /dev/null +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -0,0 +1,168 @@ +import random +from typing import List + +import pytest + +from vllm import LLM, SamplingParams + +from .conftest import get_text_from_llm_generator + +# relatively small model with 4k sliding window +MODEL = "bigcode/starcoder2-3b" +BLOCK_SIZE = 16 + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": MODEL, + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + "block_size": BLOCK_SIZE, + # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008 + "num_gpu_blocks_override": 100000 // BLOCK_SIZE, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{ + "use_v2_block_manager": False +}]) +@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("batch_size", [5]) +@pytest.mark.parametrize("seed", [1]) +def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, + batch_size, seed): + """ + The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then + asks for value of one of them (which is outside the sliding window). + If we tell it upfront which we are going to be looking for, then + it answers correctly (mostly). + + Additionally, we compare the results of the v1 and v2 managers. + """ + sampling_params = SamplingParams( + max_tokens=1024, + ignore_eos=True, + temperature=0.0, + ) + + prompts, answer, indices = prep_prompts(batch_size) + + print('Getting token ids from block manager v1') + baseline_texts = get_text_from_llm_generator(baseline_llm_generator, + prompts, + sampling_params, + llm_cb=check_window(prompts)) + + check_answers(indices, answer, baseline_texts) + + print('Getting token ids from block manager v2') + test_texts = get_text_from_llm_generator(test_llm_generator, prompts, + sampling_params) + check_answers(indices, answer, test_texts) + + cmp = [ + expected_text == actual_text + for expected_text, actual_text in zip(baseline_texts, test_texts) + ] + print(cmp) + # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768 + # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290 + # states that xformers and flash_attn have different ideas about the window + # size anyways + assert sum(cmp) > 0.7 * len(cmp) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": MODEL, + + # skip cuda graph creation for fast test. + "enforce_eager": True, + "block_size": BLOCK_SIZE, + "num_gpu_blocks_override": 100000 // BLOCK_SIZE, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "enable_chunked_prefill": True +}]) +@pytest.mark.parametrize("batch_size", [5]) +@pytest.mark.parametrize("seed", [1]) +def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): + """ + This is similar to test_sliding_window_retrival, however, it doesn't + compare against the v1 block manager since v1 doesn't support + chunked prefill with sliding window. + + The results with and without chunked prefill are not the same due to + numerical instabilities. + """ + sampling_params = SamplingParams( + max_tokens=10, + ignore_eos=True, + temperature=0.0, + ) + + prompts, answer, indices = prep_prompts(batch_size) + + # We don't compare with the baseline model here, since the results + # slightly different due to different tailing in attention. + test_texts = get_text_from_llm_generator(test_llm_generator, + prompts, + sampling_params, + llm_cb=check_window(prompts)) + check_answers(indices, answer, test_texts) + + +def prep_prompts(batch_size: int): + """ + Generate prompts which a bunch of assignments, + then asking for the value of one of them. + The prompt is just under 10k tokens; sliding window is 4k + so the answer is outside sliding window, but should still be correct. 
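    For intuition, a prompt built by this helper looks roughly like the
    following (values are illustrative; the real helper draws them at random):

        # We set a number of variables, x42 will be important later
        x30 = 57
        x31 = 12
        ...
        x42 = 83
        ...
        # Now, we check the value of x42:
        assert x42 ==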
+ """ + prompts: List[str] = [] + answer: List[int] = [] + indices: List[int] = [] + random.seed(1) + for _ in range(batch_size): + idx = random.randint(30, 90) + indices.append(idx) + prompt = "```python\n# We set a number of variables, " + \ + f"x{idx} will be important later\n" + ln = random.randint(800, 1100) + for k in range(30, ln): + v = random.randint(10, 99) + if k == idx: + answer.append(v) + prompt += f"x{k} = {v}\n" + prompt += f"# Now, we check the value of x{idx}:\n" + prompt += f"assert x{idx} == " + prompts.append(prompt) + return prompts, answer, indices + + +def check_answers(indices: List[int], answer: List[int], outputs: List[str]): + answer2 = [int(text[0:2].strip()) for text in outputs] + print(list(zip(indices, zip(answer, answer2)))) + numok = 0 + for a1, a2 in zip(answer, answer2): + if a1 == a2: + numok += 1 + frac_ok = numok / len(answer) + print(f"Num OK: {numok}/{len(answer)} {frac_ok}") + assert frac_ok > 0.7 + + +def check_window(prompts: List[str]): + + def inner(llm: LLM): + sliding_window = llm.llm_engine.model_config.get_sliding_window() + assert sliding_window and sliding_window > 0 + assert any( + len(llm.get_tokenizer().tokenize(prompt)) > sliding_window + for prompt in prompts) + + return inner diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 1e8e4ccdfb151..91b047f0e183e 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -101,3 +101,72 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, range(prompt_len + num_slots_to_append + num_lookahead_slots)), block_size)) - len(chunk_list(list(range(prompt_len)), block_size)) assert num_consumed_blocks == expected_consumed_blocks + + +@pytest.mark.parametrize("block_size", [8, 16]) +@pytest.mark.parametrize("prompt_len", [10, 300, 1000]) +@pytest.mark.parametrize("num_slots_to_append", [50]) +@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512]) +def test_sliding_window(block_size, prompt_len, num_slots_to_append, + sliding_window): + """Verify append_slots consumes the correct number of blocks from the block + table. 
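    As a concrete instance of the check performed below (taking the
    block_size=16, sliding_window=200 parameterization): the manager should
    hold at most (200 // 16) + 2 = 14 blocks per sequence plus the shared
    null block, so once a 300- or 1000-token prompt has been processed the
    test asserts check_used(15, 16).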
+ """ + + num_gpu_blocks = 1024 + watermark = 0.1 + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=0, + watermark=watermark, + sliding_window=sliding_window, + ) + + def check_used(min_n, max_n=None): + if max_n is None: + max_n = min_n + used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks() + #print("check", min_n, used, max_n) + assert min_n <= used + assert used <= max_n + + def num_blocks(num_tokens): + return (num_tokens + block_size - 1) // block_size + + check_used(0) + + seq_group = create_seq_group( + seq_prompt_len=prompt_len, + seq_output_lens=[0], + ) + + check_used(0) + + # Allocate seq + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + + check_used(num_blocks(prompt_len)) + + # Seq seq to RUNNING + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + seq.data.update_num_computed_tokens(prompt_len) + check_used(num_blocks(prompt_len)) + + # this is how we compute it in BlockSpaceManagerV2.__init__ + sliding_blocks = (sliding_window // block_size) + 2 + # plus one block for null block + sliding_blocks += 1 + + # Append tokens to the sequeqnce + for token_id in range(num_slots_to_append): + seq.append_token_id(token_id, {token_id: Logprob(0.0)}) + seq.data.update_num_computed_tokens(1) + block_manager.append_slots(seq, num_lookahead_slots=0) + if prompt_len < sliding_window + 10: + check_used(0, sliding_blocks + 1) + else: + check_used(sliding_blocks, sliding_blocks + 1) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 997b25e887e30..b99cf9a50d105 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -697,6 +697,10 @@ def context_attention_fwd(q, grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, + # 0 means "disable" + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + num_warps = 8 if Lk <= 64 else 8 if alibi_slopes is not None: _fwd_kernel_alibi[grid]( @@ -794,7 +798,7 @@ def context_attention_fwd(q, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, BLOCK_N=BLOCK, - SLIDING_WINDOW=sliding_window if sliding_window is not None else 0, + SLIDING_WINDOW=sliding_window, num_warps=num_warps, num_stages=1, ) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index b0d9511fba521..26c704b8de901 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -20,6 +20,10 @@ class BlockTable: _blocks (Optional[List[Block]], optional): An optional list of existing blocks to initialize the BlockTable with. If not provided, an empty BlockTable is created. + max_block_sliding_window (Optional[int], optional): The number of + blocks to keep around for each sequance. If None, all blocks + are kept (eg., when sliding window is not used). + It should at least fit the sliding window size of the model. Attributes: _block_size (int): The maximum number of tokens that can be stored in a @@ -37,6 +41,7 @@ def __init__( block_size: int, block_allocator: DeviceAwareBlockAllocator, _blocks: Optional[List[Block]] = None, + max_block_sliding_window: Optional[int] = None, ): self._block_size = block_size self._allocator = block_allocator @@ -44,6 +49,7 @@ def __init__( _blocks = [] self._blocks: List[Block] = _blocks + self._max_block_sliding_window = max_block_sliding_window # Use helper method instead of directly calculating, as blocks # may not be allocated. 
self._num_full_slots = len(self._get_all_token_ids()) @@ -89,7 +95,8 @@ def allocate(self, def append_token_ids(self, token_ids: List[int], - num_lookahead_slots: int = 0) -> None: + num_lookahead_slots: int = 0, + num_computed_slots: Optional[int] = None) -> None: """Appends a sequence of token IDs to the existing blocks in the BlockTable. @@ -104,13 +111,35 @@ def append_token_ids(self, Args: token_ids (List[int]): The sequence of token IDs to be appended. + num_computed_slots (Optional[int]): The number of KV cache slots + that are already filled (computed). + When sliding window is enabled, this is used to compute how many + blocks to drop at the front of the sequence. + Without sliding window, None can be passed. + Without chunked prefill, it should be the same as + _num_full_slots. """ - assert self._is_allocated + assert self._is_allocated, "no blocks have been allocated" assert len(self._blocks) > 0 + # Drop blocks that are no longer needed due to sliding window + if self._max_block_sliding_window is not None: + null_block = self._allocator.allocate_or_get_null_block() + assert num_computed_slots is not None + end_block_idx = (num_computed_slots // + self._block_size) - self._max_block_sliding_window + for idx in range(0, end_block_idx): + b = self._blocks[idx] + if b is not null_block: + self._allocator.free(b) + self._blocks[idx] = null_block + + # Ensure there are enough empty slots for the new tokens plus + # lookahead slots self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + num_lookahead_slots) + # Update the blocks with the new tokens blocks = self._blocks[self._num_full_slots // self._block_size:] token_blocks = self._chunk_token_blocks_for_append(token_ids) @@ -168,6 +197,7 @@ def fork(self) -> "BlockTable": block_size=self._block_size, block_allocator=self._allocator, _blocks=forked_blocks, + max_block_sliding_window=self._max_block_sliding_window, ) def free(self) -> None: diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 0577ca76ea971..d28a684376974 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -105,11 +105,19 @@ def __init__( Device.GPU: gpu_block_allocator, } + self._null_block: Optional[Block] = None + self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator + def allocate_or_get_null_block(self) -> Block: + if self._null_block is None: + self._null_block = NullBlock( + self.allocate_mutable(None, Device.GPU)) + return self._null_block + def allocate_mutable(self, prev_block: Optional[Block], device: Device) -> Block: """Allocates a new mutable block on the specified device. @@ -149,6 +157,9 @@ def free(self, block: Block) -> None: Args: block (Block): The block to be freed. """ + # Null block should never be freed + if isinstance(block, NullBlock): + return block_id = block.block_id assert block_id is not None allocator = self._block_ids_to_allocator[block_id] @@ -165,6 +176,8 @@ def fork(self, last_block: Block) -> List[Block]: List[Block]: A new list of blocks that shares the same memory as the original sequence. 
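To make the sliding-window drop in BlockTable.append_token_ids above concrete (the sequence length is assumed for illustration): with block_size=16 and max_block_sliding_window=34 (a 512-token window, per the computation in BlockSpaceManagerV2 below), a sequence with num_computed_slots=1000 gives end_block_idx = 1000 // 16 - 34 = 28, so blocks 0..27 are freed and their slots in the table are replaced by the shared null block.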
""" + # do not attempt to fork the null block + assert not isinstance(last_block, NullBlock) block_id = last_block.block_id assert block_id is not None allocator = self._block_ids_to_allocator[block_id] @@ -226,3 +239,64 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: raise NotImplementedError + + +class NullBlock(Block): + """ + Null blocks are used as a placeholders for KV cache blocks that have + been dropped due to sliding window. + This implementation just wraps an ordinary block and prevents it from + being modified. It also allows for testing if a block is NullBlock + via isinstance(). + """ + + def __init__(self, proxy: Block): + super().__init__() + self._proxy = proxy + + def append_token_ids(self, token_ids: List[BlockId]): + raise ValueError("null block should not be modified") + + @property + def block_id(self): + return self._proxy.block_id + + @block_id.setter + def block_id(self, value: Optional[BlockId]): + raise ValueError("null block should not be modified") + + @property + def token_ids(self) -> List[BlockId]: + return self._proxy.token_ids + + @property + def num_empty_slots(self) -> BlockId: + return self._proxy.num_empty_slots + + @property + def is_full(self): + return self._proxy.is_full + + @property + def prev_block(self): + return self._proxy.prev_block + + @property + def computed(self): + return self._proxy.computed + + @computed.setter + def computed(self, value): + self._proxy.computed = value + + @property + def last_accessed(self) -> float: + return self._proxy.last_accessed + + @last_accessed.setter + def last_accessed(self, last_accessed_ts: float): + self._proxy.last_accessed = last_accessed_ts + + @property + def content_hash(self): + return self._proxy.content_hash diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 140fbbb0949cc..8fc4c601106cd 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -203,3 +203,12 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: def get_common_computed_block_ids( self, seq_block_ids: List[List[int]]) -> List[int]: pass + + @abstractmethod + def allocate_or_get_null_block(self) -> Block: + """ + Null blocks are used as a placeholders for KV cache blocks that have + been dropped due to sliding window. + There is at most one null block per allocator. + """ + pass diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index f0bc96564050a..834436c25e160 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -66,9 +66,18 @@ def __init__( self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks - assert sliding_window is None, "Sliding window not yet supported" - - self.block_sliding_window = None + self.sliding_window = sliding_window + # max_block_sliding_window is the max number of blocks that need to be + # allocated + self.max_block_sliding_window = None + if sliding_window is not None: + # +1 here because // rounds down + num_blocks = sliding_window // block_size + 1 + # +1 here because the last block may not be full, + # and so the sequence stretches one more block at the beginning + # For example, if sliding_window is 3 and block_size is 4, + # we may need 2 blocks when the second block only holds 1 token. 
+ self.max_block_sliding_window = num_blocks + 1 self.watermark = watermark assert watermark >= 0.0 @@ -96,10 +105,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) - assert self.block_sliding_window is None - if self.block_sliding_window is not None: + if self.max_block_sliding_window is not None: num_required_blocks = min(num_required_blocks, - self.block_sliding_window) + self.max_block_sliding_window) num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( device=Device.GPU) @@ -125,8 +133,9 @@ def allocate(self, seq_group: SequenceGroup) -> None: block_table = BlockTable( block_size=self.block_size, block_allocator=self.block_allocator, + max_block_sliding_window=self.max_block_sliding_window, ) - assert self.block_sliding_window is None + block_table.allocate(seq.get_token_ids()) self.block_tables[seq.seq_id] = block_table @@ -174,6 +183,7 @@ def append_slots( block_table.append_token_ids( token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), num_lookahead_slots=num_lookahead_slots, + num_computed_slots=seq.data.get_num_computed_tokens(), ) # Return any new copy-on-writes. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ec2dca8505e3f..8291daa05ec80 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -663,7 +663,8 @@ def create_engine_config(self, ) -> EngineConfig: guided_decoding_backend=self.guided_decoding_backend) if (model_config.get_sliding_window() is not None - and scheduler_config.chunked_prefill_enabled): + and scheduler_config.chunked_prefill_enabled + and not scheduler_config.use_v2_block_manager): raise ValueError( "Chunked prefill is not supported with sliding window. " "Set --disable-sliding-window to disable sliding window.") diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 07d51dca226bd..2f0e59f7ae7c9 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -68,8 +68,11 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_layers): + # null block in CpuGpuBlockAllocator requires at least that + # block to be zeroed-out. + # We zero-out everything for simplicity. kv_cache.append( - torch.empty(kv_cache_shape, + torch.zeros(kv_cache_shape, dtype=self.dtype, pin_memory=pin_memory, device=device)) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 87d5f5c1b9d67..5ddd2d1b65f81 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -269,6 +269,12 @@ def _prepare_model_input( if len(seq_group_metadata_list) == 0: return ModelInput.empty(self.device) + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window + self.block_size - + 1) // self.block_size + block_aligned_sliding_window = \ + sliding_window_blocks * self.block_size + for seq_group_metadata in seq_group_metadata_list: seq_ids = list(seq_group_metadata.seq_data.keys()) is_prompt = seq_group_metadata.is_prompt @@ -309,6 +315,30 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) + # These are seq_len/context_len capped to the sliding window. + # They are passed to decode kernel. + # We still need original seq_len/context_len to compute slot + # mapping (and input position) below. + curr_sliding_window_blocks = None + sliding_seq_len = seq_len + sliding_context_len = context_len + + # TODO(sang): This is a hack to make sliding window work with + # paged attn. 
We can remove it if we make paged attn kernel + # to properly handle slinding window attn. + if (self.sliding_window is not None and not is_prompt): + curr_sliding_window_blocks = sliding_window_blocks + if self.scheduler_config.use_v2_block_manager: + # number of elements in last block + suff_len = seq_len % self.block_size + sliding_seq_len = min( + seq_len, block_aligned_sliding_window + suff_len) + if suff_len > 0: + curr_sliding_window_blocks += 1 + else: + sliding_seq_len = min(seq_len, self.sliding_window) + sliding_context_len = sliding_seq_len - 1 + # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. @@ -316,6 +346,13 @@ def _prepare_model_input( assert computed_block_nums is not None context_len = len(computed_block_nums) * self.block_size tokens = tokens[context_len:] + + # need to think what to set it to when we have both sliding + # window and prefix caching... + assert self.sliding_window is None, \ + "Prefix caching is not supported with sliding window" + sliding_context_len = context_len + if self.attn_backend.get_name() == "flash-attn": # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. @@ -329,14 +366,9 @@ def _prepare_model_input( if seq_group_metadata.block_tables is not None: # chunked prefill or decode block_table = seq_group_metadata.block_tables[seq_id] - if self.sliding_window is not None: - # chunked prefill doesn't support sliding window. - assert (not self.scheduler_config. - chunked_prefill_enabled) - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - + if curr_sliding_window_blocks is not None: + block_table = block_table[ + -curr_sliding_window_blocks:] if self.attn_backend.get_name() == "flashinfer": paged_kv_indices.extend(block_table) paged_kv_indptr.append(paged_kv_indptr[-1] + @@ -354,16 +386,9 @@ def _prepare_model_input( block_table = [] block_tables.append(block_table) - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. 
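Working through the new decode-path capping above with concrete numbers (taking the 4k window of the starcoder2 test model, i.e. sliding_window=4096, block_size=16; the sequence length is assumed for illustration): sliding_window_blocks = (4096 + 15) // 16 = 256 and block_aligned_sliding_window = 4096. For a decode sequence with seq_len=10003 under the v2 block manager, suff_len = 10003 % 16 = 3, so sliding_seq_len = min(10003, 4096 + 3) = 4099, sliding_context_len = 4098, and curr_sliding_window_blocks = 256 + 1 = 257, meaning only the last 257 entries of the block table are kept.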
- if (self.sliding_window is not None and not is_prompt): - seq_len = min(seq_len, self.sliding_window) - context_len = seq_len - 1 - - seq_lens.append(seq_len) - context_lens.append(context_len) - query_len = seq_len - context_len + seq_lens.append(sliding_seq_len) + context_lens.append(sliding_context_len) + query_len = sliding_seq_len - sliding_context_len query_lens.append(query_len) input_tokens.extend(tokens) input_positions.extend(list(range(context_len, seq_len))) @@ -380,16 +405,15 @@ def _prepare_model_input( "seq_len: {}, context_len: {}, query_len: {}".format( seq_len, context_len, query_len)) num_decode_tokens += query_len - decode_seq_lens.append(seq_len) + decode_seq_lens.append(sliding_seq_len) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (seq_len - context_len) + lora_index_mapping += [lora_id] * query_len lora_prompt_mapping.extend( [lora_id] * - (seq_len - - context_len if seq_group_metadata.sampling_params + (query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1)) @@ -417,9 +441,10 @@ def _prepare_model_input( start_idx = 0 if self.sliding_window is not None: if is_prompt: - assert context_len == 0, ( + assert self.scheduler_config.use_v2_block_manager \ + or context_len == 0, ( "Prefix caching is currently not supported with " - "sliding window attention") + "sliding window attention in V1 block manager") # It is an optimization. When it is decoding, it is always # 0. When prefill, we use it to not write slots to kv cache # to save memory. From 9929fb22ccbc48f8bbc190ff36360eafc854ba6a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 28 May 2024 08:32:42 -0700 Subject: [PATCH 042/154] [BugFix] Fix Embedding Models with TP>1 (#5075) --- vllm/worker/embedding_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ef02de95fc54e..0ba1200696cab 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -79,6 +79,10 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) + # Only perform pooling in the driver worker. 
+ if not self.is_driver_worker: + return None + return self.model.pooler(hidden_states=hidden_states, pooling_metadata=pooling_metadata) From b22d985cd9e805d6d86ddcb0958e2315396832f9 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Tue, 28 May 2024 11:03:23 -0500 Subject: [PATCH 043/154] [Kernel][ROCm][AMD] Add fused_moe Triton configs for MI300X (#4951) This PR adds Triton kernel configs for the MoE kernel for MI300X --- ...14336,device_name=AMD_Instinct_MI300X.json | 128 ++++++++++++++++++ ...=1792,device_name=AMD_Instinct_MI300X.json | 110 +++++++++++++++ ...=3584,device_name=AMD_Instinct_MI300X.json | 128 ++++++++++++++++++ ...=7168,device_name=AMD_Instinct_MI300X.json | 128 ++++++++++++++++++ 4 files changed, 494 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..93472eb08a462 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_stages": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..5bd9d71e8f9bb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,110 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..02e66280c1a3a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_stages": 1 + }, + "24": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_stages": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_stages": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_stages": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..34c3b593d9799 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_stages": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_stages": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_stages": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 0 + }, 
+ "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_stages": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_stages": 0 + } +} From 54c17a974fa678a4a64d71b9f1279eb520752e13 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 28 May 2024 12:29:09 -0500 Subject: [PATCH 044/154] [Docs] Add Dropbox as sponsors (#5089) --- docs/source/community/sponsors.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 532ce77beb7b8..d167b66267a4d 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -12,6 +12,7 @@ vLLM is a community project. Our compute resources for development and testing a - Crusoe Cloud - Databricks - DeepInfra +- Dropbox - Lambda Lab - NVIDIA - Replicate From 8c9aab438c69882a6231f8a129024986786aa398 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 29 May 2024 04:29:31 +0800 Subject: [PATCH 045/154] [Core] Consolidate prompt arguments to LLM engines (#4328) Co-authored-by: Roger Wang --- .buildkite/test-pipeline.yaml | 9 +- benchmarks/benchmark_latency.py | 11 +- .../{ => dev}/offline_inference/llm.rst | 2 +- .../dev/offline_inference/llm_inputs.rst | 14 + .../dev/offline_inference/offline_index.rst | 8 + .../sampling_params.rst | 0 docs/source/index.rst | 11 +- .../serving/openai_compatible_server.md | 4 +- examples/llava_example.py | 25 +- pyproject.toml | 7 + tests/async_engine/test_async_llm_engine.py | 2 +- tests/async_engine/test_openapi_server_ray.py | 2 +- tests/conftest.py | 23 +- tests/core/test_block_manager.py | 15 +- tests/core/utils.py | 15 +- tests/engine/test_skip_tokenizer_init.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 4 + tests/entrypoints/test_guided_processors.py | 2 + tests/entrypoints/test_llm_encode.py | 144 +++++++ tests/entrypoints/test_llm_generate.py | 137 +++++- tests/entrypoints/test_openai_server.py | 34 +- .../test_server_oot_registration.py | 11 +- tests/lora/test_long_context.py | 8 +- tests/samplers/test_logits_processor.py | 11 +- tests/samplers/test_seeded_generate.py | 6 +- tests/test_cache_block_hashing.py | 11 +- tests/test_inputs.py | 53 +++ tests/test_utils.py | 63 +++ tests/tokenization/test_detokenize.py | 7 +- tests/utils.py | 14 + vllm/__init__.py | 4 + vllm/engine/async_llm_engine.py | 171 ++++---- vllm/engine/llm_engine.py | 269 ++++++++---- vllm/engine/output_processor/util.py | 10 +- vllm/entrypoints/llm.py | 404 +++++++++++++----- vllm/entrypoints/openai/serving_chat.py | 12 +- vllm/entrypoints/openai/serving_completion.py | 17 +- vllm/entrypoints/openai/serving_embedding.py | 42 +- vllm/entrypoints/openai/serving_engine.py | 6 +- vllm/inputs.py | 130 ++++++ vllm/outputs.py | 42 +- vllm/sequence.py | 38 +- vllm/utils.py | 43 +- 43 files changed, 1404 insertions(+), 439 deletions(-) rename docs/source/{ => dev}/offline_inference/llm.rst (86%) create mode 100644 docs/source/dev/offline_inference/llm_inputs.rst create mode 100644 docs/source/dev/offline_inference/offline_index.rst rename docs/source/{offline_inference => 
dev}/sampling_params.rst (100%) create mode 100644 tests/entrypoints/test_llm_encode.py create mode 100644 tests/test_inputs.py create mode 100644 tests/test_utils.py create mode 100644 vllm/inputs.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index def8a460e84a7..08e132d0c68bf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -63,9 +63,9 @@ steps: mirror_hardwares: [amd] commands: - # these tests have to be separated, because each one will allocate all posible GPU memory - - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py - - pytest -v -s entrypoints/test_server_oot_registration.py + - pytest -v -s test_inputs.py + - pytest -v -s entrypoints -m llm + - pytest -v -s entrypoints -m openai - label: Examples Test working_dir: "/vllm-workspace/examples" @@ -110,6 +110,9 @@ steps: mirror_hardwares: [amd] command: pytest -v -s test_logits_processor.py +- label: Utils Test + command: pytest -v -s test_utils.py + - label: Worker Test mirror_hardwares: [amd] command: pytest -v -s worker diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a9657f7859750..3146fb33cc27e 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -3,13 +3,14 @@ import json import time from pathlib import Path -from typing import Optional +from typing import List, Optional import numpy as np import torch from tqdm import tqdm from vllm import LLM, SamplingParams +from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -48,7 +49,9 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() + dummy_inputs: List[PromptStrictInputs] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: @@ -59,13 +62,13 @@ def run_to_completion(profile_dir: Optional[str] = None): ], on_trace_ready=torch.profiler.tensorboard_trace_handler( str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, + llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) print(p.key_averages()) else: start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, + llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() diff --git a/docs/source/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.rst similarity index 86% rename from docs/source/offline_inference/llm.rst rename to docs/source/dev/offline_inference/llm.rst index 1a443ea406994..83ba1b6987c6d 100644 --- a/docs/source/offline_inference/llm.rst +++ b/docs/source/dev/offline_inference/llm.rst @@ -1,5 +1,5 @@ LLM Class -========== +========= .. autoclass:: vllm.LLM :members: diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.rst new file mode 100644 index 0000000000000..31c3d16a3c8eb --- /dev/null +++ b/docs/source/dev/offline_inference/llm_inputs.rst @@ -0,0 +1,14 @@ +LLM Inputs +========== + +.. autodata:: vllm.inputs.PromptStrictInputs + +.. autoclass:: vllm.inputs.TextPrompt + :show-inheritance: + :members: + :member-order: bysource + +.. 
autoclass:: vllm.inputs.TokensPrompt + :show-inheritance: + :members: + :member-order: bysource diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst new file mode 100644 index 0000000000000..27dfb0e9df90e --- /dev/null +++ b/docs/source/dev/offline_inference/offline_index.rst @@ -0,0 +1,8 @@ +Offline Inference +================================= + +.. toctree:: + :maxdepth: 1 + + llm + llm_inputs diff --git a/docs/source/offline_inference/sampling_params.rst b/docs/source/dev/sampling_params.rst similarity index 100% rename from docs/source/offline_inference/sampling_params.rst rename to docs/source/dev/sampling_params.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 5db1c9346c45d..5f18fe9ae0a73 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -68,13 +68,6 @@ Documentation getting_started/quickstart getting_started/examples/examples_index -.. toctree:: - :maxdepth: 1 - :caption: Offline Inference - - offline_inference/llm - offline_inference/sampling_params - .. toctree:: :maxdepth: 1 :caption: Serving @@ -108,7 +101,9 @@ Documentation .. toctree:: :maxdepth: 2 :caption: Developer Documentation - + + dev/sampling_params + dev/offline_inference/offline_index dev/engine/engine_index dev/kernel/paged_attention dev/dockerfile/dockerfile diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a775c6addf1d9..15a8761eb5738 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -48,7 +48,7 @@ completion = client.chat.completions.create( ``` ### Extra Parameters for Chat API -The following [sampling parameters (click through to see documentation)](../offline_inference/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -65,7 +65,7 @@ The following extra parameters are supported: ``` ### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../offline_inference/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/examples/llava_example.py b/examples/llava_example.py index 3d22b492654bf..60250c4303fbf 100644 --- a/examples/llava_example.py +++ b/examples/llava_example.py @@ -23,11 +23,15 @@ def run_llava_pixel_values(): "\nUSER: What is the content of this image?\nASSISTANT:") # This should be provided by another online or offline component. - images = torch.load("images/stop_sign_pixel_values.pt") + image = torch.load("images/stop_sign_pixel_values.pt") + + outputs = llm.generate({ + "prompt": + prompt, + "multi_modal_data": + MultiModalData(type=MultiModalData.Type.IMAGE, data=image), + }) - outputs = llm.generate(prompt, - multi_modal_data=MultiModalData( - type=MultiModalData.Type.IMAGE, data=images)) for o in outputs: generated_text = o.outputs[0].text print(generated_text) @@ -46,11 +50,14 @@ def run_llava_image_features(): "\nUSER: What is the content of this image?\nASSISTANT:") # This should be provided by another online or offline component. 
- images = torch.load("images/stop_sign_image_features.pt") - - outputs = llm.generate(prompt, - multi_modal_data=MultiModalData( - type=MultiModalData.Type.IMAGE, data=images)) + image = torch.load("images/stop_sign_image_features.pt") + + outputs = llm.generate({ + "prompt": + prompt, + "multi_modal_data": + MultiModalData(type=MultiModalData.Type.IMAGE, data=image), + }) for o in outputs: generated_text = o.outputs[0].text print(generated_text) diff --git a/pyproject.toml b/pyproject.toml index 96f78c37cfefb..0e9096fb4c035 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,3 +65,10 @@ skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" [tool.isort] use_parentheses = true skip_gitignore = true + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "llm: run tests for vLLM API only", + "openai: run tests for OpenAI API only", +] diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index b69cdc0a21409..10a46422887e3 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -25,7 +25,7 @@ async def step_async(self): return [RequestOutput( request_id=self.request_id)] if self.request_id else [] - async def encode_request_async(self, *args, **kwargs): + async def process_model_inputs_async(self, *args, **kwargs): pass def generate(self, request_id): diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index efa88c7318156..fe558c2b2cc92 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -29,7 +29,7 @@ def server(): ray.shutdown() -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def client(): client = openai.AsyncOpenAI( base_url="http://localhost:8000/v1", diff --git a/tests/conftest.py b/tests/conftest.py index dfec28aa9db4d..d23216966247c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,7 @@ from vllm import LLM, SamplingParams from vllm.config import TokenizerPoolConfig, VisionLanguageConfig from vllm.distributed import destroy_model_parallel +from vllm.inputs import PromptInputs from vllm.logger import init_logger from vllm.sequence import MultiModalData @@ -587,12 +588,22 @@ def generate( ) -> List[Tuple[List[int], str]]: if images is not None: assert len(prompts) == images.shape[0] - req_outputs = self.model.generate( - prompts, - sampling_params=sampling_params, - multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE, - data=images) - if images is not None else None) + + prompt_inputs: List[PromptInputs] = [] + for i, prompt in enumerate(prompts): + image = None if images is None else images[i:i + 1] + mm_data = None if image is None else MultiModalData( + type=MultiModalData.Type.IMAGE, + data=image, + ) + + prompt_inputs.append({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) + + req_outputs = self.model.generate(prompt_inputs, + sampling_params=sampling_params) outputs = [] for req_output in req_outputs: prompt_str = req_output.prompt diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 22a9f0cf47d32..88cd4f98091f9 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -133,8 +133,11 @@ def test_append_slot_cow(): # Allocate prompt to gpu block. There is one slot left in the block. 
prompt = Sequence(seq_id=1, - prompt="one two three", - prompt_token_ids=[1, 2, 3], + inputs={ + "prompt": "one two three", + "prompt_token_ids": [1, 2, 3], + "multi_modal_data": None + }, block_size=block_size) # Fork the sequence, such that a COW will be required when we append a new @@ -304,7 +307,13 @@ def test_sliding_window_multi_seq(): assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - parent = Sequence(1, "one two three", [0, 1, 2], block_size) + parent = Sequence(seq_id=1, + inputs={ + "prompt": "one two three", + "prompt_token_ids": [0, 1, 2], + "multi_modal_data": None + }, + block_size=block_size) seq_group = SequenceGroup(request_id="1", seqs=[parent], arrival_time=time.time(), diff --git a/tests/core/utils.py b/tests/core/utils.py index 8fb13177a2d6c..1c5724090b69b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -21,7 +21,13 @@ def create_dummy_prompt( # and prompt "0 ... block_size". prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) - prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) + prompt = Sequence(int(request_id), + inputs={ + "prompt": prompt_str, + "prompt_token_ids": prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], arrival_time=time.time(), @@ -51,8 +57,11 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index baa463a316902..338b208723ba9 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -14,7 +14,7 @@ def test_skip_tokenizer_initialization(model: str): with pytest.raises(ValueError) as err: llm.generate("abc", sampling_params) assert "prompts must be None if" in str(err.value) - outputs = llm.generate(prompt_token_ids=[[1, 2, 3]], + outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, sampling_params=sampling_params) assert len(outputs) > 0 completions = outputs[0].outputs diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 74b49726734b5..c45f02fe564a3 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,11 +1,15 @@ import asyncio from dataclasses import dataclass +import pytest + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat MODEL_NAME = "openai-community/gpt2" CHAT_TEMPLATE = "Dummy chat template for testing {}" +pytestmark = pytest.mark.openai + @dataclass class MockModelConfig: diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 41c871ca40bc8..5d4163e96fd87 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -52,6 +52,8 @@ TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") +pytestmark = pytest.mark.openai + def test_guided_logits_processors(): """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/test_llm_encode.py new file mode 100644 index 0000000000000..7c3fbe43a8384 --- /dev/null +++ 
b/tests/entrypoints/test_llm_encode.py @@ -0,0 +1,144 @@ +import weakref +from typing import List + +import pytest + +from vllm import LLM, EmbeddingRequestOutput, PoolingParams + +from ..conftest import cleanup + +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +TOKEN_IDS = [ + # Using ID={0, 1, 2, 3} results in NaN values, + # so we add this offset of 1000 + [1000], + [1000, 1001], + [1000, 1002, 1001], + [1000, 1003, 1001, 1002], +] + +pytestmark = pytest.mark.llm + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +def assert_outputs_equal(o1: List[EmbeddingRequestOutput], + o2: List[EmbeddingRequestOutput]): + assert [o.outputs for o in o1] == [o.outputs for o in o2] + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt', PROMPTS) +def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params) + + v2_output = llm.encode(prompt, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, + prompt_token_ids): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.encode(prompt_token_ids=prompt_token_ids, + pooling_params=pooling_params) + + v2_output = llm.encode({"prompt_token_ids": prompt_token_ids}, + pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params) + + v2_output = llm.encode(PROMPTS, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.encode( + [{ + "prompt": p + } for p in PROMPTS], + pooling_params=pooling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.encode(prompt_token_ids=TOKEN_IDS, + pooling_params=pooling_params) + + v2_output = llm.encode( + [{ + "prompt_token_ids": p + } for p in TOKEN_IDS], + pooling_params=pooling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_multiple_pooling_params(llm: LLM): + pooling_params = [ + PoolingParams(), + PoolingParams(), + PoolingParams(), + PoolingParams(), + ] + + # Multiple PoolingParams should be matched with each prompt + outputs = llm.encode(PROMPTS, pooling_params=pooling_params) + assert len(PROMPTS) == 
len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) + + # Single PoolingParams should be applied to every prompt + single_pooling_params = PoolingParams() + outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) + assert len(PROMPTS) == len(outputs) + + # pooling_params is None, default params should be applied + outputs = llm.encode(PROMPTS, pooling_params=None) + assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index 5e8b7ca4d9977..a00fff91a310e 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -1,21 +1,124 @@ +import weakref +from typing import List + import pytest -from vllm import LLM, SamplingParams +from vllm import LLM, RequestOutput, SamplingParams + +from ..conftest import cleanup + +MODEL_NAME = "facebook/opt-125m" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +TOKEN_IDS = [ + [0], + [0, 1], + [0, 2, 1], + [0, 3, 1, 2], +] -def test_multiple_sampling_params(): +pytestmark = pytest.mark.llm - llm = LLM(model="facebook/opt-125m", + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, max_num_batched_tokens=4096, - tensor_parallel_size=1) + tensor_parallel_size=1, + gpu_memory_utilization=0.10, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): + assert [o.outputs for o in o1] == [o.outputs for o in o2] + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt', PROMPTS) +def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.generate(prompts=prompt, + sampling_params=sampling_params) + + v2_output = llm.generate(prompt, sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.generate({"prompt": prompt}, + sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, + prompt_token_ids): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.generate(prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params) + + v2_output = llm.generate({"prompt_token_ids": prompt_token_ids}, + sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.generate(prompts=PROMPTS, + sampling_params=sampling_params) + + v2_output = llm.generate(PROMPTS, sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.generate( + [{ + "prompt": p + } for p in PROMPTS], + sampling_params=sampling_params, + ) 
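[Editor's aside] The consistency tests above and below exercise the same request through both calling conventions: the legacy keyword form (prompts= / prompt_token_ids=, which now emits a DeprecationWarning while LLM.DEPRECATE_LEGACY is active) and the new inputs form that accepts a string, a {"prompt": ...} or {"prompt_token_ids": ...} dict, or a list of such dicts. A minimal sketch of the new call style outside the test harness; the model name and sampling settings are placeholders for illustration, not prescribed by this patch:

    from vllm import LLM, SamplingParams

    # Sketch only: model and sampling settings are illustrative placeholders.
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)
    greedy = SamplingParams(temperature=0.0, top_p=1.0)

    # Single request: a plain string or an explicit prompt dict.
    llm.generate("Hello, my name is", sampling_params=greedy)
    llm.generate({"prompt": "Hello, my name is"}, sampling_params=greedy)

    # Pre-tokenized request: pass token IDs instead of text.
    llm.generate({"prompt_token_ids": [0, 3, 1, 2]}, sampling_params=greedy)

    # Batched requests: a list of prompt dicts, one entry per request.
    llm.generate([{"prompt": p} for p in ("The capital of France is",
                                          "The future of AI is")],
                 sampling_params=greedy)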
+ assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.generate(prompt_token_ids=TOKEN_IDS, + sampling_params=sampling_params) + + v2_output = llm.generate( + [{ + "prompt_token_ids": p + } for p in TOKEN_IDS], + sampling_params=sampling_params, + ) + assert_outputs_equal(v1_output, v2_output) - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] +@pytest.mark.skip_global_cleanup +def test_multiple_sampling_params(llm: LLM): sampling_params = [ SamplingParams(temperature=0.01, top_p=0.95), SamplingParams(temperature=0.3, top_p=0.95), @@ -24,18 +127,18 @@ def test_multiple_sampling_params(): ] # Multiple SamplingParams should be matched with each prompt - outputs = llm.generate(prompts, sampling_params=sampling_params) - assert len(prompts) == len(outputs) + outputs = llm.generate(PROMPTS, sampling_params=sampling_params) + assert len(PROMPTS) == len(outputs) # Exception raised, if the size of params does not match the size of prompts with pytest.raises(ValueError): - outputs = llm.generate(prompts, sampling_params=sampling_params[:3]) + outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3]) # Single SamplingParams should be applied to every prompt single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) - outputs = llm.generate(prompts, sampling_params=single_sampling_params) - assert len(prompts) == len(outputs) + outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params) + assert len(PROMPTS) == len(outputs) # sampling_params is None, default params should be applied - outputs = llm.generate(prompts, sampling_params=None) - assert len(prompts) == len(outputs) \ No newline at end of file + outputs = llm.generate(PROMPTS, sampling_params=None) + assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e42b49166fdda..619f0b3bd1d17 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -70,7 +70,7 @@ "Swift", "Kotlin" ] -pytestmark = pytest.mark.asyncio +pytestmark = pytest.mark.openai @pytest.fixture(scope="session") @@ -90,6 +90,8 @@ def server(zephyr_lora_files): "--max-model-len", "8192", "--enforce-eager", + "--gpu-memory-utilization", + "0.75", # lora config below "--enable-lora", "--lora-modules", @@ -117,9 +119,11 @@ def embedding_server(zephyr_lora_files): # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", + "--enforce-eager", + "--gpu-memory-utilization", + "0.75", "--max-model-len", "8192", - "--enforce-eager", ]) ray.get(server_runner.ready.remote()) yield server_runner @@ -135,6 +139,7 @@ def client(): yield client +@pytest.mark.asyncio async def test_check_models(server, client: openai.AsyncOpenAI): models = await client.models.list() models = models.data @@ -146,6 +151,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): assert lora_models[1].id == "zephyr-lora2" +@pytest.mark.asyncio @pytest.mark.parametrize( # first test base model, then test loras "model_name", @@ -177,6 +183,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, completion.choices[0].text) >= 5 +@pytest.mark.asyncio @pytest.mark.parametrize( # 
first test base model, then test loras "model_name", @@ -198,6 +205,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, assert choice.logprobs.top_logprobs is None +@pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", @@ -242,6 +250,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, model_name: str): @@ -297,6 +306,7 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", @@ -334,6 +344,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == single_output +@pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", @@ -384,6 +395,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, assert "".join(chunks) == output +@pytest.mark.asyncio @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", @@ -437,6 +449,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, assert texts[0] == texts[1] +@pytest.mark.asyncio async def test_logits_bias(server, client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 @@ -484,6 +497,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): assert first_response != completion.choices[0].text +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_json_completion(server, client: openai.AsyncOpenAI, @@ -506,6 +520,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_json_chat(server, client: openai.AsyncOpenAI, @@ -552,6 +567,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, assert json1["age"] != json2["age"] +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, @@ -572,6 +588,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, @@ -609,6 +626,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, assert ip1 != ip2 +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, @@ -628,6 +646,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, assert completion.choices[i].text in TEST_CHOICE +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, @@ -666,6 +685,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, assert choice1 != 
choice2 +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, @@ -701,6 +721,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) +@pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, @@ -731,6 +752,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, for token, logprob in token_dict.items()) +@pytest.mark.asyncio async def test_response_format_json_object(server, client: openai.AsyncOpenAI): for _ in range(2): resp = await client.chat.completions.create( @@ -748,6 +770,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): assert loaded == {"result": 2}, loaded +@pytest.mark.asyncio async def test_extra_fields(server, client: openai.AsyncOpenAI): with pytest.raises(BadRequestError) as exc_info: await client.chat.completions.create( @@ -763,6 +786,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): assert "extra_forbidden" in exc_info.value.message +@pytest.mark.asyncio async def test_complex_message_content(server, client: openai.AsyncOpenAI): resp = await client.chat.completions.create( model=MODEL_NAME, @@ -782,6 +806,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): assert content == "2" +@pytest.mark.asyncio async def test_custom_role(server, client: openai.AsyncOpenAI): # Not sure how the model handles custom roles so we just check that # both string and complex message content are handled in the same way @@ -812,6 +837,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI): assert content1 == content2 +@pytest.mark.asyncio async def test_guided_grammar(server, client: openai.AsyncOpenAI): simple_sql_grammar = """ start: select_statement @@ -846,6 +872,7 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI): assert content.strip() == ground_truth +@pytest.mark.asyncio @pytest.mark.parametrize( # first test base model, then test loras "model_name", @@ -877,6 +904,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, assert len(logprobs.tokens) > 5 +@pytest.mark.asyncio async def test_long_seed(server, client: openai.AsyncOpenAI): for seed in [ torch.iinfo(torch.long).min - 1, @@ -896,6 +924,7 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [EMBEDDING_MODEL_NAME], @@ -934,6 +963,7 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 5 +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [EMBEDDING_MODEL_NAME], diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 22e65bf7e7da1..3e55d7f4297fb 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -1,7 +1,7 @@ -import multiprocessing import sys import time +import pytest import torch from openai import OpenAI, OpenAIError @@ -10,6 +10,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +pytestmark = pytest.mark.openai + class 
MyOPTForCausalLM(OPTForCausalLM): @@ -26,15 +28,16 @@ def server_function(port): # register our dummy model ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) sys.argv = ["placeholder.py"] + \ - ("--model facebook/opt-125m --dtype" - f" float32 --api-key token-abc123 --port {port}").split() + ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " + f"--dtype float32 --api-key token-abc123 --port {port}").split() import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') def test_oot_registration_for_api_server(): port = get_open_port() - server = multiprocessing.Process(target=server_function, args=(port, )) + ctx = torch.multiprocessing.get_context() + server = ctx.Process(target=server_function, args=(port, )) server.start() client = OpenAI( base_url=f"http://localhost:{port}/v1", diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 3cc4ca3f8940c..cc1d4d620ff8a 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -86,20 +86,18 @@ def generate( def batched_generate( - llm, + llm: vllm.LLM, inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], ): for input in inputs: prompt, sampling_param, lora_req = input - requests_data = llm._validate_and_prepare_requests( + # Add requests to the engine and run the engine + llm._validate_and_add_requests( prompt, sampling_param, lora_request=lora_req, ) - # Add requests to the engine and run the engine - for request_data in requests_data: - llm._add_request(**request_data) outputs = llm._run_engine(use_tqdm=True) return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index be4c2ea1b7810..0ccbabfff6403 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -35,28 +35,25 @@ def pick_vllm(token_ids, logits): # test logits_processors when prompt_logprobs is not None vllm_model.model._add_request( - prompt=example_prompts[0], + example_prompts[0], params=params_with_logprobs, - prompt_token_ids=None, ) # test prompt_logprobs is not None vllm_model.model._add_request( - prompt=example_prompts[1], + example_prompts[1], params=SamplingParams( prompt_logprobs=3, max_tokens=max_tokens, ), - prompt_token_ids=None, ) # test grouped requests vllm_model.model._add_request( - prompt=example_prompts[2], + example_prompts[2], params=SamplingParams(max_tokens=max_tokens), - prompt_token_ids=None, ) - outputs = vllm_model.model._run_engine(False) + outputs = vllm_model.model._run_engine(use_tqdm=False) assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index ce4501bbf71e5..fef5ff3fb9e8e 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -57,11 +57,7 @@ def test_random_sample_with_seed( sampling_params_seed_1, sampling_params_seed_2, ): - llm._add_request( - prompt=prompt, - prompt_token_ids=None, - params=params, - ) + llm._add_request(prompt, params=params) results = llm._run_engine(use_tqdm=False) all_outputs = [[out.token_ids for out in output.outputs] diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 3b257ac062f56..97864af88e40a 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -70,8 +70,15 @@ def test_auto_prefix_caching(model: str, block_size: int, 
max_num_seqs: int, for prompt in prompts: hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - tokenizer.tokenizer.eos_token_id, lora_request) + seq = Sequence(seq_id, + inputs={ + "prompt": prompt, + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, + block_size=block_size, + eos_token_id=tokenizer.tokenizer.eos_token_id, + lora_request=lora_request) num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): diff --git a/tests/test_inputs.py b/tests/test_inputs.py new file mode 100644 index 0000000000000..887c7101decda --- /dev/null +++ b/tests/test_inputs.py @@ -0,0 +1,53 @@ +from typing import List + +import pytest + +from vllm.inputs import parse_and_batch_prompt + +STRING_INPUTS = [ + '', + 'foo', + 'foo bar', + 'foo baz bar', + 'foo bar qux baz', +] + +TOKEN_INPUTS = [ + [-1], + [1], + [1, 2], + [1, 3, 4], + [1, 2, 4, 3], +] + +INPUTS_SLICES = [ + slice(None, None, -1), + slice(None, None, 2), + slice(None, None, -2), +] + + +def test_parse_single_batch_empty(): + with pytest.raises(ValueError, match="at least one prompt"): + parse_and_batch_prompt([]) + + with pytest.raises(ValueError, match="at least one prompt"): + parse_and_batch_prompt([[]]) + + +@pytest.mark.parametrize('string_input', STRING_INPUTS) +def test_parse_single_batch_string_consistent(string_input: str): + assert parse_and_batch_prompt(string_input) \ + == parse_and_batch_prompt([string_input]) + + +@pytest.mark.parametrize('token_input', TOKEN_INPUTS) +def test_parse_single_batch_token_consistent(token_input: List[int]): + assert parse_and_batch_prompt(token_input) \ + == parse_and_batch_prompt([token_input]) + + +@pytest.mark.parametrize('inputs_slice', INPUTS_SLICES) +def test_parse_single_batch_string_slice(inputs_slice: slice): + assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \ + == parse_and_batch_prompt(STRING_INPUTS[inputs_slice]) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000000..54dc5c6f5bfba --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,63 @@ +import pytest + +from vllm.utils import deprecate_kwargs + +from .utils import error_on_warning + + +def test_deprecate_kwargs_always(): + + @deprecate_kwargs("old_arg", is_deprecated=True) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(): + dummy(new_arg=1) + + +def test_deprecate_kwargs_never(): + + @deprecate_kwargs("old_arg", is_deprecated=False) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with error_on_warning(): + dummy(old_arg=1) + + with error_on_warning(): + dummy(new_arg=1) + + +def test_deprecate_kwargs_dynamic(): + is_deprecated = True + + @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(): + dummy(new_arg=1) + + is_deprecated = False + + with error_on_warning(): + dummy(old_arg=1) + + with error_on_warning(): + dummy(new_arg=1) + + +def test_deprecate_kwargs_additional_message(): + + @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd") + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="abcd"): + dummy(old_arg=1) diff --git 
a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 9bc9becb2a6f1..1d4c74d6bd8da 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -123,8 +123,11 @@ def create_sequence(prompt_token_ids=None): prompt_token_ids = prompt_token_ids or [1] return Sequence( seq_id=0, - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) diff --git a/tests/utils.py b/tests/utils.py index 689d8c8c5ba8a..329842911e159 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,6 +2,8 @@ import subprocess import sys import time +import warnings +from contextlib import contextmanager import ray import requests @@ -87,3 +89,15 @@ def multi_process_tensor_parallel( ray.get(refs) ray.shutdown() + + +@contextmanager +def error_on_warning(): + """ + Within the scope of this context manager, tests will fail if any warning + is emitted. + """ + with warnings.catch_warnings(): + warnings.simplefilter("error") + + yield diff --git a/vllm/__init__.py b/vllm/__init__.py index eb137ce9442e9..b3fc5cc26f19b 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -5,6 +5,7 @@ from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.llm import LLM from vllm.executor.ray_utils import initialize_ray_cluster +from vllm.inputs import PromptStrictInputs, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry from vllm.outputs import (CompletionOutput, EmbeddingOutput, EmbeddingRequestOutput, RequestOutput) @@ -17,6 +18,9 @@ __all__ = [ "LLM", "ModelRegistry", + "PromptStrictInputs", + "TextPrompt", + "TokensPrompt", "SamplingParams", "RequestOutput", "CompletionOutput", diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5a15ed67e3327..d4289c715d9e6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -12,12 +12,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.executor.ray_utils import initialize_ray_cluster, ray +from vllm.inputs import LLMInputs, PromptInputs from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.usage.usage_lib import UsageContext logger = init_logger(__name__) @@ -244,64 +245,69 @@ async def step_async( return request_outputs - async def encode_request_async( + async def process_model_inputs_async( self, - request_id: str, # pylint: disable=unused-argument - prompt: Optional[str], - prompt_token_ids: Optional[List[int]] = None, + request_id: str, + inputs: PromptInputs, lora_request: Optional[LoRARequest] = None, - ): - if prompt_token_ids is None: - assert prompt is not None - prompt_token_ids = await self.tokenizer.encode_async( + ) -> LLMInputs: + if isinstance(inputs, str): + inputs = {"prompt": inputs} + + if "prompt_token_ids" not in inputs: + tokenizer = self.get_tokenizer_group("prompts must be None if " + "skip_tokenizer_init is True") + + prompt_token_ids = await tokenizer.encode_async( request_id=request_id, - prompt=prompt, + prompt=inputs["prompt"], lora_request=lora_request) - return prompt_token_ids + else: + prompt_token_ids = 
inputs["prompt_token_ids"] + + return LLMInputs(prompt_token_ids=prompt_token_ids, + prompt=inputs.get("prompt"), + multi_modal_data=inputs.get("multi_modal_data")) async def add_request_async( self, request_id: str, - prompt: Optional[str], + inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") if arrival_time is None: arrival_time = time.time() - prompt_token_ids = await self.encode_request_async( + + processed_inputs = await self.process_model_inputs_async( + request_id=request_id, inputs=inputs, lora_request=lora_request) + + self._add_processed_request( request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) - - return self.add_request(request_id, - prompt=prompt, - params=params, - prompt_token_ids=prompt_token_ids, - arrival_time=arrival_time, - lora_request=lora_request, - multi_modal_data=multi_modal_data) + processed_inputs=processed_inputs, + params=params, + arrival_time=arrival_time, + lora_request=lora_request, + ) async def check_health_async(self) -> None: self.model_executor.check_health() class AsyncLLMEngine: - """An asynchronous wrapper for LLMEngine. + """An asynchronous wrapper for :class:`LLMEngine`. - This class is used to wrap the LLMEngine class to make it asynchronous. It - uses asyncio to create a background loop that keeps processing incoming - requests. The LLMEngine is kicked by the generate method when there - are requests in the waiting queue. The generate method yields the outputs - from the LLMEngine to the caller. + This class is used to wrap the :class:`LLMEngine` class to make it + asynchronous. It uses asyncio to create a background loop that keeps + processing incoming requests. The :class:`LLMEngine` is kicked by the + generate method when there are requests in the waiting queue. The generate + method yields the outputs from the :class:`LLMEngine` to the caller. - NOTE: For the comprehensive list of arguments, see `LLMEngine`. + NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`. Args: worker_use_ray: Whether to use Ray for model workers. Required for @@ -315,8 +321,8 @@ class AsyncLLMEngine: being printed in log. start_engine_loop: If True, the background task to run the engine will be automatically started in the generate call. - *args: Arguments for LLMEngine. - *kwargs: Arguments for LLMEngine. + *args: Arguments for :class:`LLMEngine`. + **kwargs: Arguments for :class:`LLMEngine`. 
""" _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine @@ -526,22 +532,26 @@ async def run_engine_loop(self): async def add_request( self, request_id: str, - prompt: Optional[str], + inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, ) -> AsyncStream: if self.log_requests: - shortened_prompt = prompt - shortened_token_ids = prompt_token_ids - if self.max_log_len is not None: + if isinstance(inputs, str): + shortened_prompt = inputs + shortened_token_ids = None + else: + shortened_prompt = inputs.get("prompt") + shortened_token_ids = inputs.get("prompt_token_ids") + + max_log_len = self.max_log_len + if max_log_len is not None: if shortened_prompt is not None: - shortened_prompt = shortened_prompt[:self.max_log_len] + shortened_prompt = shortened_prompt[:max_log_len] if shortened_token_ids is not None: - shortened_token_ids = shortened_token_ids[:self. - max_log_len] + shortened_token_ids = shortened_token_ids[:max_log_len] + logger.info( "Received request %s: prompt: %r, " "params: %s, prompt_token_ids: %s, " @@ -562,39 +572,33 @@ async def add_request( arrival_time = time.time() if self.engine_use_ray: - prompt_token_ids = await ( - self.engine.encode_request_async.remote( # type: ignore + processed_inputs = await self.engine.process_model_inputs_async \ + .remote( # type: ignore request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request)) + inputs=inputs, + lora_request=lora_request) else: - prompt_token_ids = await self.engine.encode_request_async( + processed_inputs = await self.engine.process_model_inputs_async( request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, + inputs=inputs, lora_request=lora_request) stream = self._request_tracker.add_request( request_id, - prompt=prompt, + inputs=processed_inputs, params=params, - prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - multi_modal_data=multi_modal_data, ) return stream async def generate( self, - prompt: Optional[str], + inputs: PromptInputs, sampling_params: SamplingParams, request_id: str, - prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -603,14 +607,12 @@ async def generate( from the LLMEngine to the caller. Args: - prompt: The prompt string. Can be None if prompt_token_ids is - provided. + inputs: The inputs to the LLM. See + :class:`~vllm.inputs.PromptInputs` + for more details about the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. - prompt_token_ids: The token IDs of the prompt. If None, we - use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - multi_modal_data: Multi modal data per request. Yields: The output `RequestOutput` objects from the LLMEngine @@ -659,24 +661,20 @@ async def generate( >>> # Process and return the final output >>> ... 
""" - async for output in self.process_request( + async for output in self._process_request( request_id, - prompt, + inputs, sampling_params, - prompt_token_ids, - lora_request, - multi_modal_data, + lora_request=lora_request, ): - yield output + yield LLMEngine.validate_output(output, RequestOutput) async def encode( self, - prompt: Optional[str], + inputs: PromptInputs, pooling_params: PoolingParams, request_id: str, - prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None ) -> AsyncIterator[EmbeddingRequestOutput]: """Generate outputs for a request from an embedding model. @@ -685,14 +683,12 @@ async def encode( from the LLMEngine to the caller. Args: - prompt: The prompt string. Can be None if prompt_token_ids is - provided. + inputs: The inputs to the LLM. See + :class:`~vllm.inputs.PromptInputs` + for more details about the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. - prompt_token_ids: The token IDs of the prompt. If None, we - use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - multi_modal_data: Multi modal data per request. Yields: The output `EmbeddingRequestOutput` objects from the LLMEngine @@ -739,24 +735,21 @@ async def encode( >>> # Process and return the final output >>> ... """ - async for output in self.process_request( + async for output in self._process_request( request_id, - prompt, + inputs, pooling_params, - prompt_token_ids, - lora_request, - multi_modal_data, + lora_request=lora_request, ): - yield output + yield LLMEngine.validate_output(output, EmbeddingRequestOutput) - async def process_request( + async def _process_request( self, request_id: str, - prompt: Optional[str], + inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - prompt_token_ids: Optional[List[int]] = None, + *, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" @@ -764,12 +757,10 @@ async def process_request( stream = await self.add_request( request_id, - prompt, + inputs, params, - prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - multi_modal_data=multi_modal_data, ) try: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index db3141b277b5b..2b716d9953381 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,8 @@ import time -from typing import Iterable, List, Optional, Type, Union +from contextlib import contextmanager +from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional +from typing import Sequence as GenericSequence +from typing import Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer @@ -18,6 +21,7 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.executor.executor_base import ExecutorBase from vllm.executor.ray_utils import initialize_ray_cluster +from vllm.inputs import LLMInputs, PromptInputs from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, @@ -25,8 +29,8 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import 
(EmbeddingSequenceGroupOutput, ExecuteModelRequest, - MultiModalData, PoolerOutput, SamplerOutput, - Sequence, SequenceGroup, SequenceGroupMetadata, + PoolerOutput, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupMetadata, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, @@ -50,6 +54,9 @@ def _load_generation_config_dict(model_config: ModelConfig): return {} +_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) + + class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -60,11 +67,11 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The `LLM` class wraps this class for offline batched inference and the - `AsyncLLMEngine` class wraps this class for online serving. + The :class:`~vllm.LLM` class wraps this class for offline batched inference + and the :class:`AsyncLLMEngine` class wraps this class for online serving. - NOTE: The config arguments are derived from the `EngineArgs` class. For the - comprehensive list of arguments, see `EngineArgs`. + NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs` + class. For the comprehensive list of arguments, see :ref:`engine_args`. Args: model_config: The configuration related to the LLM model. @@ -81,9 +88,60 @@ class LLMEngine: executor_class: The model executor class for managing distributed execution. log_stats: Whether to log statistics. - usage_context: Specified entry point, used for usage info collection + usage_context: Specified entry point, used for usage info collection. """ + DO_VALIDATE_OUTPUT: ClassVar[bool] = False + """A flag to toggle whether to validate the type of request output.""" + + @classmethod + @contextmanager + def enable_output_validation(cls): + cls.DO_VALIDATE_OUTPUT = True + + yield + + cls.DO_VALIDATE_OUTPUT = False + + @classmethod + def validate_output( + cls, + output: object, + output_type: Type[_O], + ) -> _O: + do_validate = cls.DO_VALIDATE_OUTPUT + + if ((TYPE_CHECKING or do_validate) + and not isinstance(output, output_type)): + raise TypeError(f"Expected output of type {output_type}, " + f"but found type {type(output)}") + + return output + + @classmethod + def validate_outputs( + cls, + outputs: GenericSequence[object], + output_type: Type[_O], + ) -> List[_O]: + do_validate = cls.DO_VALIDATE_OUTPUT + + outputs_: List[_O] + if TYPE_CHECKING or do_validate: + outputs_ = [] + for output in outputs: + if not isinstance(output, output_type): + raise TypeError(f"Expected output of type {output_type}, " + f"but found type {type(output)}") + + outputs_.append(output) + else: + outputs_ = outputs + + return outputs_ + + tokenizer: Optional[BaseTokenizerGroup] + def __init__( self, model_config: ModelConfig, @@ -153,12 +211,11 @@ def __init__( self.log_stats = log_stats if not self.model_config.skip_tokenizer_init: - self.tokenizer: BaseTokenizerGroup - self._init_tokenizer() + self.tokenizer = self._init_tokenizer() self.detokenizer = Detokenizer(self.tokenizer) else: - self.detokenizer = None self.tokenizer = None + self.detokenizer = None self.seq_counter = Counter() self.generation_config_fields = _load_generation_config_dict( @@ -320,14 +377,26 @@ def __del__(self): if model_executor := getattr(self, "model_executor", None): model_executor.shutdown() + MISSING_TOKENIZER_GROUP_MSG = ("Unable to get tokenizer because " + "skip_tokenizer_init is True") + + def get_tokenizer_group( + self, + 
fail_msg: str = MISSING_TOKENIZER_GROUP_MSG) -> BaseTokenizerGroup: + if self.tokenizer is None: + raise ValueError(fail_msg) + + return self.tokenizer + def get_tokenizer(self) -> "PreTrainedTokenizer": - return self.tokenizer.get_lora_tokenizer(None) + return self.get_tokenizer_group().get_lora_tokenizer(None) def get_tokenizer_for_seq(self, sequence: Sequence) -> "PreTrainedTokenizer": - return self.tokenizer.get_lora_tokenizer(sequence.lora_request) + return self.get_tokenizer_group().get_lora_tokenizer( + sequence.lora_request) - def _init_tokenizer(self, **tokenizer_init_kwargs): + def _init_tokenizer(self, **tokenizer_init_kwargs) -> BaseTokenizerGroup: init_kwargs = dict( tokenizer_id=self.model_config.tokenizer, enable_lora=bool(self.lora_config), @@ -337,8 +406,9 @@ def _init_tokenizer(self, **tokenizer_init_kwargs): trust_remote_code=self.model_config.trust_remote_code, revision=self.model_config.tokenizer_revision) init_kwargs.update(tokenizer_init_kwargs) - self.tokenizer = get_tokenizer_group( - self.parallel_config.tokenizer_pool_config, **init_kwargs) + + return get_tokenizer_group(self.parallel_config.tokenizer_pool_config, + **init_kwargs) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) @@ -348,29 +418,85 @@ def _verify_args(self) -> None: self.lora_config.verify_with_scheduler_config( self.scheduler_config) - def encode_request( + def _get_eos_token_id( + self, lora_request: Optional[LoRARequest]) -> Optional[int]: + if self.tokenizer is None: + logger.warning("Using None for EOS token id because tokenizer " + "is not initialized") + return None + + return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id + + def _add_processed_request( self, - request_id: str, # pylint: disable=unused-argument - prompt: Optional[str], - prompt_token_ids: Optional[List[int]] = None, + request_id: str, + processed_inputs: LLMInputs, + params: Union[SamplingParams, PoolingParams], + arrival_time: float, + lora_request: Optional[LoRARequest], + ) -> None: + # Create the sequences. + block_size = self.cache_config.block_size + seq_id = next(self.seq_counter) + eos_token_id = self._get_eos_token_id(lora_request) + + seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id, + lora_request) + + # Create a SequenceGroup based on SamplingParams or PoolingParams + if isinstance(params, SamplingParams): + seq_group = self._create_sequence_group_with_sampling( + request_id, + seq, + params, + arrival_time=arrival_time, + lora_request=lora_request, + ) + elif isinstance(params, PoolingParams): + seq_group = self._create_sequence_group_with_pooling( + request_id, + seq, + params, + arrival_time=arrival_time, + lora_request=lora_request, + ) + else: + raise ValueError( + "Either SamplingParams or PoolingParams must be provided.") + + # Add the sequence group to the scheduler. 
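[Editor's aside] For context, a hedged sketch of how a request reaches this add-to-scheduler step under the refactored API: LLMEngine.add_request() now takes a PromptInputs value, normalizes it via process_model_inputs(), and passes the resulting LLMInputs to _add_processed_request(), which builds the Sequence and SequenceGroup handed to the scheduler. Engine construction via from_engine_args and the model name are assumptions for illustration, not part of this diff:

    from vllm import EngineArgs, SamplingParams
    from vllm.engine.llm_engine import LLMEngine

    # Assumed setup for illustration; only the add_request()/step() usage
    # reflects the API changed in this patch.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

    # Both a plain string and a token-id dict are accepted as inputs.
    engine.add_request("req-0", "Hello, my name is",
                       SamplingParams(temperature=0.0))
    engine.add_request("req-1", {"prompt_token_ids": [0, 3, 1, 2]},
                       SamplingParams(temperature=0.0))

    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)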
+ self.scheduler.add_seq_group(seq_group) + + def process_model_inputs( + self, + request_id: str, + inputs: PromptInputs, lora_request: Optional[LoRARequest] = None, - ): - if prompt_token_ids is None: - assert prompt is not None - prompt_token_ids = self.tokenizer.encode(request_id=request_id, - prompt=prompt, - lora_request=lora_request) - return prompt_token_ids + ) -> LLMInputs: + if isinstance(inputs, str): + inputs = {"prompt": inputs} + + if "prompt_token_ids" not in inputs: + tokenizer = self.get_tokenizer_group("prompts must be None if " + "skip_tokenizer_init is True") + + prompt_token_ids = tokenizer.encode(request_id=request_id, + prompt=inputs["prompt"], + lora_request=lora_request) + else: + prompt_token_ids = inputs["prompt_token_ids"] + + return LLMInputs(prompt_token_ids=prompt_token_ids, + prompt=inputs.get("prompt"), + multi_modal_data=inputs.get("multi_modal_data")) def add_request( self, request_id: str, - prompt: Optional[str], + inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, ) -> None: """Add a request to the engine's request pool. @@ -380,15 +506,14 @@ def add_request( Args: request_id: The unique ID of the request. - prompt: The prompt string. Can be None if prompt_token_ids is - provided. - params: Parameters for sampling or pooling. SamplingParams - for text generation. PoolingParams for pooling. - prompt_token_ids: The token IDs of the prompt. If None, we - use the tokenizer to convert the prompts to token IDs. + inputs: The inputs to the LLM. See + :class:`~vllm.inputs.PromptInputs` + for more details about the format of each input. + params: Parameters for sampling or pooling. + :class:`~vllm.SamplingParams` for text generation. + :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. - multi_modal_data: Multi modal data per request. Details: - Set arrival_time to the current time if it is None. @@ -419,59 +544,26 @@ def add_request( "not enabled!") if arrival_time is None: arrival_time = time.time() - prompt_token_ids = self.encode_request( - request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) - - # Create the sequences. - block_size = self.cache_config.block_size - seq_id = next(self.seq_counter) - eos_token_id = None - if self.tokenizer: - eos_token_id = self.tokenizer.get_lora_tokenizer( - lora_request).eos_token_id - else: - logger.warning("Use None for EOS token id because tokenizer is " - "not initialized") - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - eos_token_id, lora_request) - # Create a SequenceGroup based on SamplingParams or PoolingParams - if isinstance(params, SamplingParams): - seq_group = self._create_sequence_group_with_sampling( - request_id, - seq, - params, - arrival_time, - lora_request, - multi_modal_data, - ) - elif isinstance(params, PoolingParams): - seq_group = self._create_sequence_group_with_pooling( - request_id, - seq, - params, - arrival_time, - lora_request, - multi_modal_data, - ) - else: - raise ValueError( - "Either SamplingParams or PoolingParams must be provided.") + processed_inputs = self.process_model_inputs(request_id=request_id, + inputs=inputs, + lora_request=lora_request) - # Add the sequence group to the scheduler. 
- self.scheduler.add_seq_group(seq_group) + self._add_processed_request( + request_id=request_id, + processed_inputs=processed_inputs, + params=params, + arrival_time=arrival_time, + lora_request=lora_request, + ) def _create_sequence_group_with_sampling( self, request_id: str, seq: Sequence, sampling_params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, + arrival_time: float, + lora_request: Optional[LoRARequest], ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs @@ -497,8 +589,7 @@ def _create_sequence_group_with_sampling( seqs=[seq], arrival_time=arrival_time, sampling_params=sampling_params, - lora_request=lora_request, - multi_modal_data=multi_modal_data) + lora_request=lora_request) return seq_group @@ -507,9 +598,8 @@ def _create_sequence_group_with_pooling( request_id: str, seq: Sequence, pooling_params: PoolingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, + arrival_time: float, + lora_request: Optional[LoRARequest], ) -> SequenceGroup: """Creates a SequenceGroup with PoolingParams.""" # Defensive copy of PoolingParams, which are used by the pooler @@ -519,7 +609,6 @@ def _create_sequence_group_with_pooling( seqs=[seq], arrival_time=arrival_time, lora_request=lora_request, - multi_modal_data=multi_modal_data, pooling_params=pooling_params) return seq_group @@ -572,7 +661,7 @@ def _process_sequence_group_outputs( def _process_model_outputs( self, - output: List[Union[SamplerOutput, PoolerOutput]], + output: GenericSequence[Union[SamplerOutput, PoolerOutput]], scheduled_seq_groups: List[ScheduledSequenceGroup], ignored_seq_groups: List[SequenceGroup], seq_group_metadata_list: List[SequenceGroupMetadata], @@ -587,7 +676,7 @@ def _process_model_outputs( # Organize outputs by [sequence group][step] instead of # [step][sequence group]. output_by_sequence_group = create_output_by_sequence_group( - sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) + output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs, seq_group_meta in zip( diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 9816e966c1e36..57cc33d911183 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,18 +1,20 @@ from typing import List +from typing import Sequence as GenericSequence +from typing import Union -from vllm.sequence import SamplerOutput, SequenceGroupOutput +from vllm.sequence import PoolerOutput, SamplerOutput, SequenceGroupOutput def create_output_by_sequence_group( - sampler_outputs: List[SamplerOutput], + outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], num_seq_groups: int) -> List[List[SequenceGroupOutput]]: """Helper method which transforms a 2d list organized by [step][sequence group] into [sequence group][step]. 
""" - output_by_sequence_group: List[List[SamplerOutput]] = [ + output_by_sequence_group: List[List[SequenceGroupOutput]] = [ [] for _ in range(num_seq_groups) ] - for step in sampler_outputs: + for step in outputs: for i, sequence_group_output in enumerate(step): output_by_sequence_group[i].append(sequence_group_output) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 015eb5812a844..4cab8aa884fc1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,11 +1,14 @@ -from typing import List, Optional, Union +from contextlib import contextmanager +from typing import ClassVar, List, Optional, Sequence, Union, cast, overload -import torch from tqdm import tqdm from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine +from vllm.inputs import (PromptInputs, PromptStrictInputs, TextPrompt, + TextTokensPrompt, TokensPrompt, + parse_and_batch_prompt) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import EmbeddingRequestOutput, RequestOutput @@ -13,7 +16,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import MultiModalData from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter +from vllm.utils import Counter, deprecate_kwargs logger = init_logger(__name__) @@ -28,8 +31,10 @@ class LLM: mechanism and efficient memory management. NOTE: This class is intended to be used for offline inference. For online - serving, use the `AsyncLLMEngine` class instead. - NOTE: For the comprehensive list of arguments, see `EngineArgs`. + serving, use the :class:`~vllm.AsyncLLMEngine` class instead. + + NOTE: For the comprehensive list of arguments, see + :class:`~vllm.EngineArgs`. Args: model: The name or path of a HuggingFace Transformers model. @@ -86,6 +91,18 @@ class LLM: disable_custom_all_reduce: See ParallelConfig """ + DEPRECATE_LEGACY: ClassVar[bool] = False + """A flag to toggle whether to deprecate the legacy generate/encode API.""" + + @classmethod + @contextmanager + def deprecate_legacy_api(cls): + cls.DEPRECATE_LEGACY = True + + yield + + cls.DEPRECATE_LEGACY = False + def __init__( self, model: str, @@ -147,15 +164,101 @@ def set_tokenizer( ) -> None: self.llm_engine.tokenizer.tokenizer = tokenizer + @overload # LEGACY: single (prompt + optional token ids) + def generate( + self, + prompts: str, + sampling_params: Optional[Union[SamplingParams, + List[SamplingParams]]] = None, + prompt_token_ids: Optional[List[int]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[RequestOutput]: + ... + + @overload # LEGACY: multi (prompt + optional token ids) def generate( self, - prompts: Optional[Union[str, List[str]]] = None, + prompts: List[str], sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None, prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, + ) -> List[RequestOutput]: + ... 
+ + @overload # LEGACY: single (token ids + optional prompt) + def generate( + self, + prompts: Optional[str] = None, + sampling_params: Optional[Union[SamplingParams, + List[SamplingParams]]] = None, + *, + prompt_token_ids: List[int], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[RequestOutput]: + ... + + @overload # LEGACY: multi (token ids + optional prompt) + def generate( + self, + prompts: Optional[List[str]] = None, + sampling_params: Optional[Union[SamplingParams, + List[SamplingParams]]] = None, + *, + prompt_token_ids: List[List[int]], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[RequestOutput]: + ... + + @overload # LEGACY: single or multi token ids [pos-only] + def generate( + self, + prompts: None, + sampling_params: None, + prompt_token_ids: Union[List[int], List[List[int]]], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[RequestOutput]: + ... + + @overload + def generate( + self, + inputs: Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + /, # We may enable `inputs` keyword after removing the old API + *, + sampling_params: Optional[Union[SamplingParams, + Sequence[SamplingParams]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + ) -> List[RequestOutput]: + ... + + @deprecate_kwargs("prompts", + "prompt_token_ids", + "multi_modal_data", + is_deprecated=lambda: LLM.DEPRECATE_LEGACY, + additional_message="Please use the 'inputs' parameter " + "instead.") + def generate( + self, + prompts: Union[Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + Optional[Union[str, List[str]]]] = None, + sampling_params: Optional[Union[SamplingParams, + Sequence[SamplingParams]]] = None, + prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, ) -> List[RequestOutput]: """Generates the completions for the input prompts. @@ -164,49 +267,138 @@ def generate( into a single list and pass it to this method. Args: - prompts: A list of prompts to generate completions for. + inputs: A list of inputs to generate completions for. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt. - prompt_token_ids: A list of token IDs for the prompts. If None, we - use the tokenizer to convert the prompts to token IDs. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. - multi_modal_data: Multi modal data. Returns: A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. """ + if prompt_token_ids is not None or multi_modal_data is not None: + inputs = self._convert_v1_inputs( + prompts=cast(Optional[Union[str, List[str]]], prompts), + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + ) + else: + inputs = cast( + Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + prompts) + if sampling_params is None: # Use default sampling params. 
sampling_params = SamplingParams() - requests_data = self._validate_and_prepare_requests( - prompts, - sampling_params, - prompt_token_ids, - lora_request, - multi_modal_data, + self._validate_and_add_requests( + inputs=inputs, + params=sampling_params, + lora_request=lora_request, ) - # Add requests to the engine and run the engine - for request_data in requests_data: - self._add_request(**request_data) + outputs = self._run_engine(use_tqdm=use_tqdm) + return LLMEngine.validate_outputs(outputs, RequestOutput) - return self._run_engine(use_tqdm) + @overload # LEGACY: single (prompt + optional token ids) + def encode( + self, + prompts: str, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + prompt_token_ids: Optional[List[int]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + ... + @overload # LEGACY: multi (prompt + optional token ids) def encode( self, - prompts: Optional[Union[str, List[str]]] = None, + prompts: List[str], pooling_params: Optional[Union[PoolingParams, - List[PoolingParams]]] = None, + Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + ... + + @overload # LEGACY: single (token ids + optional prompt) + def encode( + self, + prompts: Optional[str] = None, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + *, + prompt_token_ids: List[int], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + ... + + @overload # LEGACY: multi (token ids + optional prompt) + def encode( + self, + prompts: Optional[List[str]] = None, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + *, + prompt_token_ids: List[List[int]], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + ... + + @overload # LEGACY: single or multi token ids [pos-only] + def encode( + self, + prompts: None, + pooling_params: None, + prompt_token_ids: Union[List[int], List[List[int]]], + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + ... + + @overload + def encode( + self, + inputs: Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + /, # We may enable `inputs` keyword after removing the old API + *, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + ) -> List[EmbeddingRequestOutput]: + ... 
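As a usage illustration (an editorial sketch, not part of the patch): the overloads above funnel both `generate` and `encode` into a single positional `inputs` argument, while the legacy `prompts`/`prompt_token_ids` keywords keep working and only raise a `DeprecationWarning` while `deprecate_legacy_api()` is active. The model name, prompt text, and token ids below are placeholders.

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
params = SamplingParams(temperature=0.0, max_tokens=16)

# New-style call: a str, TextPrompt or TokensPrompt (or a sequence of them),
# passed positionally; sampling_params becomes keyword-only.
outputs = llm.generate(
    ["Hello, my name is", {"prompt_token_ids": [1, 2, 3]}],
    sampling_params=params,
)

# Legacy call: still accepted, and it only warns while DEPRECATE_LEGACY is set.
with LLM.deprecate_legacy_api():
    legacy_outputs = llm.generate(prompts=["Hello, my name is"])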
+ + @deprecate_kwargs("prompts", + "prompt_token_ids", + "multi_modal_data", + is_deprecated=lambda: LLM.DEPRECATE_LEGACY, + additional_message="Please use the 'inputs' parameter " + "instead.") + def encode( + self, + prompts: Union[Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + Optional[Union[str, List[str]]]] = None, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, ) -> List[EmbeddingRequestOutput]: """Generates the completions for the input prompts. @@ -215,124 +407,133 @@ def encode( into a single list and pass it to this method. Args: - prompts: A list of prompts to generate completions for. + inputs: The inputs to the LLM. You may pass a sequence of inputs for + batch inference. See :class:`~vllm.inputs.PromptStrictInputs` + for more details about the format of each input. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. - prompt_token_ids: A list of token IDs for the prompts. If None, we - use the tokenizer to convert the prompts to token IDs. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. - multi_modal_data: Multi modal data. Returns: A list of `EmbeddingRequestOutput` objects containing the generated embeddings in the same order as the input prompts. """ + if prompt_token_ids is not None or multi_modal_data is not None: + inputs = self._convert_v1_inputs( + prompts=cast(Optional[Union[str, List[str]]], prompts), + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + ) + else: + inputs = cast( + Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + prompts) + if pooling_params is None: # Use default pooling params. pooling_params = PoolingParams() - requests_data = self._validate_and_prepare_requests( - prompts, - pooling_params, - prompt_token_ids, - lora_request, - multi_modal_data, + self._validate_and_add_requests( + inputs=inputs, + params=pooling_params, + lora_request=lora_request, ) - # Add requests to the engine and run the engine - for request_data in requests_data: - self._add_request(**request_data) + outputs = self._run_engine(use_tqdm=use_tqdm) + return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput) - return self._run_engine(use_tqdm) - - def _validate_and_prepare_requests( + # LEGACY + def _convert_v1_inputs( self, prompts: Optional[Union[str, List[str]]], - params: Union[Union[SamplingParams, PoolingParams], - List[Union[SamplingParams, - PoolingParams]]], # Unified parameter - prompt_token_ids: Optional[List[List[int]]] = None, - lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, - ) -> List[dict]: - """Validates and prepares request data for adding to the engine. + prompt_token_ids: Optional[Union[List[int], List[List[int]]]], + multi_modal_data: Optional[MultiModalData], + ): + # skip_tokenizer_init is now checked in engine - Ensures prompts and token IDs are consistent, and returns a list of - dictionaries with request data for further processing. 
- """ - if prompts is None and prompt_token_ids is None: - raise ValueError("Either prompts or prompt_token_ids must be " - "provided.") - if self.llm_engine.model_config.skip_tokenizer_init \ - and prompts is not None: - raise ValueError("prompts must be None if skip_tokenizer_init " - "is True") - if isinstance(prompts, str): - # Convert a single prompt to a list. - prompts = [prompts] - if (prompts is not None and prompt_token_ids is not None - and len(prompts) != len(prompt_token_ids)): - raise ValueError("The lengths of prompts and prompt_token_ids " - "must be the same.") + if prompts is not None: + prompts = [p["content"] for p in parse_and_batch_prompt(prompts)] + if prompt_token_ids is not None: + prompt_token_ids = [ + p["content"] for p in parse_and_batch_prompt(prompt_token_ids) + ] + num_requests = None if prompts is not None: num_requests = len(prompts) - else: - assert prompt_token_ids is not None + if prompt_token_ids is not None: + if (num_requests is not None + and num_requests != len(prompt_token_ids)): + raise ValueError("The lengths of prompts and prompt_token_ids " + "must be the same.") + num_requests = len(prompt_token_ids) + if num_requests is None: + raise ValueError("Either prompts or prompt_token_ids must be " + "provided.") + + inputs: List[PromptInputs] = [] + for i in range(num_requests): + if prompts is not None: + if prompt_token_ids is not None: + item = TextTokensPrompt( + prompt=prompts[i], + prompt_token_ids=prompt_token_ids[i]) + else: + item = TextPrompt(prompt=prompts[i]) + else: + if prompt_token_ids is not None: + item = TokensPrompt(prompt_token_ids=prompt_token_ids[i]) + else: + raise AssertionError + + if multi_modal_data is not None: + item["multi_modal_data"] = multi_modal_data + + inputs.append(item) + + return inputs + + def _validate_and_add_requests( + self, + inputs: Union[PromptStrictInputs, Sequence[PromptStrictInputs]], + params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, + Sequence[PoolingParams]], + lora_request: Optional[LoRARequest], + ) -> None: + if isinstance(inputs, (str, dict)): + # Convert a single prompt to a list. + inputs = [inputs] + + num_requests = len(inputs) if isinstance(params, list) and len(params) != num_requests: raise ValueError("The lengths of prompts and params " "must be the same.") - if multi_modal_data: - multi_modal_data.data = multi_modal_data.data.to(torch.float16) # Add requests to the engine. 
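For orientation (an illustrative sketch of the private helper defined above, not part of the diff): `_convert_v1_inputs` turns the legacy keyword combinations into the typed-dict inputs the engine now consumes. The prompt text and token ids below are made up.

# Both prompts and token ids given -> TextTokensPrompt entries.
inputs = llm._convert_v1_inputs(
    prompts=["Hello world"],
    prompt_token_ids=[[15496, 995]],
    multi_modal_data=None,
)
# -> [{"prompt": "Hello world", "prompt_token_ids": [15496, 995]}]

# Token ids only -> TokensPrompt entries.
inputs = llm._convert_v1_inputs(
    prompts=None,
    prompt_token_ids=[[1, 2, 3]],
    multi_modal_data=None,
)
# -> [{"prompt_token_ids": [1, 2, 3]}]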
- requests_data = [] - for i in range(num_requests): - prompt = prompts[i] if prompts is not None else None - token_ids = None if prompt_token_ids is None else prompt_token_ids[ - i] - - multi_modal_item = MultiModalData( - type=multi_modal_data.type, - data=multi_modal_data.data[i].unsqueeze(0), - ) if multi_modal_data else None - - requests_data.append({ - "prompt": - prompt, - "params": - params[i] if isinstance(params, list) else params, - "prompt_token_ids": - token_ids, - "lora_request": - lora_request, - "multi_modal_data": - multi_modal_item, - }) - - return requests_data + for i, request_inputs in enumerate(inputs): + self._add_request( + request_inputs, + params[i] if isinstance(params, Sequence) else params, + lora_request=lora_request, + ) def _add_request( self, - prompt: Optional[str], + inputs: PromptInputs, params: Union[SamplingParams, PoolingParams], - prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, - prompt, + inputs, params, - prompt_token_ids, - lora_request=lora_request, - multi_modal_data=multi_modal_data) + lora_request=lora_request) def _run_engine( - self, use_tqdm: bool + self, *, use_tqdm: bool ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # Initialize tqdm. if use_tqdm: @@ -364,5 +565,4 @@ def _run_engine( # Sort the outputs by request ID. # This is necessary because some requests may be finished earlier than # its previous requests. - outputs = sorted(outputs, key=lambda x: int(x.request_id)) - return outputs + return sorted(outputs, key=lambda x: int(x.request_id)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e179362eef8a..33daabd881df0 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -176,9 +176,15 @@ async def create_chat_completion( except ValueError as e: return self.create_error_response(str(e)) - result_generator = self.engine.generate(prompt_text, sampling_params, - request_id, prompt_ids, - lora_request) + result_generator = self.engine.generate( + { + "prompt": prompt_text, + "prompt_token_ids": prompt_ids + }, + sampling_params, + request_id, + lora_request, + ) # Streaming response if request.stream: return self.chat_completion_stream_generator( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 158d8ed7fbbf5..d1812c8f44f41 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -119,12 +119,17 @@ async def create_completion(self, request: CompletionRequest, truncate_prompt_tokens) prompt_ids, prompt_text = prompt_formats - generators.append( - self.engine.generate(prompt_text, - sampling_params, - f"{request_id}-{i}", - prompt_token_ids=prompt_ids, - lora_request=lora_request)) + generator = self.engine.generate( + { + "prompt": prompt_text, + "prompt_token_ids": prompt_ids + }, + sampling_params, + f"{request_id}-{i}", + lora_request=lora_request, + ) + + generators.append(generator) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 7a57be0c88915..5a3448de3d7a4 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ 
b/vllm/entrypoints/openai/serving_embedding.py @@ -1,5 +1,5 @@ import time -from typing import AsyncIterator, List, Tuple +from typing import AsyncIterator, List, Optional, Tuple from fastapi import Request @@ -100,11 +100,16 @@ async def create_embedding(self, request: EmbeddingRequest, prompt_ids, prompt_text = prompt_formats - generators.append( - self.engine.generate(prompt_text, - pooling_params, - f"{request_id}-{i}", - prompt_token_ids=prompt_ids)) + generator = self.engine.encode( + { + "prompt": prompt_text, + "prompt_token_ids": prompt_ids + }, + pooling_params, + f"{request_id}-{i}", + ) + + generators.append(generator) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -113,16 +118,21 @@ async def create_embedding(self, request: EmbeddingRequest, int, EmbeddingRequestOutput]] = merge_async_iterators(*generators) # Non-streaming response - final_res_batch: EmbeddingRequestOutput = [None] * len(prompts) - async for i, res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - await self.engine.abort(f"{request_id}-{i}") - # TODO: Use a vllm-specific Validation Error - return self.create_error_response("Client disconnected") - final_res_batch[i] = res - response = request_output_to_embedding_response( - final_res_batch, request_id, created_time, model_name) + final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch = [None] * len(prompts) + try: + async for i, res in result_generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. + await self.engine.abort(f"{request_id}-{i}") + # TODO: Use a vllm-specific Validation Error + return self.create_error_response("Client disconnected") + final_res_batch[i] = res + response = request_output_to_embedding_response( + final_res_batch, request_id, created_time, model_name) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) return response diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0df0223b9dbb2..708b0dad102c4 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -143,7 +143,8 @@ def create_streaming_error_response( return json_str async def _check_model( - self, request: Union[CompletionRequest, ChatCompletionRequest] + self, request: Union[CompletionRequest, ChatCompletionRequest, + EmbeddingRequest] ) -> Optional[ErrorResponse]: if request.model in self.served_model_names: return None @@ -155,7 +156,8 @@ async def _check_model( status_code=HTTPStatus.NOT_FOUND) def _maybe_get_lora( - self, request: Union[CompletionRequest, ChatCompletionRequest] + self, request: Union[CompletionRequest, ChatCompletionRequest, + EmbeddingRequest] ) -> Optional[LoRARequest]: if request.model in self.served_model_names: return None diff --git a/vllm/inputs.py b/vllm/inputs.py new file mode 100644 index 0000000000000..f5d99b1b66b70 --- /dev/null +++ b/vllm/inputs.py @@ -0,0 +1,130 @@ +from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, + TypedDict, Union, cast, overload) + +from typing_extensions import NotRequired + +if TYPE_CHECKING: + from vllm.sequence import MultiModalData + + +class ParsedText(TypedDict): + content: str + is_tokens: Literal[False] + + +class ParsedTokens(TypedDict): + content: List[int] + is_tokens: Literal[True] + + +# 
https://github.com/vllm-project/vllm/pull/4028 +@overload +def parse_and_batch_prompt( + prompt: Union[str, List[str]]) -> Sequence[ParsedText]: + ... + + +@overload +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: + ... + + +def parse_and_batch_prompt( + prompt: Union[str, List[str], List[int], List[List[int]]], +) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]: + if isinstance(prompt, str): + # case 1: a string + return [ParsedText(content=prompt, is_tokens=False)] + + if isinstance(prompt, list): + if len(prompt) == 0: + raise ValueError("please provide at least one prompt") + + if isinstance(prompt[0], str): + # case 2: array of strings + return [ + ParsedText(content=elem, is_tokens=False) + for elem in cast(List[str], prompt) + ] + if isinstance(prompt[0], int): + # case 3: array of tokens + elem = cast(List[int], prompt) + return [ParsedTokens(content=elem, is_tokens=True)] + if isinstance(prompt[0], list): + if len(prompt[0]) == 0: + raise ValueError("please provide at least one prompt") + + if isinstance(prompt[0][0], int): + # case 4: array of token arrays + return [ + ParsedTokens(content=elem, is_tokens=True) + for elem in cast(List[List[int]], prompt) + ] + + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") + + +class TextPrompt(TypedDict): + """Schema for a text prompt.""" + + prompt: str + """The input text to be tokenized before passing to the model.""" + + multi_modal_data: NotRequired["MultiModalData"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +class TokensPrompt(TypedDict): + """Schema for a tokenized prompt.""" + + prompt_token_ids: List[int] + """A list of token IDs to pass to the model.""" + + multi_modal_data: NotRequired["MultiModalData"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +class TextTokensPrompt(TypedDict): + """It is assumed that :attr:`prompt` is consistent with + :attr:`prompt_token_ids`. This is currently used in + :class:`AsyncLLMEngine` for logging both the text and token IDs.""" + + prompt: str + """The prompt text.""" + + prompt_token_ids: List[int] + """The token IDs of the prompt. If None, we use the + tokenizer to convert the prompts to token IDs.""" + + multi_modal_data: NotRequired["MultiModalData"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +PromptStrictInputs = Union[str, TextPrompt, TokensPrompt] +""" +The inputs to the LLM, which can take one of the following forms: + +- A text prompt (:class:`str` or :class:`TextPrompt`) +- A tokenized prompt (:class:`TokensPrompt`) +""" + +PromptInputs = Union[str, TextPrompt, TokensPrompt, TextTokensPrompt] +"""Same as :const:`PromptStrictInputs` but additionally accepts +:class:`TextTokensPrompt`.""" + + +class LLMInputs(TypedDict): + prompt_token_ids: List[int] + prompt: Optional[str] + multi_modal_data: Optional["MultiModalData"] diff --git a/vllm/outputs.py b/vllm/outputs.py index f9bce9e683f22..49f526b5f9300 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,4 +1,5 @@ import time +from dataclasses import dataclass from typing import List, Optional, Union from vllm.lora.request import LoRARequest @@ -6,6 +7,7 @@ SequenceGroup, SequenceStatus) +@dataclass class CompletionOutput: """The output data of one completion output of a request. 
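A quick illustration (editorial; the values are made up) of how `parse_and_batch_prompt` from the new vllm/inputs.py above normalizes the four accepted legacy prompt shapes:

from vllm.inputs import parse_and_batch_prompt

parse_and_batch_prompt("Hello")           # [{'content': 'Hello', 'is_tokens': False}]
parse_and_batch_prompt(["Hi", "there"])   # two ParsedText entries
parse_and_batch_prompt([1, 2, 3])         # [{'content': [1, 2, 3], 'is_tokens': True}]
parse_and_batch_prompt([[1, 2], [3, 4]])  # two ParsedTokens entries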
@@ -24,25 +26,14 @@ class CompletionOutput: lora_request: The LoRA request that was used to generate the output. """ - def __init__( - self, - index: int, - text: str, - token_ids: List[int], - cumulative_logprob: float, - logprobs: Optional[SampleLogprobs], - finish_reason: Optional[str] = None, - stop_reason: Union[int, str, None] = None, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.index = index - self.text = text - self.token_ids = token_ids - self.cumulative_logprob = cumulative_logprob - self.logprobs = logprobs - self.finish_reason = finish_reason - self.stop_reason = stop_reason - self.lora_request = lora_request + index: int + text: str + token_ids: List[int] + cumulative_logprob: float + logprobs: Optional[SampleLogprobs] + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + lora_request: Optional[LoRARequest] = None def finished(self) -> bool: return self.finish_reason is not None @@ -57,6 +48,7 @@ def __repr__(self) -> str: f"stop_reason={self.stop_reason})") +@dataclass class EmbeddingOutput: """The output data of one completion output of a request. @@ -65,15 +57,11 @@ class EmbeddingOutput: length of vector depends on the model as listed in the embedding guide. """ - def __init__( - self, - embedding: List[float], - ) -> None: - self.embedding = embedding + embedding: List[float] def __repr__(self) -> str: return (f"EmbeddingOutput(" - f"embedding={len(self.embedding)}") + f"embedding={len(self.embedding)})") class RequestOutput: @@ -93,7 +81,7 @@ class RequestOutput: def __init__( self, request_id: str, - prompt: str, + prompt: Optional[str], prompt_token_ids: List[int], prompt_logprobs: Optional[PromptLogprobs], outputs: List[CompletionOutput], @@ -183,7 +171,7 @@ class EmbeddingRequestOutput: finished (bool): A flag indicating whether the embedding is completed. """ - def __init__(self, request_id: str, outputs: 'EmbeddingOutput', + def __init__(self, request_id: str, outputs: "EmbeddingOutput", prompt_token_ids: List[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids diff --git a/vllm/sequence.py b/vllm/sequence.py index aa759448d82b1..f8e9da6c7965a 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union from vllm.block import LogicalTokenBlock +from vllm.inputs import LLMInputs from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -210,8 +211,7 @@ class Sequence: Args: seq_id: The ID of the sequence. - prompt: The prompt of the sequence. - prompt_token_ids: The token IDs of the prompt. + inputs: The inputs of the sequence. block_size: The block size of the sequence. Should be the same as the block size used by the block manager and cache engine. lora_request: LoRA request. 
@@ -220,25 +220,24 @@ class Sequence: def __init__( self, seq_id: int, - prompt: str, - prompt_token_ids: List[int], + inputs: LLMInputs, block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id - self.prompt = prompt + self.inputs = inputs self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request - self.data: SequenceData = SequenceData(prompt_token_ids) + self.data = SequenceData(self.prompt_token_ids) self.output_logprobs: SampleLogprobs = [] self.output_text = "" self.logical_token_blocks: List[LogicalTokenBlock] = [] # Initialize the logical token blocks with the prompt token ids. - self._append_tokens_to_blocks(prompt_token_ids) + self._append_tokens_to_blocks(self.prompt_token_ids) self.status = SequenceStatus.WAITING self.stop_reason: Union[int, str, None] = None @@ -248,6 +247,18 @@ def __init__( # Input + output tokens self.tokens: Optional[List[str]] = None + @property + def prompt(self) -> Optional[str]: + return self.inputs["prompt"] + + @property + def prompt_token_ids(self) -> List[int]: + return self.inputs["prompt_token_ids"] + + @property + def multi_modal_data(self) -> Optional["MultiModalData"]: + return self.inputs["multi_modal_data"] + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 @@ -415,7 +426,6 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - multi_modal_data: Multi modal data associated with the request. embeddings: The embeddings vectors of the prompt of the sequence group for an embedding model. pooling_params: The pooling parameters used to generate the pooling @@ -429,7 +439,6 @@ def __init__( arrival_time: float, sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, ) -> None: @@ -444,12 +453,11 @@ def __init__( self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() - self.multi_modal_data = multi_modal_data self.embeddings = embeddings self.pooling_params = pooling_params @property - def prompt(self) -> str: + def prompt(self) -> Optional[str]: # All sequences in the group should have the same prompt. # We use the prompt of an arbitrary sequence. return next(iter(self.seqs_dict.values())).prompt @@ -458,7 +466,13 @@ def prompt(self) -> str: def prompt_token_ids(self) -> List[int]: # All sequences in the group should have the same prompt. # We use the prompt of an arbitrary sequence. - return next(iter(self.seqs_dict.values())).data.prompt_token_ids + return next(iter(self.seqs_dict.values())).prompt_token_ids + + @property + def multi_modal_data(self) -> Optional[MultiModalData]: + # All sequences in the group should have the same multi-modal data. + # We use the multi-modal data of an arbitrary sequence. 
+ return next(iter(self.seqs_dict.values())).multi_modal_data @property def lora_int_id(self) -> int: diff --git a/vllm/utils.py b/vllm/utils.py index f4f027ce70e37..d2a9162ff7320 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -11,7 +11,7 @@ import uuid import warnings from collections import defaultdict -from functools import lru_cache, partial +from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, Hashable, List, Optional, OrderedDict, Tuple, TypeVar, @@ -658,3 +658,44 @@ def enable_trace_function_call_for_thread() -> None: filename) os.makedirs(os.path.dirname(log_path), exist_ok=True) enable_trace_function_call(log_path) + + +def identity(value: T) -> T: + return value + + +F = TypeVar('F', bound=Callable[..., Any]) + + +def deprecate_kwargs( + *kws: str, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None) -> Callable[[F], F]: + deprecated_kws = set(kws) + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_kwargs = kwargs.keys() & deprecated_kws + if deprecated_kwargs: + msg = ( + f"The keyword arguments {deprecated_kwargs} are " + "deprecated and will be removed in a future update.") + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper From 705789d18d5a8ef0d9e097d5342805023f2a123f Mon Sep 17 00:00:00 2001 From: Junichi Sato Date: Wed, 29 May 2024 09:15:35 +0900 Subject: [PATCH 046/154] [Bugfix] Remove the last EOS token unless explicitly specified (#5077) --- .../output_processor/test_stop_checker.py | 86 +++++++++++++++++++ vllm/engine/output_processor/stop_checker.py | 5 ++ 2 files changed, 91 insertions(+) create mode 100644 tests/engine/output_processor/test_stop_checker.py diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py new file mode 100644 index 0000000000000..ae54c83605e11 --- /dev/null +++ b/tests/engine/output_processor/test_stop_checker.py @@ -0,0 +1,86 @@ +from unittest.mock import MagicMock + +import pytest +from transformers import PreTrainedTokenizer + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.sampling_params import SamplingParams +from vllm.sequence import Logprob, Sequence, SequenceStatus + + +def sequence_with_eos(text: str, eos_token: str, + eos_token_id: int) -> Sequence: + """ + Create a Sequence that ends with an EOS token. 
+ """ + seq = Sequence( + seq_id=0, + prompt="", + prompt_token_ids=[], + block_size=16, + eos_token_id=eos_token_id, + ) + seq.output_text = text + eos_token + + offset = eos_token_id + 1 + for i in range(offset, len(text) + offset): + seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) + seq.append_token_id(token_id=eos_token_id, + logprobs={eos_token_id: Logprob(0.0)}) + + seq.status = SequenceStatus.RUNNING + + return seq + + +@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ + ("This text ends with EOS token", "", 2), +]) +@pytest.mark.parametrize("ignore_eos", [True, False, None]) +@pytest.mark.parametrize("include_stop_str_in_output", [True, False, None]) +@pytest.mark.skip_global_cleanup +def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, + ignore_eos: bool, include_stop_str_in_output: bool): + """ + Test the behavior of the StopChecker's maybe_stop_sequence method + when an EOS token is encountered. + + This test covers: + - When the EOS token should stop the sequence and be removed from the output + - When the EOS token should stop the sequence and be included in the output + - When the EOS token should be ignored, and the sequence continues + """ + + tokenizer = MagicMock(spec=PreTrainedTokenizer) + get_tokenizer_for_seq = MagicMock(return_value=tokenizer) + stop_checker = StopChecker(max_model_len=1024, + get_tokenizer_for_seq=get_tokenizer_for_seq) + + seq = sequence_with_eos( + text=text_wo_eos, + eos_token=eos_token, + eos_token_id=eos_token_id, + ) + new_char_count = len(eos_token) + + # Note that `stop` and `stop_token_ids` are not specified + sampling_params = SamplingParams( + min_tokens=1, + ignore_eos=ignore_eos, + include_stop_str_in_output=include_stop_str_in_output) + + stop_checker.maybe_stop_sequence( + seq=seq, + new_char_count=new_char_count, + sampling_params=sampling_params, + ) + + if ignore_eos: + assert seq.status == SequenceStatus.RUNNING + assert seq.output_text == text_wo_eos + eos_token + elif include_stop_str_in_output: + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.output_text == text_wo_eos + eos_token + else: + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.output_text == text_wo_eos diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 5fb11b32bad6d..96f0d1142611b 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -48,6 +48,11 @@ def maybe_stop_sequence( # Check if the sequence has generated the EOS token. 
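To make the user-visible effect of the fix below concrete (an editorial sketch; the behaviors mirror the test cases above): with default sampling parameters the EOS text no longer leaks into `output_text`, while the two existing knobs preserve the earlier behaviors.

from vllm import SamplingParams

params_default = SamplingParams()  # stop at EOS and strip it from the output text
params_keep = SamplingParams(include_stop_str_in_output=True)  # stop at EOS, keep its text
params_ignore = SamplingParams(ignore_eos=True)  # do not stop at EOS at all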
if ((not sampling_params.ignore_eos) and seq.get_last_token_id() == seq.eos_token_id): + # Remove the last EOS token unless explicitly specified + # This prevents unintended exposure of the EOS token + if new_char_count and ( + not sampling_params.include_stop_str_in_output): + seq.output_text = seq.output_text[:-new_char_count] seq.status = SequenceStatus.FINISHED_STOPPED return From 95c2a3d3c2fe65ed87003261b1e94a944269542e Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Tue, 28 May 2024 17:16:18 -0700 Subject: [PATCH 047/154] [Misc] add gpu_memory_utilization arg (#5079) Signed-off-by: pandyamarut --- benchmarks/benchmark_latency.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 3146fb33cc27e..f69d91a086a9f 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -35,7 +35,8 @@ def main(args: argparse.Namespace): use_v2_block_manager=args.use_v2_block_manager, enable_chunked_prefill=args.enable_chunked_prefill, download_dir=args.download_dir, - block_size=args.block_size) + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization) sampling_params = SamplingParams( n=args.n, @@ -214,5 +215,11 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, default=None, help='Path to save the latency results in JSON format.') + parser.add_argument('--gpu-memory-utilization', + type=float, + default=0.9, + help='the fraction of GPU memory to be used for ' + 'the model executor, which can range from 0 to 1.' + 'If unspecified, will use the default value of 0.9.') args = parser.parse_args() main(args) From 9175890928a64a7044a6a54e21ef1e312b096256 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 28 May 2024 22:13:52 -0700 Subject: [PATCH 048/154] [Core][Optimization] remove vllm-nccl (#5091) --- .buildkite/test-pipeline.yaml | 1 - requirements-cuda.txt | 1 - setup.py | 7 +-- tests/distributed/test_pynccl_library.py | 43 ------------------- .../device_communicators/pynccl_wrapper.py | 20 +++------ vllm/utils.py | 43 ++++--------------- vllm/worker/worker_base.py | 6 ++- 7 files changed, 21 insertions(+), 100 deletions(-) delete mode 100644 tests/distributed/test_pynccl_library.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 08e132d0c68bf..21cbd9ba13780 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -37,7 +37,6 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: - - pytest -v -s distributed/test_pynccl_library.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py diff --git a/requirements-cuda.txt b/requirements-cuda.txt index acb0164007dba..5109f17356178 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,6 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py # for pynvml package -vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library torch == 2.3.0 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 vllm-flash-attn == 2.5.8.post2 # Requires PyTorch 2.3.0 diff --git a/setup.py b/setup.py index dee4faafc364d..7814af2b9e1b7 100644 --- a/setup.py +++ b/setup.py @@ -383,11 +383,8 
@@ def _read_requirements(filename: str) -> List[str]: cuda_major, cuda_minor = torch.version.cuda.split(".") modified_requirements = [] for req in requirements: - if "vllm-nccl-cu12" in req: - req = req.replace("vllm-nccl-cu12", - f"vllm-nccl-cu{cuda_major}") - elif ("vllm-flash-attn" in req - and not (cuda_major == "12" and cuda_minor == "1")): + if ("vllm-flash-attn" in req + and not (cuda_major == "12" and cuda_minor == "1")): # vllm-flash-attn is built only for CUDA 12.1. # Skip for other versions. continue diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py deleted file mode 100644 index ec60a5ed3114d..0000000000000 --- a/tests/distributed/test_pynccl_library.py +++ /dev/null @@ -1,43 +0,0 @@ -import multiprocessing -import tempfile - - -def target_fn(env, filepath): - from vllm.utils import update_environment_variables - update_environment_variables(env) - from vllm.utils import nccl_integrity_check - nccl_integrity_check(filepath) - - -def test_library_file(): - # note: don't import vllm.distributed.device_communicators.pynccl - # before running this test, otherwise the library file will be loaded - # and it might interfere with the test - from vllm.utils import find_nccl_library - so_file = find_nccl_library() - with open(so_file, 'rb') as f: - content = f.read() - try: - # corrupt the library file, should raise an exception - with open(so_file, 'wb') as f: - f.write(content[:len(content) // 2]) - p = multiprocessing.Process(target=target_fn, args=({}, so_file)) - p.start() - p.join() - assert p.exitcode != 0 - - # move the library file to a tmp path - # test VLLM_NCCL_SO_PATH - fd, path = tempfile.mkstemp() - with open(path, 'wb') as f: - f.write(content) - p = multiprocessing.Process(target=target_fn, - args=({ - "VLLM_NCCL_SO_PATH": path - }, path)) - p.start() - p.join() - assert p.exitcode == 0 - finally: - with open(so_file, 'wb') as f: - f.write(content) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 3aa3744d0d827..50d6719fbfe62 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -28,7 +28,7 @@ from torch.distributed import ReduceOp from vllm.logger import init_logger -from vllm.utils import find_nccl_library, nccl_integrity_check +from vllm.utils import find_nccl_library logger = init_logger(__name__) @@ -188,28 +188,22 @@ def __init__(self, so_file: Optional[str] = None): so_file = so_file or find_nccl_library() try: - # load the library in another process. - # if it core dumps, it will not crash the current process - nccl_integrity_check(so_file) + if so_file not in NCCLLibrary.path_to_dict_mapping: + lib = ctypes.CDLL(so_file) + NCCLLibrary.path_to_library_cache[so_file] = lib + self.lib = NCCLLibrary.path_to_library_cache[so_file] except Exception as e: logger.error( "Failed to load NCCL library from %s ." "It is expected if you are not running on NVIDIA/AMD GPUs." "Otherwise, the nccl library might not exist, be corrupted " "or it does not support the current platform %s." - "One solution is to download libnccl2 version 2.18 from " - "https://developer.download.nvidia.com/compute/cuda/repos/ " - "and extract the libnccl.so.2 file. 
If you already have the " - "library, please set the environment variable VLLM_NCCL_SO_PATH" + "If you already have the library, please set the " + "environment variable VLLM_NCCL_SO_PATH" " to point to the correct nccl library path.", so_file, platform.platform()) raise e - if so_file not in NCCLLibrary.path_to_dict_mapping: - lib = ctypes.CDLL(so_file) - NCCLLibrary.path_to_library_cache[so_file] = lib - self.lib = NCCLLibrary.path_to_library_cache[so_file] - if so_file not in NCCLLibrary.path_to_dict_mapping: _funcs = {} for func in NCCLLibrary.exported_functions: diff --git a/vllm/utils.py b/vllm/utils.py index d2a9162ff7320..f6cd7262dba75 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2,7 +2,6 @@ import datetime import enum import gc -import glob import os import socket import subprocess @@ -565,28 +564,6 @@ def init_cached_hf_modules(): init_hf_modules() -def nccl_integrity_check(filepath): - """ - when the library is corrupted, we cannot catch - the exception in python. it will crash the process. - instead, we use the exit code of `ldd` to check - if the library is corrupted. if not, we will return - the version of the library. - """ - exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null") - if exit_code != 0: - raise RuntimeError(f"Failed to load NCCL library from {filepath} .") - import ctypes - - nccl = ctypes.CDLL(filepath) - version = ctypes.c_int() - nccl.ncclGetVersion.restype = ctypes.c_int - nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] - result = nccl.ncclGetVersion(ctypes.byref(version)) - assert result == 0 - return version.value - - @lru_cache(maxsize=None) def find_library(lib_name: str) -> str: """ @@ -616,17 +593,13 @@ def find_library(lib_name: str) -> str: def find_nccl_library(): + """ + We either use the library file specified by the `VLLM_NCCL_SO_PATH` + environment variable, or we find the library file brought by PyTorch. + After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be + found by `ctypes` automatically. + """ so_file = envs.VLLM_NCCL_SO_PATH - VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT - - # check if we have vllm-managed nccl - vllm_nccl_path = None - if torch.version.cuda is not None: - cuda_major = torch.version.cuda.split(".")[0] - path = os.path.expanduser( - f"{VLLM_CONFIG_ROOT}/vllm/nccl/cu{cuda_major}/libnccl.so.*") - files = glob.glob(path) - vllm_nccl_path = files[0] if files else None # manually load the nccl library if so_file: @@ -635,9 +608,9 @@ def find_nccl_library(): so_file) else: if torch.version.cuda is not None: - so_file = vllm_nccl_path or find_library("libnccl.so.2") + so_file = "libnccl.so.2" elif torch.version.hip is not None: - so_file = find_library("librccl.so.1") + so_file = "librccl.so.1" else: raise ValueError("NCCL only supports CUDA and ROCm backends.") logger.info("Found nccl from library %s", so_file) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index dbac1b5ba339b..258f31de17d87 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -121,12 +121,14 @@ def update_environment_variables(envs: Dict[str, str]) -> None: def init_worker(self, *args, **kwargs): """ - Actual initialization of the worker class, and set up - function tracing if required. + Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. 
""" enable_trace_function_call_for_thread() + # see https://github.com/NVIDIA/nccl/issues/1234 + os.environ['NCCL_CUMEM_ENABLE'] = '0' + mod = importlib.import_module(self.worker_module_name) worker_class = getattr(mod, self.worker_class_name) self.worker = worker_class(*args, **kwargs) From 420c4ffca2476ca7868268be96787c54c7630ee3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 29 May 2024 15:16:41 +0800 Subject: [PATCH 049/154] [Bugfix] Fix arguments passed to `Sequence` in stop checker test (#5092) --- tests/engine/output_processor/test_stop_checker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index ae54c83605e11..1d9c878ddde50 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -15,8 +15,11 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - prompt="", - prompt_token_ids=[], + inputs={ + "prompt": "", + "prompt_token_ids": [], + "multi_modal_data": None, + }, block_size=16, eos_token_id=eos_token_id, ) From 5bde5ba2c6b16b80b442a6c5e522bb33c0477d6b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 29 May 2024 04:29:07 -0700 Subject: [PATCH 050/154] [Core][Distributed] improve p2p access check (#4992) --- .../device_communicators/custom_all_reduce.py | 3 +- .../custom_all_reduce_utils.py | 186 ++++++++++++++++++ vllm/distributed/utils.py | 90 +-------- 3 files changed, 189 insertions(+), 90 deletions(-) create mode 100644 vllm/distributed/device_communicators/custom_all_reduce_utils.py diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 30ee9d1f8a1e9..a3902aecb3793 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -6,6 +6,8 @@ from torch.distributed import ProcessGroup import vllm.envs as envs +from vllm.distributed.device_communicators.custom_all_reduce_utils import ( + gpu_p2p_access_check) from vllm.distributed.parallel_state import ( get_local_rank, get_tensor_model_parallel_cpu_group) from vllm.logger import init_logger @@ -65,7 +67,6 @@ def _is_full_nvlink(device_ids: List[int]) -> bool: def _can_p2p(rank: int, world_size: int) -> bool: - from vllm.distributed.utils import gpu_p2p_access_check for i in range(world_size): if i == rank: continue diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py new file mode 100644 index 0000000000000..24ef3cb45b19d --- /dev/null +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -0,0 +1,186 @@ +import json +import os +import sys +import tempfile +import time +from contextlib import contextmanager +from typing import Callable, Dict, List, Optional + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +import vllm.envs as envs +from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@contextmanager +def mute_output(): + with open(os.devnull, "w") as f: + sys.stderr = f + sys.stdout = f + yield + + +def producer(i: int, + init_method: str, + cuda_visible_devices: Optional[str] = None): + if cuda_visible_devices is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + 
with mute_output(): + dist.init_process_group( + backend="gloo", + init_method=init_method, + world_size=2, + rank=0, + ) + # produce a tensor in GPU i + data = torch.zeros((128, ), device=f"cuda:{i}") + # get the information to reconstruct the shared tensor + func, args = torch.multiprocessing.reductions.reduce_tensor(data) + args = list(args) + dist.broadcast_object_list([(func, args)], src=0) + dist.barrier() + torch.cuda.synchronize() + assert torch.all(data == 1).item() + + +def consumer(j: int, + init_method: str, + cuda_visible_devices: Optional[str] = None): + if cuda_visible_devices is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + with mute_output(): + dist.init_process_group( + backend="gloo", + init_method=init_method, + world_size=2, + rank=1, + ) + torch.cuda.set_device(j) + recv = [None] + dist.broadcast_object_list(recv, src=0) + func: Callable + args: List + func, args = recv[0] # type: ignore + # `args[6]` is the device id + # by default pytorch will use `i` from the producer + # here we need to set it to `j` to test P2P access + args[6] = j + data = func(*args) + data += 1 + dist.barrier() + torch.cuda.synchronize() + assert torch.all(data == 1).item() + + +def can_actually_p2p(i, j): + """ + Usually, checking if P2P access is enabled can be done by + `torch.cuda.can_device_access_peer(i, j)`. However, sometimes + the driver might be broken, and `torch.cuda.can_device_access_peer(i, j)` + returns `True` even if P2P access is not actually possible. + See https://github.com/vllm-project/vllm/issues/2728 and + https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10 + Therefore, we have to perform a real P2P access to check if it is actually + possible. + + Note on p2p and cuda IPC: + Usually, one process uses one GPU: + GPU i --> cuda context i --> tensor i --> process i + + We need to combine p2p and cuda IPC, so that: + GPU i --> cuda context i --> tensor i --> process i + |shared| + GPU j --> cuda context j --> tensor j --> process j + That is to say, process i creates a tensor in GPU i, passes IPC handle to + process j, and process j accesses the tensor in GPU j. Any operation on the + tensor in process j will be reflected in the tensor in process i, because + they are the same memory segment. + It is important to note that process j accesses the tensor in GPU j, not + GPU i. That's why we need p2p access. # noqa + """ + cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + # pass the CUDA_VISIBLE_DEVICES to the child process + # to make sure they see the same set of GPUs + + # make sure the temp file is not the same across different calls + temp_path = tempfile.mktemp() + str(time.time()) + # create an empty file + with open(temp_path, "w"): + pass + init_method = f"file://{temp_path}" + + # make sure the processes are spawned + smp = mp.get_context("spawn") + pi = smp.Process(target=producer, + args=(i, init_method, cuda_visible_devices)) + pj = smp.Process(target=consumer, + args=(j, init_method, cuda_visible_devices)) + pi.start() + pj.start() + pi.join() + pj.join() + return pi.exitcode == 0 and pj.exitcode == 0 + + +# why do we need this cache? +# we are testing peer-to-peer (p2p) access between GPUs,across processes. +# if we test it every time, it will be very slow, because we need to create +# N * N * 2 processes, where N is the world size. This is very slow. +# to reduce the time, we use a cache file to store the p2p access status. 
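As a consumption sketch (editorial, not part of the diff): callers such as the custom all-reduce path only need the cached boolean, e.g.

from vllm.distributed.device_communicators.custom_all_reduce_utils import (
    gpu_p2p_access_check)

# Whether local GPU 0 can directly access local GPU 1. The first call on the
# local master process runs the producer/consumer check above and writes the
# JSON cache; later calls (and other ranks) simply read the cached result.
use_p2p = gpu_p2p_access_check(0, 1)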
+# the cache file is generated by the master process if it does not exist. +# then all the processes can read the cache file to check the p2p access status. +# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we +# can have different cache files for different CUDA_VISIBLE_DEVICES settings, +# e.g. used by different vllm engines. The device id in the cache file is a +# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number +# of visible devices in the vllm engine. +_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None + + +def gpu_p2p_access_check(i: int, j: int) -> bool: + """Check if GPU i can access GPU j.""" + + # if the cache variable is already calculated, + # read from the cache instead of checking it again + global _gpu_p2p_access_cache + if _gpu_p2p_access_cache is not None: + return _gpu_p2p_access_cache[f"{i}->{j}"] + + is_distributed = dist.is_initialized() + + num_dev = torch.cuda.device_count() + cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES + if cuda_visible_devices is None: + cuda_visible_devices = ",".join(str(i) for i in range(num_dev)) + VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT + path = os.path.expanduser( + f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json" + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + if ((not is_distributed or get_local_rank() == 0) + and (not os.path.exists(path))): + # only the local master process (with local_rank == 0) can + # enter this block to calculate the cache + logger.info("generating GPU P2P access cache for in %s", path) + cache = {} + for _i in range(num_dev): + for _j in range(num_dev): + cache[f"{_i}->{_j}"] = can_actually_p2p(_i, _j) + with open(path, "w") as f: + json.dump(cache, f, indent=4) + if is_distributed: + cpu_world_group = get_cpu_world_group() + dist.barrier(cpu_world_group) + logger.info("reading GPU P2P access cache from %s", path) + with open(path, "r") as f: + cache = json.load(f) + _gpu_p2p_access_cache = cache + return _gpu_p2p_access_cache[f"{i}->{j}"] + + +__all__ = ["gpu_p2p_access_check"] diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 1965d4c1d3cbc..0cd420c8e11b5 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -2,19 +2,9 @@ # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -import json -import os -from typing import Dict, Optional, Sequence +from typing import Sequence import torch -import torch.distributed as dist - -import vllm.envs as envs -from vllm.logger import init_logger - -from .parallel_state import get_cpu_world_group, get_local_rank - -logger = init_logger(__name__) def ensure_divisibility(numerator, denominator): @@ -56,81 +46,3 @@ def split_tensor_along_last_dim( return tuple(chunk.contiguous() for chunk in tensor_list) return tensor_list - - -# code partly borrowed from -# https://github.com/turboderp/exllamav2/blob/1c67f97f3d2a968605a9c31ab791a05c85bb7879/exllamav2/compat.py#L10 -# License: MIT -def _can_actually_p2p(idx_a, idx_b): - dev_i = f"cuda:{idx_a}" - dev_j = f"cuda:{idx_b}" - a = torch.randn(5, device=dev_i) + 123.0 - b = a.to(dev_j) - c = b.to(dev_i) - return torch.all(a == c).cpu().item() - - -# why do we need this cache? -# 1. we can have runtime checks for P2P access, where every process checks -# P2P access to all other GPUs. 
Unfortunately, the test might cost many -# (world_size * world_size) cuda context, and reduce the memory available -# for the model. see https://github.com/vllm-project/vllm/issues/3821 -# 2. alternatively, we can have a p2p map that is generated by the master -# process and broadcasted to all other processes. This still requires -# #world_size of cuda context, belonging to the master process, on each GPU. -# 3. we can have a cache file, that records the p2p access status. The first -# time the master process checks the p2p access, it will generate the cache -# file, at the cost of #world_size of cuda context. Later on, all processes -# can read the cache file to check the p2p access status without any cost of -# additional cuda context. -# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we -# can have different cache files for different CUDA_VISIBLE_DEVICES settings, -# e.g. used by different vllm engines. The device id in the cache file is a -# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number -# of visible devices in the vllm engine. -_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None - - -def gpu_p2p_access_check(i: int, j: int) -> bool: - """Check if GPU i can access GPU j.""" - - # if the cache variable is already calculated, - # read from the cache instead of checking it again - global _gpu_p2p_access_cache - if _gpu_p2p_access_cache is not None: - return _gpu_p2p_access_cache[f"{i}->{j}"] - - is_distributed = dist.is_initialized() - - num_dev = torch.cuda.device_count() - cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES - if cuda_visible_devices is None: - cuda_visible_devices = ",".join(str(i) for i in range(num_dev)) - VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT - path = os.path.expanduser( - f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json" - ) - os.makedirs(os.path.dirname(path), exist_ok=True) - if (not is_distributed or get_local_rank() == 0) \ - and (not os.path.exists(path)): - # only the local master process (with local_rank == 0) can - # enter this block to calculate the cache - logger.info("generating GPU P2P access cache for in %s", path) - cache = {} - for _i in range(num_dev): - for _j in range(num_dev): - # on some platforms, P2P support might be buggy and we need - # additional checks. 
See also: - # https://github.com/vllm-project/vllm/issues/2728 - cache[f"{_i}->{_j}"] = torch.cuda.can_device_access_peer( - _i, _j) and _can_actually_p2p(_i, _j) - with open(path, "w") as f: - json.dump(cache, f, indent=4) - if is_distributed: - cpu_world_group = get_cpu_world_group() - dist.barrier(cpu_world_group) - logger.info("reading GPU P2P access cache from %s", path) - with open(path, "r") as f: - cache = json.load(f) - _gpu_p2p_access_cache = cache - return _gpu_p2p_access_cache[f"{i}->{j}"] From b86aa897422ad06a59a18b32617b2fbd070c9ae9 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Wed, 29 May 2024 12:09:13 -0400 Subject: [PATCH 051/154] [Core] Cross-attention KV caching and memory-management (towards eventual encoder/decoder model support) (#4837) --- tests/core/block/test_block_manager_v2.py | 154 ++++++++++++++- tests/core/test_block_manager.py | 220 +++++++++++++++++++++- tests/core/utils.py | 99 +++++++++- vllm/core/block/utils.py | 56 ++++++ vllm/core/block_manager_v1.py | 187 ++++++++++++------ vllm/core/block_manager_v2.py | 65 ++++++- vllm/sequence.py | 23 +++ 7 files changed, 735 insertions(+), 69 deletions(-) create mode 100644 vllm/core/block/utils.py diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 91b047f0e183e..f98fc0e217278 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,11 +1,13 @@ import pytest +from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager_v2 import BlockSpaceManagerV2 from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import create_seq_group +from ..utils import create_seq_group, create_seq_group_encoder_decoder @pytest.mark.parametrize("block_size", [16]) @@ -52,6 +54,156 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, assert can_allocate_result == AllocStatus.LATER +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) +@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_can_allocate_seq_group_encoder_decoder(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=1024, + watermark=watermark, + ) + num_watermark_blocks = int(watermark * num_gpu_blocks) + + num_output_blocks_per_seq = 1 + + # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but + # the current implementation assumes all seqs are new prompts / don't have + # different output lens. 
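A worked example of the accounting `test_can_allocate_seq_group_encoder_decoder` exercises (illustrative numbers, editorial note): with block_size=16, a 32-token encoder/decoder prompt, and one output block per sequence, the v2 manager must reserve decoder-prompt, decoder-output, and cross-attention KV blocks before the watermark is applied.

block_size = 16
num_prompt_blocks = 32 // block_size   # decoder prompt KV -> 2 blocks
num_output_blocks = 1                  # decoded tokens    -> 1 block
num_cross_blocks = num_prompt_blocks   # encoder (cross-attention) KV -> 2 blocks
num_required_blocks = num_prompt_blocks + num_output_blocks + num_cross_blocks  # = 5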
+ num_output_blocks = num_output_blocks_per_seq + + for bdx, num_prompt_blocks in enumerate( + range(1, num_gpu_blocks - num_output_blocks)): + num_cross_blocks_per_seq = num_prompt_blocks + + seq_group = create_seq_group_encoder_decoder( + seq_prompt_len=block_size * num_prompt_blocks, + seq_output_lens=[ + block_size * num_output_blocks_per_seq + for _ in range(num_seqs_per_group) + ], + request_id=str(bdx)) + + assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks + + can_allocate_result = block_manager.can_allocate(seq_group) + + num_required_blocks = num_prompt_blocks + \ + num_output_blocks + \ + num_cross_blocks_per_seq + + if num_gpu_blocks - num_required_blocks < num_watermark_blocks: + assert can_allocate_result == AllocStatus.NEVER + elif num_gpu_blocks >= num_required_blocks: + assert can_allocate_result == AllocStatus.OK + else: + assert can_allocate_result == AllocStatus.LATER + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16]) +@pytest.mark.parametrize("num_seqs_per_group", [1]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): + ''' + SWA short for Sliding Window Attention. + + At time of writing block manager v2 does not support SWA. + + However even when SWA is implemented for block manager v2, + there will still most likely be a separate workstream required + to enable SWA for encoder/decoder models. + + Therefore this test enforces that one of the following cases + hold true: + 1. Block manager v2 does not support SWA at all (true at time of writing) + 2. Block manager v2 fails with NotImplementError when SWA is enabled + AND a SequenceGroup with an encoder sequence (i.e. in support of an + encoder/decoder model) is passed into can_allocate() as an argument + + The setup for this test is stripped down version of + test_can_allocate_seq_group_encoder_decoder() + ''' + + with pytest.raises((NotImplementedError, AssertionError)) as exc_info: + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=1024, + watermark=watermark, + sliding_window=5 # SWA + ) + + num_output_blocks_per_seq = 1 + num_prompt_blocks = 1 + num_output_blocks = num_output_blocks_per_seq + seq_group = create_seq_group_encoder_decoder( + seq_prompt_len=block_size * num_prompt_blocks, + seq_output_lens=[ + block_size * num_output_blocks_per_seq + for _ in range(num_seqs_per_group) + ], + request_id="0") + + assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks + block_manager.can_allocate(seq_group) + + # Assert that either + # 1. Block manager v2 constructor fails with assertion that sliding window + # is not yet supported (most likely near-term outcome at time of + # writing), or + # 2. 
can_allocate() fails with NotImplementedError due to combination of + # encoder/decoder and sliding window attention + if isinstance(exc_info.value, NotImplementedError): + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA + elif isinstance(exc_info.value, AssertionError): + assert str(exc_info.value) == "Sliding window not yet supported" + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16]) +@pytest.mark.parametrize("num_seqs_per_group", [1]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_can_allocate_encoder_decoder_fails_with_prefix_cache( + block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, + watermark: float): + + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=1024, + watermark=watermark, + enable_caching=True # Prefix cache + ) + + num_output_blocks_per_seq = 1 + num_prompt_blocks = 1 + num_output_blocks = num_output_blocks_per_seq + seq_group = create_seq_group_encoder_decoder( + seq_prompt_len=block_size * num_prompt_blocks, + seq_output_lens=[ + block_size * num_output_blocks_per_seq + for _ in range(num_seqs_per_group) + ], + request_id="0") + + assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks + + # Assert that either can_allocate() fails with NotImplementedError + # due to combination of encoder/decoder and prefix cache + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE + + @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 88cd4f98091f9..ddd843174f7b1 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -6,13 +6,15 @@ from vllm import SamplingParams from vllm.block import PhysicalTokenBlock +from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, UncachedBlockAllocator) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -from .utils import create_dummy_prompt +from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder def test_block_allocator_allocate(): @@ -73,7 +75,7 @@ def test_allocate(): # Allocate same sequence group to all available gpu blocks. 
for i in range(num_gpu_blocks): _, seq_group = create_dummy_prompt(str(i), block_size) - assert block_manager.can_allocate(seq_group) + assert block_manager.can_allocate(seq_group) == AllocStatus.OK block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK @@ -85,11 +87,107 @@ def test_allocate(): watermark=1 / num_gpu_blocks) for i in range(num_gpu_blocks - 1): _, seq_group = create_dummy_prompt(str(i), block_size) - assert block_manager.can_allocate(seq_group) + assert block_manager.can_allocate(seq_group) == AllocStatus.OK block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK +def test_allocate_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + assert block_manager.can_allocate(seq_group) == AllocStatus.OK + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + assert block_manager.can_allocate(seq_group) == AllocStatus.OK + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + +def test_allocate_encoder_decoder_fails_with_swa(): + # SWA short for sliding window attention + + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + sliding_window=5) # swa + + # Allocate same sequence group to all available gpu blocks. + _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA + + # Assert that allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA + + +def test_allocate_encoder_decoder_fails_with_prefix_caching(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=True) # Prefix cache + + # Allocate same sequence group to all available gpu blocks. 
+ _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE + + # Assert that allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE + + def test_append_slot_single_seq(): block_size = 4 num_cpu_blocks = 4 @@ -244,6 +342,62 @@ def test_swap(): assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) +def test_swap_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + decoder_prompt.status = SequenceStatus.WAITING + encoder_prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + decoder_prompt.status = SequenceStatus.RUNNING + decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap encoder/decoder seq group from GPU -> CPU. + decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) + cross_gpu_blocks = block_manager.get_cross_block_table(seq_group) + gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert [x[0] for x in mapping] == gpu_blocks + #assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + decoder_prompt.status = SequenceStatus.SWAPPED + + # Swap encoder/decoder seq group from CPU -> GPU. 
+ decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) + cross_cpu_blocks = block_manager.get_cross_block_table(seq_group) + cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks + assert block_manager.can_swap_in(seq_group) == AllocStatus.OK + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert [x[0] for x in mapping] == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + def test_free(): block_size = 4 num_cpu_blocks = 4 @@ -268,6 +422,41 @@ def test_free(): block_manager.get_block_table(prompt) +def test_free_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + block_manager.allocate(seq_group) + + # Free allocated seq. + decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) + encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group)) + prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(decoder_prompt) + block_manager.free_cross(seq_group) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed encoder & decoder seq's are deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(decoder_prompt) + + # Block table for freed encoder & decoder seq's are deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(encoder_prompt) + + def test_reset(): block_size = 4 num_cpu_blocks = 4 @@ -289,6 +478,31 @@ def test_reset(): assert block_manager.get_num_free_gpu_blocks() == original_blocks +def test_reset_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. + original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + f"{i}", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. 
+ block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks + + def test_sliding_window_multi_seq(): """ Tests that memory allocation and deallocation is handled diff --git a/tests/core/utils.py b/tests/core/utils.py index 1c5724090b69b..cd2045b8a1889 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -39,6 +39,52 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_prompt_encoder_decoder( + request_id: str, + decoder_prompt_length: int, + encoder_prompt_length: int, + block_size: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, + use_beam_search: bool = False, + best_of: int = 1, +) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = decoder_prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + decoder_prompt_tokens = list(range(decoder_prompt_length)) + decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) + + decoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": decoder_prompt_str, + "prompt_token_ids": decoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) + + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) + encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) + encoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": encoder_prompt_str, + "prompt_token_ids": encoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, + seqs=[decoder_prompt], + sampling_params=SamplingParams( + use_beam_search=use_beam_search, + best_of=best_of), + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) + + return decoder_prompt, encoder_prompt, seq_group + + def create_seq_group( seq_prompt_len: int = 1024, seq_output_lens: Iterable[int] = (128, ), @@ -82,5 +128,56 @@ def create_seq_group( return seq_group +def create_seq_group_encoder_decoder( + seq_prompt_len: int = 1024, + seq_output_lens: Iterable[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: + + assert len(seq_output_lens) > 0 + + if sampling_params is None: + sampling_params = SamplingParams() + + prompt_token_ids = [0] * seq_prompt_len + + seqs = [] + for seq_id_offset, output_len in enumerate(seq_output_lens): + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, + block_size=16, + ) + + for i in range(output_len): + seq.append_token_id( + token_id=i, + logprobs={i: Logprob(0.0)}, + ) + seqs.append(seq) + + # Encoder sequence + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, + block_size=16, + ) + + return SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) + + def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size + return (seq_len + block_size - 1) // block_size \ No newline at end of file diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py new file mode 100644 index 0000000000000..2c412a8f472e0 --- /dev/null +++ b/vllm/core/block/utils.py @@ -0,0 +1,56 @@ +"""Block manager utils.""" +from vllm.sequence import 
SequenceGroup + +# Exception strings for non-implemented block manager enc/dec scenarios + +STR_NOT_IMPL_ENC_DEC_SWA = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." + + +def _get_block_mgr_sliding_window_attr(block_mgr): + ''' + BlockManagerV1 and BlockManagerV2 have slightly different + members related to sliding window attention (SWA). This + function extracts the appropriate member to use for determining + whether SWA is enabled. + + Arguments: + + * block_mgr: BlockManagerV1 or BlockManagerV2 instance + ''' + + if hasattr(block_mgr, 'block_sliding_window'): + return block_mgr.block_sliding_window + if hasattr(block_mgr, 'max_block_sliding_window'): + return block_mgr.max_block_sliding_window + + raise AttributeError("Block manager instance has neither " + \ + "block_sliding_window nor " + \ + "max_block_sliding_window attributes.") + + +def check_no_caching_or_swa_for_blockmgr_encdec( + block_mgr, seq_group: SequenceGroup) -> None: + ''' + Enforce that prefix caching & sliding-window attention (SWA) + are currently unsupported *specifically* for encoder/decoder models. + + Raises NotImplementedError if unsupported scenario is detected. + + Arguments: + + * block_mgr: BlockSpaceManager instance + * seq_group: SequenceGroup passed to block_mgr + ''' + + if seq_group.is_encoder_decoder(): + if _get_block_mgr_sliding_window_attr(block_mgr) is not None: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) + + if block_mgr.enable_caching: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 52a170d79e4e7..201cba309f6ef 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -8,6 +8,7 @@ from typing import Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -255,14 +256,30 @@ def __init__( Device.CPU, block_size, num_cpu_blocks) # Mapping: seq_id -> BlockTable. self.block_tables: Dict[int, BlockTable] = {} + # Mapping: req_id -> BlockTable + # Note that each SequenceGroup has a unique + # request ID + self.cross_block_tables: Dict[str, BlockTable] = {} + + def _get_seq_num_required_blocks(self, seq: Sequence) -> int: + return 0 if seq is None \ + else len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
- seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = len(seq.logical_token_blocks) + + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) + + self_num_required_blocks = self._get_seq_num_required_blocks( + seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + cross_num_required_blocks = self._get_seq_num_required_blocks( + seq_group.get_encoder_seq()) + num_required_blocks = self_num_required_blocks + \ + cross_num_required_blocks if self.block_sliding_window is not None: + num_required_blocks = min(num_required_blocks, self.block_sliding_window) num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() @@ -276,11 +293,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate(self, seq_group: SequenceGroup) -> None: - # NOTE: Here we assume that all sequences in the group have the same - # prompt. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - + def _allocate_sequence(self, \ + seq: Sequence, \ + ref_count: int, \ + is_encoder_decoder: bool = True) -> BlockTable: # Allocate new physical token blocks that will store the prompt tokens. num_prompt_blocks = len(seq.logical_token_blocks) @@ -290,21 +306,46 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() - elif self.enable_caching: + block.ref_count = ref_count + elif not is_encoder_decoder and self.enable_caching: block = self.gpu_allocator.allocate( seq.hash_of_block(logical_idx), seq.num_hashed_tokens_of_block(logical_idx)) else: block = self.gpu_allocator.allocate() # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block.ref_count = ref_count block_table.append(block) - # Assign the block table for each sequence. + return block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + is_encoder_decoder = seq_group.is_encoder_decoder() + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) + + # Allocate decoder sequences + # + # NOTE: Here we assume that all sequences in the group have the same + # decoder prompt. + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] + block_table: BlockTable = \ + self._allocate_sequence(seq, + seq_group.num_seqs(), + is_encoder_decoder) + + # Assign the self-attention block tables for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() + # Allocate encoder sequence + if is_encoder_decoder: + # A SequenceGroup has only a single encoder sequence (at most), + # thus allocate with a ref count of 1 + block_table = self._allocate_sequence(seq_group.get_encoder_seq(), + 1, is_encoder_decoder) + # Assign the cross-attention block table for the SequenceGroup. + self.cross_block_tables[seq_group.request_id] = block_table + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> bool: @@ -443,13 +484,18 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: + # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. 
+ request_id = seq_group.request_id blocks: Set[PhysicalTokenBlock] = set() for seq in seq_group.get_seqs(): if seq.is_finished(): continue blocks.update(self.block_tables[seq.seq_id]) + # Cross-attention blocks + if seq_group.is_encoder_decoder(): + blocks.update(self.cross_block_tables[request_id]) return list(blocks) def can_swap_in(self, @@ -457,8 +503,11 @@ def can_swap_in(self, num_lookahead_slots: int = 0) -> AllocStatus: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) + if seq_group.is_encoder_decoder(): + num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate # at least one free block right after the swap-in. @@ -471,70 +520,81 @@ def can_swap_in(self, else: return AllocStatus.LATER + def _swap_block_table( + self, block_table: BlockTable, src_allocator: BlockAllocatorBase, + dest_allocator: BlockAllocatorBase, + mapping: Dict[PhysicalTokenBlock, + PhysicalTokenBlock]) -> BlockTable: + new_block_table = [] + + for from_block in block_table: + if from_block in mapping: + to_block = mapping[from_block] + to_block.ref_count += 1 + else: + to_block = dest_allocator.allocate( + from_block.block_hash, from_block.num_hashed_tokens) + mapping[from_block] = to_block + new_block_table.append(to_block) + # Free the source block swapped in to destination. + src_allocator.free(from_block) + + return new_block_table + def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> List[Tuple[int, int]]: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + request_id = seq_group.request_id + # CPU block -> GPU block. # dict is efficient in lookup `if cpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table - - block_number_mapping = { - cpu_block.block_number: gpu_block.block_number - for cpu_block, gpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + self.block_tables[seq.seq_id] = \ + self._swap_block_table(self.block_tables[seq.seq_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) + + if seq_group.is_encoder_decoder(): + self.cross_block_tables[request_id] = \ + self._swap_block_table(self.cross_block_tables[request_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) + + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + request_id = seq_group.request_id + # GPU block -> CPU block. 
# dict is efficient in lookup `if gpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - - for gpu_block in block_table: - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - cpu_block.ref_count += 1 - else: - cpu_block = self.cpu_allocator.allocate( - gpu_block.block_hash, gpu_block.num_hashed_tokens) - mapping[gpu_block] = cpu_block - new_block_table.append(cpu_block) - # Free the GPU block swapped out to CPU. - self.gpu_allocator.free(gpu_block) - self.block_tables[seq.seq_id] = new_block_table - - block_number_mapping = { - gpu_block.block_number: cpu_block.block_number - for gpu_block, cpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + self.block_tables[seq.seq_id] = \ + self._swap_block_table(self.block_tables[seq.seq_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) + + if seq_group.is_encoder_decoder(): + self.cross_block_tables[request_id] = \ + self._swap_block_table(self.cross_block_tables[request_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) + + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def _free_block_table(self, block_table: BlockTable) -> None: # when using a sliding window, each seq will only use up @@ -559,15 +619,32 @@ def free(self, seq: Sequence) -> None: self._free_block_table(block_table) del self.block_tables[seq.seq_id] + def free_cross(self, seq_group: SequenceGroup) -> None: + if seq_group.request_id not in self.cross_block_tables: + # Already freed or hasn't ben scheduled yet. + return + block_table = self.cross_block_tables[seq_group.request_id] + self._free_block_table(block_table) + del self.cross_block_tables[seq_group.request_id] + def reset(self) -> None: + # Free decoder block tables for block_table in self.block_tables.values(): self._free_block_table(block_table) self.block_tables.clear() + # Free cross-attention block tables + for block_table in self.cross_block_tables.values(): + self._free_block_table(block_table) + self.cross_block_tables.clear() def get_block_table(self, seq: Sequence) -> List[int]: block_table = self.block_tables[seq.seq_id] return [block.block_number for block in block_table] + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: + block_table = self.cross_block_tables[seq_group.request_id] + return [block.block_number for block in block_table] + def get_num_free_gpu_blocks(self) -> int: return self.gpu_allocator.get_num_free_blocks() diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 834436c25e160..cad42ab3c1ba2 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -5,11 +5,13 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device SeqId = int +EncoderSeqId = str class BlockSpaceManagerV2(BlockSpaceManager): @@ -94,17 +96,26 @@ def __init__( ) self.block_tables: Dict[SeqId, BlockTable] = {} + self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # 
FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) + + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, ) + if seq_group.is_encoder_decoder(): + num_required_blocks += BlockTable.get_num_required_blocks( + seq_group.get_encoder_seq().get_token_ids(), + block_size=self.block_size, + ) + if self.max_block_sliding_window is not None: num_required_blocks = min(num_required_blocks, self.max_block_sliding_window) @@ -121,7 +132,19 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER + def _allocate_sequence(self, seq: Sequence) -> BlockTable: + block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + max_block_sliding_window=self.max_block_sliding_window, + ) + block_table.allocate(seq.get_token_ids()) + + return block_table + def allocate(self, seq_group: SequenceGroup) -> None: + + # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -129,20 +152,29 @@ def allocate(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # prompt. seq = waiting_seqs[0] - - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - max_block_sliding_window=self.max_block_sliding_window, - ) - - block_table.allocate(seq.get_token_ids()) + block_table: BlockTable = self._allocate_sequence(seq) self.block_tables[seq.seq_id] = block_table # Assign the block table for each sequence. for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() + # Allocate cross-attention block table for encoder sequence + # + # NOTE: Here we assume that all sequences in the group have the same + # encoder prompt. + request_id = seq_group.request_id + + assert (request_id + not in self.cross_block_tables), \ + "block table already exists" + + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) + + if seq_group.is_encoder_decoder(): + block_table = self._allocate_sequence(seq_group.get_encoder_seq()) + self.cross_block_tables[request_id] = block_table + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: """Determine if there is enough space in the GPU KV cache to continue @@ -197,12 +229,27 @@ def free(self, seq: Sequence) -> None: self.block_tables[seq.seq_id].free() del self.block_tables[seq.seq_id] + def free_cross(self, seq_group: SequenceGroup) -> None: + request_id = seq_group.request_id + if request_id not in self.cross_block_tables: + # Already freed or hasn't been scheduled yet. 
+ return + self.cross_block_tables[request_id].free() + del self.cross_block_tables[request_id] + def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids assert all(b is not None for b in block_ids) return block_ids # type: ignore + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: + request_id = seq_group.request_id + assert request_id in self.cross_block_tables + block_ids = self.cross_block_tables[request_id].physical_block_ids + assert all(b is not None for b in block_ids) + return block_ids # type: ignore + def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed # in this step. diff --git a/vllm/sequence.py b/vllm/sequence.py index f8e9da6c7965a..ee8c94bbf06f7 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -430,6 +430,8 @@ class SequenceGroup: for an embedding model. pooling_params: The pooling parameters used to generate the pooling for an embedding model. + encoder_seq: Optional, the single encoder sequence. Should be None + unless you are working with an encoder/decoder model. """ def __init__( @@ -441,6 +443,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, + encoder_seq: Optional[Sequence] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -455,6 +458,7 @@ def __init__( self.state = SequenceGroupState() self.embeddings = embeddings self.pooling_params = pooling_params + self.encoder_seq = encoder_seq @property def prompt(self) -> Optional[str]: @@ -538,6 +542,12 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] + def is_encoder_decoder(self) -> bool: + return self.encoder_seq is not None + + def get_encoder_seq(self) -> Optional[Sequence]: + return self.encoder_seq + def get_unfinished_seqs(self) -> List[Sequence]: return [ seq for seq in self.seqs_dict.values() if not seq.is_finished() @@ -621,6 +631,15 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. + encoder_seq_data: Optional sequence data for encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. + cross_block_table: Optional cross-attention block table associated + with the encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. 
""" def __init__( @@ -637,6 +656,8 @@ def __init__( computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, + encoder_seq_data: Optional[SequenceData] = None, + cross_block_table: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -648,6 +669,8 @@ def __init__( self.computed_block_nums = computed_block_nums self.multi_modal_data = multi_modal_data self.state = SequenceGroupState() if state is None else state + self.encoder_seq_data = encoder_seq_data + self.cross_block_table = cross_block_table self._token_chunk_size = token_chunk_size self.do_sample = do_sample From f63e8ddc8dad33faa551f5593c8a5e934d789674 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Thu, 30 May 2024 01:26:33 +0300 Subject: [PATCH 052/154] [Doc]Replace deprecated flag in readme (#4526) --- examples/production_monitoring/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md index 78feed5984a43..ead44709d3b73 100644 --- a/examples/production_monitoring/README.md +++ b/examples/production_monitoring/README.md @@ -30,7 +30,8 @@ python3 ../../benchmarks/benchmark_serving.py \ --tokenizer mistralai/Mistral-7B-v0.1 \ --backend openai \ --endpoint /v1/completions \ - --dataset ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset-name sharegpt \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ --request-rate 3.0 ``` From 62a4fcbc1ef583e3d5e8e233335b62f7250efe32 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 07:02:25 +0800 Subject: [PATCH 053/154] [Bugfix][CI/Build] Fix test and improve code for `merge_async_iterators` (#5096) --- .../test_merge_async_iterators.py | 46 --------------- tests/test_utils.py | 57 ++++++++++++++++++- vllm/utils.py | 9 ++- 3 files changed, 62 insertions(+), 50 deletions(-) delete mode 100644 tests/async_engine/test_merge_async_iterators.py diff --git a/tests/async_engine/test_merge_async_iterators.py b/tests/async_engine/test_merge_async_iterators.py deleted file mode 100644 index 22dcfc44cd25d..0000000000000 --- a/tests/async_engine/test_merge_async_iterators.py +++ /dev/null @@ -1,46 +0,0 @@ -import asyncio -# UPSTREAM SYNC -import sys -from typing import AsyncIterator, Tuple - -import pytest - -from vllm.utils import merge_async_iterators - - -# UPSTREAM SYNC -@pytest.mark.skipif(sys.version_info < (3, 10), - reason="`anext` requires Python 3.10") -@pytest.mark.asyncio -async def test_merge_async_iterators(): - - async def mock_async_iterator(idx: int) -> AsyncIterator[str]: - try: - while True: - yield f"item from iterator {idx}" - await asyncio.sleep(0.1) - except asyncio.CancelledError: - pass - - iterators = [mock_async_iterator(i) for i in range(3)] - merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators( - *iterators) - - async def stream_output(generator: AsyncIterator[Tuple[int, str]]): - async for idx, output in generator: - print(f"idx: {idx}, output: {output}") - - task = asyncio.create_task(stream_output(merged_iterator)) - await asyncio.sleep(0.5) - task.cancel() - with pytest.raises(asyncio.CancelledError): - await task - - for iterator in iterators: - try: - await asyncio.wait_for(anext(iterator), 1) - except StopAsyncIteration: - # All iterators should be cancelled and print this message. 
- print("Iterator was cancelled normally") - except (Exception, asyncio.CancelledError) as e: - raise AssertionError() from e diff --git a/tests/test_utils.py b/tests/test_utils.py index 54dc5c6f5bfba..a6c3896fa43bf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,64 @@ +import asyncio +import sys +from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol, + Tuple, TypeVar) + import pytest -from vllm.utils import deprecate_kwargs +from vllm.utils import deprecate_kwargs, merge_async_iterators from .utils import error_on_warning +if sys.version_info < (3, 10): + if TYPE_CHECKING: + _AwaitableT = TypeVar("_AwaitableT", bound=Awaitable[Any]) + _AwaitableT_co = TypeVar("_AwaitableT_co", + bound=Awaitable[Any], + covariant=True) + + class _SupportsSynchronousAnext(Protocol[_AwaitableT_co]): + + def __anext__(self) -> _AwaitableT_co: + ... + + def anext(i: "_SupportsSynchronousAnext[_AwaitableT]", /) -> "_AwaitableT": + return i.__anext__() + + +@pytest.mark.asyncio +async def test_merge_async_iterators(): + + async def mock_async_iterator(idx: int) -> AsyncIterator[str]: + try: + while True: + yield f"item from iterator {idx}" + await asyncio.sleep(0.1) + except asyncio.CancelledError: + pass + + iterators = [mock_async_iterator(i) for i in range(3)] + merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators( + *iterators) + + async def stream_output(generator: AsyncIterator[Tuple[int, str]]): + async for idx, output in generator: + print(f"idx: {idx}, output: {output}") + + task = asyncio.create_task(stream_output(merged_iterator)) + await asyncio.sleep(0.5) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + for iterator in iterators: + try: + await asyncio.wait_for(anext(iterator), 1) + except StopAsyncIteration: + # All iterators should be cancelled and print this message. + print("Iterator was cancelled normally") + except (Exception, asyncio.CancelledError) as e: + raise AssertionError() from e + def test_deprecate_kwargs_always(): diff --git a/vllm/utils.py b/vllm/utils.py index f6cd7262dba75..c36a927e3e6ff 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -5,6 +5,7 @@ import os import socket import subprocess +import sys import tempfile import threading import uuid @@ -234,9 +235,11 @@ async def consumer(): yield item except (Exception, asyncio.CancelledError) as e: for task in _tasks: - # NOTE: Pass the error msg in cancel() - # when only Python 3.9+ is supported. 
- task.cancel() + if sys.version_info >= (3, 9): + # msg parameter only supported in Python 3.9+ + task.cancel(e) + else: + task.cancel() raise e await asyncio.gather(*_tasks) From f900bccafb41f1caca100df4a026a65a0bc38154 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 07:02:54 +0800 Subject: [PATCH 054/154] [Bugfix][CI/Build] Fix codespell failing to skip files in `git diff` (#5097) --- format.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/format.sh b/format.sh index 2740f56241e0c..4fd7071a0278c 100755 --- a/format.sh +++ b/format.sh @@ -113,8 +113,11 @@ mypy vllm/logging --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml +# If git diff returns a file that is in the skip list, the file may be checked anyway: +# https://github.com/codespell-project/codespell/issues/1915 +# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem CODESPELL_EXCLUDES=( - '--skip' '*docs/source/_build/**,*tests/lora/data/*' + '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,tests/lora/data/**,build/**' ) From 6824b2f8e4243e97f87febf763859479feeba7b5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 07:05:01 +0800 Subject: [PATCH 055/154] [Core] Avoid the need to pass `None` values to `Sequence.inputs` (#5099) --- tests/core/test_block_manager.py | 2 -- tests/core/utils.py | 7 +------ tests/engine/output_processor/test_stop_checker.py | 6 +----- tests/test_cache_block_hashing.py | 1 - tests/tokenization/test_detokenize.py | 1 - vllm/inputs.py | 4 ++-- vllm/sequence.py | 4 ++-- 7 files changed, 6 insertions(+), 19 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index ddd843174f7b1..cd306b9e4d3cc 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -234,7 +234,6 @@ def test_append_slot_cow(): inputs={ "prompt": "one two three", "prompt_token_ids": [1, 2, 3], - "multi_modal_data": None }, block_size=block_size) @@ -525,7 +524,6 @@ def test_sliding_window_multi_seq(): inputs={ "prompt": "one two three", "prompt_token_ids": [0, 1, 2], - "multi_modal_data": None }, block_size=block_size) seq_group = SequenceGroup(request_id="1", diff --git a/tests/core/utils.py b/tests/core/utils.py index cd2045b8a1889..2fbf099c5f90b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -25,7 +25,6 @@ def create_dummy_prompt( inputs={ "prompt": prompt_str, "prompt_token_ids": prompt_tokens, - "multi_modal_data": None, }, block_size=block_size) seq_group = SequenceGroup(request_id=request_id, @@ -103,11 +102,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={ - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, - }, + inputs={"prompt_token_ids": prompt_token_ids}, block_size=16, ) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index 1d9c878ddde50..f795403e3d8ad 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -15,11 +15,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={ - "prompt": "", - "prompt_token_ids": [], - "multi_modal_data": None, - }, + inputs={"prompt_token_ids": []}, block_size=16, eos_token_id=eos_token_id, ) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py 
index 97864af88e40a..0fbe3dae1ff08 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -74,7 +74,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, inputs={ "prompt": prompt, "prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, }, block_size=block_size, eos_token_id=tokenizer.tokenizer.eos_token_id, diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 1d4c74d6bd8da..8d019fe5f38ca 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -126,7 +126,6 @@ def create_sequence(prompt_token_ids=None): inputs={ "prompt": "", "prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, }, block_size=16, ) diff --git a/vllm/inputs.py b/vllm/inputs.py index f5d99b1b66b70..85c9cd84f5ed5 100644 --- a/vllm/inputs.py +++ b/vllm/inputs.py @@ -126,5 +126,5 @@ class TextTokensPrompt(TypedDict): class LLMInputs(TypedDict): prompt_token_ids: List[int] - prompt: Optional[str] - multi_modal_data: Optional["MultiModalData"] + prompt: NotRequired[Optional[str]] + multi_modal_data: NotRequired[Optional["MultiModalData"]] diff --git a/vllm/sequence.py b/vllm/sequence.py index ee8c94bbf06f7..ac5c234d052bd 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -249,7 +249,7 @@ def __init__( @property def prompt(self) -> Optional[str]: - return self.inputs["prompt"] + return self.inputs.get("prompt") @property def prompt_token_ids(self) -> List[int]: @@ -257,7 +257,7 @@ def prompt_token_ids(self) -> List[int]: @property def multi_modal_data(self) -> Optional["MultiModalData"]: - return self.inputs["multi_modal_data"] + return self.inputs.get("multi_modal_data") @property def lora_int_id(self) -> int: From 623275f132ce6fc6ec3f56b234e91886ef03a8f4 Mon Sep 17 00:00:00 2001 From: Itay Etelis <92247226+Etelis@users.noreply.github.com> Date: Thu, 30 May 2024 02:13:22 +0300 Subject: [PATCH 056/154] [Bugfix] logprobs is not compatible with the OpenAI spec #4795 (#5031) --- vllm/entrypoints/openai/protocol.py | 5 ++--- vllm/entrypoints/openai/serving_chat.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 41e2f77fe56f1..e6eae689d7e03 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -109,7 +109,7 @@ class ChatCompletionRequest(OpenAIBaseModel): frequency_penalty: Optional[float] = 0.0 logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False - top_logprobs: Optional[int] = None + top_logprobs: Optional[int] = 0 max_tokens: Optional[int] = None n: Optional[int] = 1 presence_penalty: Optional[float] = 0.0 @@ -192,8 +192,7 @@ class ChatCompletionRequest(OpenAIBaseModel): # doc: end-chat-completion-extra-params def to_sampling_params(self) -> SamplingParams: - if self.logprobs and not self.top_logprobs: - raise ValueError("Top logprobs must be set when logprobs is.") + # We now allow logprobs being true without top_logrobs. 
logits_processors = None if self.logit_bias: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 33daabd881df0..8cb50e33e58d1 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -286,7 +286,7 @@ async def chat_completion_stream_generator( logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, + num_output_top_logprobs=request.top_logprobs, initial_text_offset=len(previous_texts[i]), ) else: @@ -373,7 +373,7 @@ async def chat_completion_full_generator( logprobs = self._create_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, + num_output_top_logprobs=request.top_logprobs, ) else: logprobs = None From 15dcd3e15eb1d4643655f4d71bc595e27d65fd5e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 29 May 2024 16:51:18 -0700 Subject: [PATCH 057/154] [Bugfix / Core] Prefix Caching Guards (merged with main) (#4846) Co-authored-by: rsnm2 Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index cfc462ff33b90..fa82bc8e3bd33 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -51,4 +51,4 @@ To run vLLM: .. note:: - vLLM docker image is currently designed to be run under the root user (contribution welcomed for changing this!). It will try to load library at runtime under the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . If you are running the container under a different user, you may need to change the permissions of the library (and all the parent directories) to allow the user to access it. Then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . + **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . 
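To make the relaxed chat logprobs handling in the patches above concrete, here is a minimal client-side sketch against an OpenAI-compatible vLLM server, using the synchronous OpenAI client. The base URL, API key, and model name are placeholders for illustration only; the point being shown is that `logprobs=true` may now be sent without `top_logprobs`.

from openai import OpenAI

# Placeholder endpoint and model name -- adjust for the actual deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat = client.chat.completions.create(
    model="HuggingFaceH4/zephyr-7b-beta",
    messages=[{"role": "user", "content": "what is 1+1?"}],
    max_tokens=5,
    temperature=0.0,
    logprobs=True,  # accepted on its own; top_logprobs is now optional
)
print(chat.choices[0].logprobs)
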
From 5763c73d0399a8a23ecdbcdebb07b2e2cb81b082 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Wed, 29 May 2024 20:30:18 -0400 Subject: [PATCH 058/154] [Bugfix] gptq_marlin: Ensure g_idx_sort_indices is not a Parameter (#5108) --- .../layers/quantization/gptq_marlin.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4374fd98012f6..ae440743fdf8e 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -298,14 +298,10 @@ def create_weights( }, ) - g_idx_sort_indices = Parameter( - torch.empty( - g_idx.shape, - dtype=torch.int32, - ), - requires_grad=False, + g_idx_sort_indices = torch.empty( + g_idx.shape, + dtype=torch.int32, ) - set_weight_attrs(g_idx_sort_indices, extra_weight_attrs) # Scales scales = Parameter( @@ -356,9 +352,9 @@ def create_weights( layer.register_parameter("qweight", qweight) layer.register_parameter("g_idx", g_idx) - layer.register_parameter("g_idx_sort_indices", g_idx_sort_indices) layer.register_parameter("scales", scales) layer.register_parameter("qzeros", qzeros) + layer.g_idx_sort_indices = g_idx_sort_indices layer.workspace = workspace layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition From 3a8332ce9c43ba6e3e57f6d974659f3d8985cb92 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Wed, 29 May 2024 22:27:39 -0500 Subject: [PATCH 059/154] [CI/Build] Docker cleanup functionality for amd servers (#5112) Co-authored-by: Alexey Kondratiev Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Co-authored-by: Alexei V. Ivanov Co-authored-by: omkarkakarparthi --- .buildkite/run-amd-test.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 7452423479521..bde8ab6184d3c 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -5,6 +5,34 @@ set -ex echo "--- ROCm info" rocminfo +# cleanup older docker images +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes + docker volume prune -f + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." 
+ fi +} + +# Call the cleanup docker function +cleanup_docker + echo "--- Resetting GPUs" echo "reset" > /opt/amdgpu/etc/gpu_state From 11a5a264a55c6601a622717a6517f96afd103b61 Mon Sep 17 00:00:00 2001 From: Breno Faria Date: Thu, 30 May 2024 11:52:14 +0200 Subject: [PATCH 060/154] [BUGFIX] [FRONTEND] Correct chat logprobs (#5029) Co-authored-by: Breno Faria --- tests/async_engine/test_openapi_server_ray.py | 6 +- tests/entrypoints/test_openai_server.py | 209 +++++++++++++++--- vllm/entrypoints/openai/protocol.py | 50 ++++- vllm/entrypoints/openai/serving_chat.py | 68 +++++- vllm/entrypoints/openai/serving_completion.py | 74 ++++++- vllm/entrypoints/openai/serving_engine.py | 52 +---- 6 files changed, 361 insertions(+), 98 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index fe558c2b2cc92..c4434301201ce 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -94,8 +94,10 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI): chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 + assert chat_completion.choices[0].logprobs.content[ + 0].top_logprobs is not None + assert len( + chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 619f0b3bd1d17..c4c1f8fe3afec 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -183,6 +183,26 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, completion.choices[0].text) >= 5 +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_no_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=None, + ) + choice = completion.choices[0] + assert choice.logprobs is None + + @pytest.mark.asyncio @pytest.mark.parametrize( # first test base model, then test loras @@ -202,7 +222,72 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, choice = completion.choices[0] assert choice.logprobs is not None assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) <= 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_some_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=5, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) <= 6 + + 
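The new tests below exercise the corrected response shapes: the completions endpoint keeps the flat `tokens` / `token_logprobs` / `top_logprobs` lists, while chat completions now follow the OpenAI spec's nested `logprobs.content` list. A rough sketch of the two access patterns, assuming `completion` and `chat_completion` are responses already obtained from the OpenAI client as in these tests:

def summarize_completion_logprobs(completion) -> None:
    # Completions endpoint: flat, parallel lists, one entry per sampled token.
    lp = completion.choices[0].logprobs
    for tok, tok_lp, alts in zip(lp.tokens, lp.token_logprobs, lp.top_logprobs):
        print(tok, tok_lp, alts)  # alts: dict mapping alternative token -> logprob


def summarize_chat_logprobs(chat_completion) -> None:
    # Chat endpoint (OpenAI spec): one object per token under `content`,
    # with alternatives nested inside each entry's top_logprobs list.
    for entry in chat_completion.choices[0].logprobs.content:
        alts = {alt.token: alt.logprob for alt in entry.top_logprobs}
        print(entry.token, entry.logprob, alts)
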
+@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=6, + ) + ... + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + stream = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=6, + stream=True, + ) + async for chunk in stream: + ... + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 @pytest.mark.asyncio @@ -232,8 +317,10 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 + assert chat_completion.choices[0].logprobs.content[ + 0].top_logprobs is not None + assert len( + chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -250,10 +337,93 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=False) + + choice = chat_completion.choices[0] + assert choice.logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) <= 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" 
+ }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) <= 6 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -262,13 +432,13 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, "content": "what is 1+1?" }] - # Default max_logprobs is 5, so this should raise an error + # Default max_logprobs is 20, so this should raise an error with pytest.raises((openai.BadRequestError, openai.APIError)): stream = await client.chat.completions.create(model=model_name, messages=messages, max_tokens=10, logprobs=True, - top_logprobs=10, + top_logprobs=21, stream=True) async for chunk in stream: ... @@ -278,25 +448,9 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, messages=messages, max_tokens=10, logprobs=True, - top_logprobs=10, + top_logprobs=30, stream=False) - with pytest.raises((openai.BadRequestError, openai.APIError)): - stream = await client.completions.create(model=model_name, - prompt="Test", - max_tokens=10, - logprobs=10, - stream=True) - async for chunk in stream: - ... - - with pytest.raises(openai.BadRequestError): - await client.completions.create(model=model_name, - prompt="Test", - max_tokens=10, - logprobs=10, - stream=False) - # the server should still work afterwards chat_completion = await client.chat.completions.create(model=model_name, messages=messages, @@ -743,13 +897,12 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, top_logprobs=5, extra_body=dict(guided_choice=TEST_CHOICE, guided_decoding_backend=guided_decoding_backend)) - top_logprobs = chat_completion.choices[0].logprobs.top_logprobs + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs # -9999.0 is the minimum logprob returned by OpenAI assert all( - isinstance(logprob, float) and logprob >= -9999.0 - for token_dict in top_logprobs - for token, logprob in token_dict.items()) + isinstance(token.logprob, float) and token.logprob >= -9999.0 + for token in top_logprobs) @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e6eae689d7e03..e380212a4d76b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -250,6 +250,19 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data + @model_validator(mode="before") + @classmethod + def check_logprobs(cls, data): + if "top_logprobs" in data and data["top_logprobs"] is not None: + if "logprobs" not in data or data["logprobs"] is False: + raise ValueError( + "when using `top_logprobs`, `logprobs` must be set to true." 
+ ) + elif not 0 <= data["top_logprobs"] <= 20: + raise ValueError( + "`top_logprobs` must be a value in the interval [0, 20].") + return data + class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -396,6 +409,15 @@ def check_guided_decoding_count(cls, data): "('guided_json', 'guided_regex' or 'guided_choice').") return data + @model_validator(mode="before") + @classmethod + def check_logprobs(cls, data): + if "logprobs" in data and data[ + "logprobs"] is not None and not 0 <= data["logprobs"] <= 5: + raise ValueError(("if passed, `logprobs` must be a value", + " in the interval [0, 5].")) + return data + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation @@ -415,7 +437,7 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) -class LogProbs(OpenAIBaseModel): +class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) tokens: List[str] = Field(default_factory=list) @@ -425,7 +447,7 @@ class LogProbs(OpenAIBaseModel): class CompletionResponseChoice(OpenAIBaseModel): index: int text: str - logprobs: Optional[LogProbs] = None + logprobs: Optional[CompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = Field( default=None, @@ -448,7 +470,7 @@ class CompletionResponse(OpenAIBaseModel): class CompletionResponseStreamChoice(OpenAIBaseModel): index: int text: str - logprobs: Optional[LogProbs] = None + logprobs: Optional[CompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = Field( default=None, @@ -488,11 +510,25 @@ class ChatMessage(OpenAIBaseModel): content: str +class ChatCompletionLogProb(OpenAIBaseModel): + token: str + logprob: float = -9999.0 + bytes: Optional[List[int]] = None + + +class ChatCompletionLogProbsContent(ChatCompletionLogProb): + top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list) + + +class ChatCompletionLogProbs(OpenAIBaseModel): + content: Optional[List[ChatCompletionLogProbsContent]] = None + + class ChatCompletionResponseChoice(OpenAIBaseModel): index: int message: ChatMessage - logprobs: Optional[LogProbs] = None - finish_reason: Optional[str] = None + logprobs: Optional[ChatCompletionLogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None stop_reason: Optional[Union[int, str]] = None @@ -513,8 +549,8 @@ class DeltaMessage(OpenAIBaseModel): class ChatCompletionResponseStreamChoice(OpenAIBaseModel): index: int delta: DeltaMessage - logprobs: Optional[LogProbs] = None - finish_reason: Optional[str] = None + logprobs: Optional[ChatCompletionLogProbs] = None + finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None stop_reason: Optional[Union[int, str]] = None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8cb50e33e58d1..cc5b896e0e56c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,8 +1,10 @@ import codecs import time from dataclasses import dataclass -from typing import (AsyncGenerator, AsyncIterator, Iterable, List, Optional, - TypedDict, Union, cast, final) +from typing import (AsyncGenerator, AsyncIterator, Dict, Iterable, List, + Optional) +from typing import Sequence as GenericSequence +from typing import TypedDict, Union, cast, final from fastapi import Request from openai.types.chat 
import ChatCompletionContentPartTextParam @@ -10,8 +12,9 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import ( - ChatCompletionContentPartParam, ChatCompletionMessageParam, - ChatCompletionRequest, ChatCompletionResponse, + ChatCompletionContentPartParam, ChatCompletionLogProb, + ChatCompletionLogProbs, ChatCompletionLogProbsContent, + ChatCompletionMessageParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, UsageInfo) @@ -21,6 +24,7 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.outputs import RequestOutput +from vllm.sequence import Logprob from vllm.utils import random_uuid logger = init_logger(__name__) @@ -283,11 +287,10 @@ async def chat_completion_stream_generator( previous_num_tokens[i]:] if output.logprobs else None if request.logprobs: - logprobs = self._create_logprobs( + logprobs = self._create_chat_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.top_logprobs, - initial_text_offset=len(previous_texts[i]), ) else: logprobs = None @@ -370,7 +373,7 @@ async def chat_completion_full_generator( top_logprobs = output.logprobs if request.logprobs: - logprobs = self._create_logprobs( + logprobs = self._create_chat_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.top_logprobs, @@ -383,8 +386,7 @@ async def chat_completion_full_generator( message=ChatMessage(role=role, content=output.text), logprobs=logprobs, finish_reason=output.finish_reason, - stop_reason=output.stop_reason, - ) + stop_reason=output.stop_reason) choices.append(choice_data) if request.echo: @@ -414,3 +416,51 @@ async def chat_completion_full_generator( ) return response + + def _get_top_logprobs( + self, logprobs: Dict[int, Logprob], + top_logprobs: Optional[int]) -> List[ChatCompletionLogProb]: + return [ + ChatCompletionLogProb( + token=self._get_decoded_token(p[1], p[0]), + logprob=max(p[1].logprob, -9999.0), + bytes=list( + self._get_decoded_token(p[1], + p[0]).encode("utf-8", + errors="replace"))) + for i, p in enumerate(logprobs.items()) + if top_logprobs and i < top_logprobs + ] + + def _create_chat_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + num_output_top_logprobs: Optional[int] = None, + ) -> ChatCompletionLogProbs: + """Create OpenAI-style logprobs.""" + + logprobs_content = [] + + for i, token_id in enumerate(token_ids): + step_top_logprobs = top_logprobs[i] + if step_top_logprobs is None: + logprobs_content.append( + ChatCompletionLogProbsContent( + token=self.tokenizer.decode(token_id), + bytes=list( + self.tokenizer.decode(token_id).encode( + "utf-8", errors="replace")))) + else: + logprobs_content.append( + ChatCompletionLogProbsContent( + token=step_top_logprobs[token_id].decoded_token, + logprob=max(step_top_logprobs[token_id].logprob, + -9999.0), + bytes=list( + step_top_logprobs[token_id].decoded_token.encode( + "utf-8", errors="replace")), + top_logprobs=self._get_top_logprobs( + step_top_logprobs, num_output_top_logprobs))) + + return ChatCompletionLogProbs(content=logprobs_content) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d1812c8f44f41..2fb122edaf98a 100644 --- 
a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,23 +1,29 @@ import time from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List, - Optional, Tuple) + Optional) +from typing import Sequence as GenericSequence +from typing import Tuple from fastapi import Request from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import (CompletionRequest, +# yapf: disable +from vllm.entrypoints.openai.protocol import (CompletionLogProbs, + CompletionRequest, CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, - LogProbs, UsageInfo) + UsageInfo) +# yapf: enable from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.outputs import RequestOutput +from vllm.sequence import Logprob from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -25,7 +31,7 @@ TypeTokenIDs = List[int] TypeTopLogProbs = List[Optional[Dict[int, float]]] TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] + [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs] def parse_prompt_format(prompt) -> Tuple[bool, list]: @@ -235,7 +241,7 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - logprobs = self._create_logprobs( + logprobs = self._create_completion_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, @@ -317,7 +323,7 @@ def request_output_to_completion_response( assert top_logprobs is not None, ( "top_logprobs must be provided when logprobs " "is requested") - logprobs = self._create_logprobs( + logprobs = self._create_completion_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, @@ -351,3 +357,59 @@ def request_output_to_completion_response( choices=choices, usage=usage, ) + + def _create_completion_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + num_output_top_logprobs: int, + initial_text_offset: int = 0, + ) -> CompletionLogProbs: + """Create logprobs for OpenAI Completion API.""" + out_text_offset: List[int] = [] + out_token_logprobs: List[Optional[float]] = [] + out_tokens: List[str] = [] + out_top_logprobs: List[Optional[Dict[str, float]]] = [] + + last_token_len = 0 + + for i, token_id in enumerate(token_ids): + step_top_logprobs = top_logprobs[i] + if step_top_logprobs is None: + token = self.tokenizer.decode(token_id) + out_tokens.append(token) + out_token_logprobs.append(None) + out_top_logprobs.append(None) + else: + token = self._get_decoded_token(step_top_logprobs[token_id], + token_id) + token_logprob = max(step_top_logprobs[token_id].logprob, + -9999.0) + out_tokens.append(token) + out_token_logprobs.append(token_logprob) + + # makes sure to add the top num_output_top_logprobs + 1 + # logprobs, as defined in the openai API + # (cf. 
https://github.com/openai/openai-openapi/blob/ + # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153) + out_top_logprobs.append({ + # Convert float("-inf") to the + # JSON-serializable float that OpenAI uses + self._get_decoded_token(top_lp[1], top_lp[0]): + max(top_lp[1].logprob, -9999.0) + for i, top_lp in enumerate(step_top_logprobs.items()) + if num_output_top_logprobs >= i + }) + + if len(out_text_offset) == 0: + out_text_offset.append(initial_text_offset) + else: + out_text_offset.append(out_text_offset[-1] + last_token_len) + last_token_len = len(token) + + return CompletionLogProbs( + text_offset=out_text_offset, + token_logprobs=out_token_logprobs, + tokens=out_tokens, + top_logprobs=out_top_logprobs, + ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 708b0dad102c4..066acdf1c019a 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, EmbeddingRequest, ErrorResponse, - LogProbs, ModelCard, ModelList, + ModelCard, ModelList, ModelPermission) from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -75,51 +75,6 @@ async def show_available_models(self) -> ModelList: model_cards.extend(lora_cards) return ModelList(data=model_cards) - def _create_logprobs( - self, - token_ids: List[int], - top_logprobs: List[Optional[Dict[int, Logprob]]], - num_output_top_logprobs: Optional[int] = None, - initial_text_offset: int = 0, - ) -> LogProbs: - """Create OpenAI-style logprobs.""" - logprobs = LogProbs() - last_token_len = 0 - if num_output_top_logprobs: - logprobs.top_logprobs = [] - - for i, token_id in enumerate(token_ids): - step_top_logprobs = top_logprobs[i] - if step_top_logprobs is None: - token = self.tokenizer.decode(token_id) - logprobs.tokens.append(token) - logprobs.token_logprobs.append(None) - assert logprobs.top_logprobs is not None - logprobs.top_logprobs.append(None) - else: - token_logprob = step_top_logprobs[token_id].logprob - token = step_top_logprobs[token_id].decoded_token - logprobs.tokens.append(token) - token_logprob = max(token_logprob, -9999.0) - logprobs.token_logprobs.append(token_logprob) - - if num_output_top_logprobs: - assert logprobs.top_logprobs is not None - logprobs.top_logprobs.append({ - # Convert float("-inf") to the - # JSON-serializable float that OpenAI uses - p.decoded_token: max(p.logprob, -9999.0) - for i, p in step_top_logprobs.items() - } if step_top_logprobs else None) - - if len(logprobs.text_offset) == 0: - logprobs.text_offset.append(initial_text_offset) - else: - logprobs.text_offset.append(logprobs.text_offset[-1] + - last_token_len) - last_token_len = len(token) - return logprobs - def create_error_response( self, message: str, @@ -235,3 +190,8 @@ def _validate_prompt_and_tokenize( f"Please reduce the length of the messages or completion.", ) else: return input_ids, input_text + + def _get_decoded_token(self, logprob: Logprob, token_id: int) -> str: + if logprob.decoded_token is not None: + return logprob.decoded_token + return self.tokenizer.decode(token_id) From 2827c6835b41bc216fcb5993a7dddfc317a64953 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 30 May 2024 05:58:37 -0700 Subject: [PATCH 061/154] [Bugfix] Automatically Detect SparseML models (#5119) --- vllm/config.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 
deletions(-) diff --git a/vllm/config.py b/vllm/config.py index be65660883bd5..ea372eda38d2d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -188,6 +188,17 @@ def _verify_embedding_mode(self) -> None: self.embedding_mode = any( ModelRegistry.is_embedding_model(arch) for arch in architectures) + def _parse_quant_hf_config(self): + quant_cfg = getattr(self.hf_config, "quantization_config", None) + if quant_cfg is None: + # SparseML uses a "compression_config" with a "quantization_config". + compression_cfg = getattr(self.hf_config, "compression_config", + None) + if compression_cfg is not None: + quant_cfg = compression_cfg.get("quantization_config", None) + + return quant_cfg + def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] rocm_supported_quantization = ["gptq", "squeezellm"] @@ -195,12 +206,13 @@ def _verify_quantization(self) -> None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. - quant_cfg = getattr(self.hf_config, "quantization_config", None) + quant_cfg = self._parse_quant_hf_config() + if quant_cfg is not None: quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it - for name, method in QUANTIZATION_METHODS.items(): + for _, method in QUANTIZATION_METHODS.items(): quantization_override = method.override_quantization_method( quant_cfg, self.quantization) if quantization_override: From 4ae80ddee9a6012da3547638b65f5a6a79d9a10c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 30 May 2024 06:29:48 -0700 Subject: [PATCH 062/154] [CI/Build] increase wheel size limit to 200 MB (#5130) --- .buildkite/check-wheel-size.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 41d9e682572a6..75ad094fa1382 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,7 +1,7 @@ import os import zipfile -MAX_SIZE_MB = 150 +MAX_SIZE_MB = 200 def print_top_10_largest_files(zip_file): From 886ead6b3bc7dc42d169a7a4bee9f16a6caac7aa Mon Sep 17 00:00:00 2001 From: Hyunsung Lee Date: Thu, 30 May 2024 22:56:19 +0900 Subject: [PATCH 063/154] [Misc] remove duplicate definition of `seq_lens_tensor` in model_runner.py (#5129) --- vllm/worker/model_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5ddd2d1b65f81..47aa70dc617af 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -518,9 +518,6 @@ def _prepare_model_input( else: multi_modal_input = None - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) query_lens_tensor = torch.tensor(query_lens, dtype=torch.long, device=self.device) From 758b903c4ffa0e812e79c96c5dedaeab4270e781 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 31 May 2024 00:59:23 +0800 Subject: [PATCH 064/154] [Doc] Use intersphinx and update entrypoints docs (#5125) --- docs/source/conf.py | 13 ++++++++++++- vllm/engine/async_llm_engine.py | 2 -- vllm/engine/llm_engine.py | 4 ++-- vllm/entrypoints/llm.py | 26 ++++++++++++++++++-------- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 9da5a4991734d..cfebc2ff9bb33 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,7 +80,7 @@ def setup(app): generate_examples() -# Mock out external dependencies here. +# Mock out external dependencies here, otherwise the autodoc pages may be blank. 
autodoc_mock_imports = [ "cpuinfo", "torch", @@ -115,4 +115,15 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: autodoc.ClassDocumenter = MockedClassDocumenter +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'typing_extensions': + ('https://typing-extensions.readthedocs.io/en/latest', None), + 'numpy': ('https://numpy.org/doc/stable', None), + 'torch': ('https://pytorch.org/docs/stable', None), + 'psutil': ('https://psutil.readthedocs.io/en/stable', None), +} + +autodoc_preserve_defaults = True + navigation_with_keys = False diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index d4289c715d9e6..db4d2849b3f0e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -307,8 +307,6 @@ class AsyncLLMEngine: generate method when there are requests in the waiting queue. The generate method yields the outputs from the :class:`LLMEngine` to the caller. - NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`. - Args: worker_use_ray: Whether to use Ray for model workers. Required for distributed execution. Should be the same as diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2b716d9953381..30e963f0d4071 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -70,8 +70,8 @@ class LLMEngine: The :class:`~vllm.LLM` class wraps this class for offline batched inference and the :class:`AsyncLLMEngine` class wraps this class for online serving. - NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs` - class. For the comprehensive list of arguments, see :ref:`engine_args`. + The config arguments are derived from :class:`~vllm.EngineArgs`. (See + :ref:`engine_args`) Args: model_config: The configuration related to the LLM model. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4cab8aa884fc1..8a4245f93679b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -30,12 +30,6 @@ class LLM: this class generates texts from the model, using an intelligent batching mechanism and efficient memory management. - NOTE: This class is intended to be used for offline inference. For online - serving, use the :class:`~vllm.AsyncLLMEngine` class instead. - - NOTE: For the comprehensive list of arguments, see - :class:`~vllm.EngineArgs`. - Args: model: The name or path of a HuggingFace Transformers model. tokenizer: The name or path of a HuggingFace Transformers tokenizer. @@ -89,6 +83,12 @@ class LLM: When a sequence has context length larger than this, we fall back to eager mode. disable_custom_all_reduce: See ParallelConfig + **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See + :ref:`engine_args`) + + Note: + This class is intended to be used for offline inference. For online + serving, use the :class:`~vllm.AsyncLLMEngine` class instead. """ DEPRECATE_LEGACY: ClassVar[bool] = False @@ -262,7 +262,7 @@ def generate( ) -> List[RequestOutput]: """Generates the completions for the input prompts. - NOTE: This class automatically batches the given prompts, considering + This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method. @@ -279,6 +279,11 @@ def generate( Returns: A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. 
+ + Note: + Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the ``inputs`` parameter. """ if prompt_token_ids is not None or multi_modal_data is not None: inputs = self._convert_v1_inputs( @@ -402,7 +407,7 @@ def encode( ) -> List[EmbeddingRequestOutput]: """Generates the completions for the input prompts. - NOTE: This class automatically batches the given prompts, considering + This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method. @@ -418,6 +423,11 @@ def encode( Returns: A list of `EmbeddingRequestOutput` objects containing the generated embeddings in the same order as the input prompts. + + Note: + Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the ``inputs`` parameter. """ if prompt_token_ids is not None or multi_modal_data is not None: inputs = self._convert_v1_inputs( From a190463117cf8bb39e30e40f7eb27cf59b81cb3f Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Fri, 31 May 2024 02:11:07 +0900 Subject: [PATCH 065/154] add doc about serving option on dstack (#3074) Co-authored-by: Roger Wang --- docs/source/serving/deploying_with_dstack.rst | 103 ++++++++++++++++++ docs/source/serving/integrations.rst | 1 + 2 files changed, 104 insertions(+) create mode 100644 docs/source/serving/deploying_with_dstack.rst diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst new file mode 100644 index 0000000000000..baf87314ca8e4 --- /dev/null +++ b/docs/source/serving/deploying_with_dstack.rst @@ -0,0 +1,103 @@ +.. _deploying_with_dstack: + +Deploying with dstack +============================ + +.. raw:: html + +

+    vLLM_plus_dstack
+ +vLLM can be run on a cloud based GPU machine with `dstack `__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. + +To install dstack client, run: + +.. code-block:: console + + $ pip install "dstack[all] + $ dstack server + +Next, to configure your dstack project, run: + +.. code-block:: console + + $ mkdir -p vllm-dstack + $ cd vllm-dstack + $ dstack init + +Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: + +.. code-block:: yaml + + type: service + + python: "3.11" + env: + - MODEL=NousResearch/Llama-2-7b-chat-hf + port: 8000 + resources: + gpu: 24GB + commands: + - pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + model: + format: openai + type: chat + name: NousResearch/Llama-2-7b-chat-hf + +Then, run the following CLI for provisioning: + +.. code-block:: console + + $ dstack run . -f serve.dstack.yml + + ⠸ Getting run plan... + Configuration serve.dstack.yml + Project deep-diver-main + User deep-diver + Min resources 2..xCPU, 8GB.., 1xGPU (24GB) + Max price - + Max duration - + Spot policy auto + Retry policy no + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + ... + Shown 3 of 193 offers, $5.876 max + + Continue? [y/n]: y + ⠙ Submitting run... + ⠏ Launching spicy-treefrog-1 (pulling) + spicy-treefrog-1 provisioning completed (running) + Service is published at ... + +After the provisioning, you can interact with the model by using the OpenAI SDK: + +.. code-block:: python + + from openai import OpenAI + + client = OpenAI( + base_url="https://gateway.", + api_key="" + ) + + completion = client.chat.completions.create( + model="NousResearch/Llama-2-7b-chat-hf", + messages=[ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming.", + } + ] + ) + + print(completion.choices[0].message.content) + +.. note:: + + dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. 
If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository `__ diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 2066e80b03298..83a8b5a88bd38 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -9,4 +9,5 @@ Integrations deploying_with_triton deploying_with_bentoml deploying_with_lws + deploying_with_dstack serving_with_langchain From 51cf757483bd9331bf70616efd08e9c51fbb0d3b Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 30 May 2024 13:13:46 -0500 Subject: [PATCH 066/154] Bump version to v0.4.3 (#5046) From c72d8908b7b54e68c1b358180611d597eefa8b28 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 30 May 2024 16:37:16 -0500 Subject: [PATCH 067/154] [Build] Disable sm_90a in cu11 (#5141) --- CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b668cbc97de15..8df3a7a26d884 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,7 +177,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") include(FetchContent) SET(CUTLASS_ENABLE_HEADERS_ONLY=ON) FetchContent_Declare( - cutlass + cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git # CUTLASS 3.5.0 GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc @@ -200,11 +200,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The CUTLASS kernels for Hopper require sm90a to be enabled. # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # That adds an extra 17MB to compiled binary, so instead we selectively enable it. - set_source_files_properties( - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" - PROPERTIES - COMPILE_FLAGS - "-gencode arch=compute_90a,code=sm_90a") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 11) + set_source_files_properties( + "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" + PROPERTIES + COMPILE_FLAGS + "-gencode arch=compute_90a,code=sm_90a") + endif() endif() From cf0711b092b26f9f0a8ff9ec87ada38e92e8a216 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 30 May 2024 17:04:37 -0700 Subject: [PATCH 068/154] [Bugfix] Avoid Warnings in SparseML Activation Quantization (#5120) --- .../compressed_tensors_w8a8_statictensor.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index d16e570d12202..64a88b01cd260 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -89,23 +89,34 @@ def create_weights(self, layer: torch.nn.Module, requires_grad=False) layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - - set_weight_attrs(weight, {"weight_loader": weight_loader}) - + set_weight_attrs(weight, { + "weight_loader": weight_loader, + "input_dim": 1, + "output_dim": 0, + }) layer.register_parameter("input_scale", input_scale) - set_weight_attrs(input_scale, {"weight_loader": weight_loader}) + set_weight_attrs(input_scale, { + "weight_loader": weight_loader, + "ignore_warning": True, + }) layer.register_parameter("input_zero_point", 
input_zero_point) - set_weight_attrs(input_zero_point, {"weight_loader": weight_loader}) + set_weight_attrs(input_zero_point, { + "weight_loader": weight_loader, + "ignore_warning": True, + }) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, {"weight_loader": weight_loader}) set_weight_attrs( weight_scale, { + "weight_loader": weight_loader, "shard_splitter": self.scales_shard_splitter, - "logical_widths": output_partition_sizes + "logical_widths": output_partition_sizes, + "ignore_warning": True, }) layer.register_parameter("weight_zero_point", weight_zero_point) - set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader}) + set_weight_attrs(weight_zero_point, { + "weight_loader": weight_loader, + "ignore_warning": True + }) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): weight = layer.weight From dcaf819f4efe4cae03b4cc81a2c5847e880d2600 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 30 May 2024 22:02:11 -0400 Subject: [PATCH 069/154] [Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5) (#5136) --- csrc/quantization/marlin/sparse/common/mma.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 45ab67a78a1de..fd3dbda5b9c93 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -32,7 +32,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, float* c = reinterpret_cast(&frag_c); if (psel == 0) { asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -40,7 +41,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) @@ -49,7 +51,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(e[0])); } else { asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -57,7 +60,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." 
+ "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) From 7da3c3f6e0a39c8c949fde2a7fa3140823d38e65 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 31 May 2024 02:13:01 +0000 Subject: [PATCH 070/154] Fix cutlass sm_90a vesrion in CMakeList --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8df3a7a26d884..5f991af61d9bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,7 +200,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The CUTLASS kernels for Hopper require sm90a to be enabled. # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # That adds an extra 17MB to compiled binary, so instead we selectively enable it. - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 11) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) set_source_files_properties( "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" PROPERTIES From 2c66f1796d0c12f3ae35b149da965eab4e0964fe Mon Sep 17 00:00:00 2001 From: SnowDist Date: Fri, 31 May 2024 10:24:41 +0800 Subject: [PATCH 071/154] [Model] Support MAP-NEO model (#5081) Co-authored-by: Zhuohan Li --- benchmarks/kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 ++++++ csrc/cpu/attention.cpp | 6 ++++++ tests/kernels/test_attention.py | 2 +- tests/kernels/test_cache.py | 2 +- tests/kernels/test_pos_encoding.py | 2 +- vllm/attention/ops/paged_attn.py | 2 +- 8 files changed, 18 insertions(+), 6 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index fc9621e885dc4..e6f4e9e6b9716 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -170,7 +170,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 256], + choices=[64, 80, 96, 112, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 9188e811e2982..00e55f6060b52 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -93,7 +93,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 256], + choices=[64, 80, 96, 112, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 45edc3252380c..8f89f89786c3b 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -754,6 +754,9 @@ void paged_attention_v1_launcher( case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; case 256: LAUNCH_PAGED_ATTENTION_V1(256); break; @@ -911,6 +914,9 @@ void paged_attention_v2_launcher( case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; case 256: LAUNCH_PAGED_ATTENTION_V2(256); break; diff --git 
a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 438e9bdb19f50..ed8cfbd421f0f 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -390,6 +390,9 @@ void paged_attention_v1_impl_launcher( case 128: LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break; + case 192: + LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; case 256: LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break; @@ -703,6 +706,9 @@ void paged_attention_v2_impl_launcher( case 128: LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break; + case 192: + LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; case 256: LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index d9380da888f97..fa5c951a7fa7a 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 256 +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 856d21020cadf..f26eb896105f6 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 256] +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 076730cdbae0d..fbabc02bf9a9d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 256] +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index e119fdcf11113..a214f40d16514 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return [64, 80, 96, 112, 128, 192, 256] @staticmethod def get_kv_cache_shape( From 5388c64b154cfdd4d50d6e5134556f7358a81cfd Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 31 May 2024 00:00:26 -0500 Subject: [PATCH 072/154] Revert "[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5)" (#5149) --- csrc/quantization/marlin/sparse/common/mma.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index fd3dbda5b9c93..45ab67a78a1de 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -32,8 +32,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, float* c = reinterpret_cast(&frag_c); if (psel == 0) { asm volatile( - 
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -41,8 +40,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) @@ -51,8 +49,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(e[0])); } else { asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -60,8 +57,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) From 5e9f300e9f2284eb79e6f9194bdc7632c149510e Mon Sep 17 00:00:00 2001 From: functionxu123 <1229853312@qq.com> Date: Fri, 31 May 2024 13:14:50 +0800 Subject: [PATCH 073/154] [Misc]: optimize eager mode host time (#4196) Co-authored-by: xuhao --- vllm/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index c36a927e3e6ff..f0ea434a0969a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -17,6 +17,7 @@ Hashable, List, Optional, OrderedDict, Tuple, TypeVar, Union) +import numpy as np import psutil import torch @@ -501,11 +502,6 @@ def str_to_int_tuple(s: str) -> Tuple[int, ...]: f"(e.g., 1, 2, 3). Given input: {s}") from e -def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]: - assert len(x) <= max_len - return x + [pad] * (max_len - len(x)) - - def make_tensor_with_pad( x: List[List[int]], max_len: int, @@ -518,7 +514,10 @@ def make_tensor_with_pad( The padding is applied to the end of each inner list until it reaches `max_len`. 
""" - padded_x = [pad_to_max_length(x_i, max_len, pad) for x_i in x] + padded_x = np.zeros([len(x), max_len], dtype=np.int32) + pad + for ind, blocktb in enumerate(x): + assert len(blocktb) <= max_len + padded_x[ind, :len(blocktb)] = blocktb return torch.tensor(padded_x, dtype=dtype, device=device) From f329e2eaccc66a2d51c6ee4f931a5ccf5cff1936 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 31 May 2024 14:29:19 -0700 Subject: [PATCH 074/154] [Model] Enable FP8 QKV in MoE and refine kernel tuning script (#5039) --- benchmarks/kernels/benchmark_mixtral_moe.py | 48 ++++-- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 138 +++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 108 +++++++------ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 84 +++++----- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ vllm/model_executor/models/mixtral.py | 9 -- 8 files changed, 711 insertions(+), 114 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 5280b214144c9..196ec8cfce88e 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -11,25 +11,36 @@ from vllm.model_executor.layers.fused_moe import (fused_moe, get_config_file_name) -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -def main(dtype: str): +def main(model, tp_size, gpu, dtype: str): + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu) method = fused_moe for bs in [ 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, 2048, 3072, 4096 ]: - run_grid(bs, method=method, dtype=dtype) - - -def run_grid(bs, method, dtype: str): - d_model = 4096 + run_grid(bs, + model=model, + method=method, + gpu=gpu, + tp_size=tp_size, + dtype=dtype) + + +def run_grid(bs, model, method, gpu, tp_size, dtype: str): + if model == '8x7B': + d_model = 4096 + model_intermediate_size = 14336 + num_layers = 32 + elif model == '8x22B': + d_model = 6144 + model_intermediate_size = 16384 + num_layers = 56 + else: + raise ValueError(f'Unsupported Mixtral model {model}') num_total_experts = 8 top_k = 2 - tp_size = 2 - model_intermediate_size = 14336 - num_layers = 32 + # tp_size = 2 num_calls = 100 num_warmup_trials = 1 @@ -211,5 +222,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, choices=['float8', 'float16'], help='Data type used for fused_moe kernel computations', ) + parser.add_argument('--model', + type=str, + default='8x7B', + choices=['8x7B', '8x22B'], + help='The Mixtral model to benchmark') + parser.add_argument('--tp-size', + type=int, + default=2, + help='Tensor paralleli size') + parser.add_argument('--gpu', + type=int, + default=0, + help="GPU ID for benchmarking") args = parser.parse_args() - sys.exit(main(args.dtype)) + sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype)) diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..3f3ccdafa88f3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,138 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..0c495e7e290c6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 9287808a94d0e..5b78c30f08b68 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -3,61 +3,59 @@ "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 }, "8": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, "num_warps": 8, - "num_stages": 5 + "num_stages": 2 }, "16": { - "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "24": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5 }, - "32": { + "24": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "GROUP_SIZE_M": 64, + "num_warps": 4, "num_stages": 4 }, - "48": { + "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 }, - "64": { + "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, @@ -65,37 +63,45 @@ "num_warps": 4, "num_stages": 4 }, - "96": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, - "256": { + "96": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, - "512": { + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 128, @@ -109,7 +115,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -125,7 +131,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..60a65724d68b9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 2ad07bf79a25c..75f8b0017b9c6 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -2,104 +2,104 @@ "1": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, + "GROUP_SIZE_M": 32, + "num_warps": 8, "num_stages": 4 }, "4": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 2 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "16": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 4 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 + "num_warps": 4, + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, "num_stages": 4 }, "48": { 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "256": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 4 + "num_stages": 5 }, "512": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..34b916e574f88 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d6dd7fa1fe9e2..2f4237339486e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -278,15 +278,6 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - if isinstance( - quant_config, - Fp8Config) and not quant_config.is_checkpoint_fp8_serialized: - print_warning_once( - "For Mixtral FP8 quantization, we currently do not quantize " - "the attention layers until their FP8 performance is improved." - ) - quant_config = None - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, From 951e3d20a0758efe30568ab0ccf8532a8c414a39 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 31 May 2024 17:20:19 -0700 Subject: [PATCH 075/154] [Doc] Add checkmark for GPTBigCodeForCausalLM LoRA support (#5171) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index e4bae80343a2c..82e71e61975c8 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -62,7 +62,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - + - ✅︎ * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 
From d349dbd4017ac1e768b5af7f175b8b7a08f73a2d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 31 May 2024 20:21:38 -0400 Subject: [PATCH 076/154] [Build] Guard against older CUDA versions when building CUTLASS 3.x kernels (#5168) --- csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu | 10 ++++++++-- csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu | 11 ++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 5fd6d8ff20867..531414bc45165 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -1,3 +1,9 @@ +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + #include #include @@ -6,8 +12,6 @@ #include #include -// clang-format will break include orders -// clang-format off #include "cutlass/cutlass.h" #include "cute/tensor.hpp" @@ -241,3 +245,5 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, } } } + +#endif diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu index dab73ac6c831e..eb532f2ac7a9b 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu @@ -1,5 +1,6 @@ +#include + #include -#include #include void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -17,10 +18,12 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +#endif void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -51,7 +54,13 @@ void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, if (version_num >= 90) { // Hopper + + // Guard against compilation issues for sm90 kernels +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 cutlass_scaled_mm_dq_sm90(c, a, b, a_scales, b_scales); +#else + cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales); +#endif } else if (version_num == 89) { // Ada Lovelace cutlass_scaled_mm_dq_sm89(c, a, b, a_scales, b_scales); From 031fd4ee3604edbebb54aebd39ec1b53a1eb5e48 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 8 Jun 2024 17:01:55 +0000 Subject: [PATCH 077/154] format --- vllm/engine/arg_utils.py | 4 ++-- vllm/model_executor/layers/linear.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8291daa05ec80..24d42b791b5d0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -575,8 +575,8 @@ def create_engine_config(self, ) -> EngineConfig: self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.dtype, self.seed, self.revision, self.code_revision, self.rope_scaling, self.tokenizer_revision, - self.max_model_len, self.quantization, self.quantization_param_path, - self.sparsity, self.enforce_eager, + self.max_model_len, self.quantization, + self.quantization_param_path, self.sparsity, self.enforce_eager, self.max_context_len_to_capture, self.max_seq_len_to_capture, self.max_logprobs, self.disable_sliding_window, 
self.skip_tokenizer_init, self.served_model_name) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0a26cadf90bb4..1b18efd5177f2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -462,7 +462,6 @@ def weight_loader(self, if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - # UPSTREAM SYNC: needed for LazyCompressedParameter self.loaded_shards.add(loaded_shard_id) assert param_data.shape == loaded_weight.shape From 9ed5f76bd60b63c8519823b066646e774f093bc5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 12:34:39 +0000 Subject: [PATCH 078/154] skip blockspase attention --- tests/kernels/test_blocksparse_attention.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 9da13ca6e2310..08262115534c4 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -361,6 +361,13 @@ def ref_multi_query_kv_attention( return ref_output +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip( + "C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now." +) @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) From ec71544411efffe2021420742150771105a184a6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 13:08:40 +0000 Subject: [PATCH 079/154] fix falcon --- tests/conftest.py | 25 ++----------------------- tests/models/test_models_logprobs.py | 7 ++++++- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d23216966247c..0b44b1761c9ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -564,12 +564,13 @@ def __init__( block_size: int = 16, enable_chunked_prefill: bool = False, swap_space=4, + trust_remote_code: bool = True, **kwargs, ) -> None: self.model = LLM( model=model_name, tokenizer=tokenizer_name, - trust_remote_code=True, + trust_remote_code=trust_remote_code, dtype=dtype, swap_space=swap_space, disable_log_stats=disable_log_stats, @@ -695,28 +696,6 @@ def vllm_runner(): # UPSTREAM SYNC: needed for nm-automation class VllmRunnerNm(VllmRunner): - def __init__( - self, - model_name: str, - sparsity: Optional[str] = None, - tokenizer_name: Optional[str] = None, - dtype: str = "half", - disable_log_stats: bool = True, - tensor_parallel_size: int = 1, - max_model_len: Optional[int] = None, - ) -> None: - self.model = LLM( - model=model_name, - sparsity=sparsity, - tokenizer=tokenizer_name, - trust_remote_code=True, - dtype=dtype, - swap_space=0, - disable_log_stats=disable_log_stats, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - ) - def generate_w_logprobs( self, prompts: List[str], diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 091620ca357b5..04c172e0a7942 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -39,7 +39,7 @@ @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [3]) def test_models( @@ 
-64,8 +64,13 @@ def test_models( del hf_model + trust_remote_code = True + # Falcon fails if trust_remote_code = True + # https://github.com/vllm-project/vllm/issues/5363 + trust_remote_code = model != "tiiuae/falcon-7b" vllm_model = vllm_runner_nm(model, dtype=dtype, + trust_remote_code=trust_remote_code, max_model_len=MODEL_MAX_LEN) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, From 7381340d59fddc79ad8bac29c6c45c925de5a34b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 13:14:23 +0000 Subject: [PATCH 080/154] skip sliding window chunked prefill --- tests/core/block/e2e/test_correctness_sliding_window.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index e98292e807d73..b75cdd9456b24 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -73,6 +73,13 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, assert sum(cmp) > 0.7 * len(cmp) +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip( + "C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now." +) @pytest.mark.parametrize( "common_llm_kwargs", [{ From c23ca05712898c7a9c550bee41cb6e97e80d6064 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 13:15:04 +0000 Subject: [PATCH 081/154] skip prefix prefill --- tests/kernels/test_prefix_prefill.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 2646307674efb..1c6c4a2e0d088 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -20,6 +20,13 @@ SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip( + "C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now." +) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) From 85512eb54c14d52ce14af7b68865368822a393d3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 13:19:12 +0000 Subject: [PATCH 082/154] skip tensorizer --- tests/tensorizer_loader/test_tensorizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index cc377247765e3..2f8d5824b194f 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -83,6 +83,12 @@ def test_can_deserialize_s3(vllm_runner): assert deserialized_outputs +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip( + "This test requires libsodium23 to be installed. " + "It is not installed in our test instances on GCP, " + "we need to skip this for now." 
+) @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): From 0cea2c247a57570f02a521888f9590771eac4546 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 8 Jun 2024 13:54:05 -0400 Subject: [PATCH 083/154] [Misc][Breaking] Change FP8 checkpoint format from act_scale -> input_scale (#5353) --- .../model_executor/layers/quantization/fp8.py | 28 ++++++------- vllm/model_executor/models/mixtral.py | 42 ++++++++++++------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b084b9cee4983..afb554fecdb33 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -152,10 +152,10 @@ def create_weights( output_partition_sizes=output_partition_sizes, **extra_weight_attrs) - # ACTIVATION SCALE + # INPUT ACTIVATION SCALE if self.quant_config.activation_scheme == "static": self._create_scale_param( - scale_name="act_scale", + scale_name="input_scale", layer=layer, output_partition_sizes=output_partition_sizes, **extra_weight_attrs) @@ -188,7 +188,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.logical_widths = None - layer.act_scale = None + layer.input_scale = None return # If checkpoint is fp8, requantize the separately quantized logical @@ -213,18 +213,18 @@ def process_weights_after_loading(self, layer: Module) -> None: weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) - # ACT_SCALE + # INPUT ACTIVATION SCALE # Dynamic: set to None (required input to ops.scaled_fp8_quant). - # Static: set to max of the act_scales (since they are equal). + # Static: set to max of the input_scales (since they are equal). if self.quant_config.activation_scheme == "dynamic": - layer.act_scale = None + layer.input_scale = None elif self.quant_config.activation_scheme == "static": - if not all_close_1d(layer.act_scale): + if not all_close_1d(layer.input_scale): raise ValueError( - "All the act_scales for the logical weights of a layer " - f"must be equal. But got {layer.act_scale}") - layer.act_scale = Parameter(layer.act_scale.max(), - requires_grad=False) + "All the input_scales for the logical weights of a " + f"layer must be equal. But got {layer.input_scale}") + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) else: raise ValueError( f"Unknown scheme {self.quant_config.activation_scheme}") @@ -234,10 +234,10 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. - # If dynamic, layer.act_scale is None and x_scale computed from x. - # If static, layer.act_scale is scalar and x_scale set to act_scale. + # If dynamic, layer.input_scale is None and x_scale computed from x. + # If static, layer.input_scale is scalar and x_scale set to act_scale. 
qinput, x_scale = ops.scaled_fp8_quant(x, - layer.act_scale, + layer.input_scale, batch_dim_padding=17) # Fused GEMM_DQ -- note we padded the input above because diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2f4237339486e..10f823b229fdf 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -142,7 +142,7 @@ def __init__( "weight_loader": self.weight_loader, }) - # ACT_SCALE (for fp8) + # INPUT_SCALE (for fp8) if quant_config.activation_scheme == "static": if not quant_config.is_checkpoint_fp8_serialized: raise ValueError( @@ -175,7 +175,15 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] - if "act_scale" in weight_name or "weight_scale" in weight_name: + + # Loading scales + if "input_scale" in weight_name or "w2.weight_scale" in weight_name: + if param_data[expert_id] != 1 and (param_data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param_data[expert_id]} " + f"vs. {loaded_weight}") param_data[expert_id] = loaded_weight def process_weights_after_loading(self): @@ -199,20 +207,22 @@ def process_weights_after_loading(self): self.w13_weight = nn.Parameter(w13_weight, requires_grad=False) self.w2_weight = nn.Parameter(w2_weight, requires_grad=False) - # If checkpoint is fp8 + static, cleanup act_scales. - # Since state_dict has an act_scale per expert but our kernels - # are passed one act_scale shared across all experts. - elif self.quant_config.activation_scheme == "static": - if self.a13_scale is None or self.a2_scale is None: - raise ValueError( - "QuantConfig has static quantization, but found " - "activation scales are None.") + else: + # If checkpoint is fp8 + static, cleanup input_scales. + # Since state_dict has an input_scale per expert but our kernels + # are passed one input_scale shared across all experts. + if self.quant_config.activation_scheme == "static": + if self.a13_scale is None or self.a2_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None.") - if (not all_close_1d(self.a13_scale) - or not all_close_1d(self.a2_scale)): - print_warning_once( - "Found act_scales that are not equal for fp8 MoE layer. " - "Using the maximum across experts for each layer. ") + if (not all_close_1d(self.a13_scale) + or not all_close_1d(self.a2_scale)): + print_warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. 
") self.a13_scale = nn.Parameter(self.a13_scale.max(), requires_grad=False) @@ -532,7 +542,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # These are the activation scales for the experts # (param_name, weight_name, expert_id) ("a13_scale" if weight_name in ["w1", "w3"] else "a2_scale", - f"experts.{expert_id}.{weight_name}.act_scale", expert_id) + f"experts.{expert_id}.{weight_name}.input_scale", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] ] From 31147dfd3dd8ee098e5982c9b36bba2997596b2f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 13:25:42 +0000 Subject: [PATCH 084/154] format --- .../core/block/e2e/test_correctness_sliding_window.py | 10 ++++------ tests/kernels/test_blocksparse_attention.py | 10 ++++------ tests/kernels/test_prefix_prefill.py | 10 ++++------ tests/tensorizer_loader/test_tensorizer.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index b75cdd9456b24..d77d6a1dbb741 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -74,12 +74,10 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, # UPSTREAM SYNC: breaks NM automation. -@pytest.mark.skip( - "C compiler not installed in NM automation. " - "This codepath follows a triton pathway, which " - "JITs using clang or gcc. Since neither are installed " - "in our test instances, we need to skip this for now." -) +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 08262115534c4..12109f8767782 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -362,12 +362,10 @@ def ref_multi_query_kv_attention( # UPSTREAM SYNC: breaks NM automation. -@pytest.mark.skip( - "C compiler not installed in NM automation. " - "This codepath follows a triton pathway, which " - "JITs using clang or gcc. Since neither are installed " - "in our test instances, we need to skip this for now." -) +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 1c6c4a2e0d088..2bf0ecc1fcb69 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -21,12 +21,10 @@ # UPSTREAM SYNC: breaks NM automation. -@pytest.mark.skip( - "C compiler not installed in NM automation. " - "This codepath follows a triton pathway, which " - "JITs using clang or gcc. Since neither are installed " - "in our test instances, we need to skip this for now." -) +@pytest.mark.skip("C compiler not installed in NM automation. 
" + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 2f8d5824b194f..b63fcf23af09a 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -86,7 +86,7 @@ def test_can_deserialize_s3(vllm_runner): # UPSTREAM SYNC: breaks NM automation. @pytest.mark.skip( "This test requires libsodium23 to be installed. " - "It is not installed in our test instances on GCP, " + "It is not installed in our test instances on GCP, " "we need to skip this for now." ) @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index afb554fecdb33..44161bde73f4e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -235,7 +235,7 @@ def apply(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. # If dynamic, layer.input_scale is None and x_scale computed from x. - # If static, layer.input_scale is scalar and x_scale set to act_scale. + # If static, layer.input_scale is scalar and x_scale uses it. qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale, batch_dim_padding=17) From b2afd771c68ec1e6780843b6750654e23a0a90a8 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 14:45:45 +0000 Subject: [PATCH 085/154] added lm eval test group --- tests/accuracy/test_lm_eval_correctness.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index ded6d98d6f6ad..4ee213dde29cd 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -8,6 +8,7 @@ import yaml from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_lm_eval_test_group if TYPE_CHECKING: import lm_eval as lm_eval_t @@ -50,6 +51,8 @@ class EvalTaskDefinition(EvalTaskDefinitionOpts): DEFAULT_RTOL = 0.05 +@pytest.mark.skipif(should_skip_lm_eval_test_group(), + reason="Current job configured to skip this test group") @pytest.mark.parametrize("eval_data", TEST_DATA) def test_lm_eval_correctness( eval_data: EvalTaskDefinition, From 85d54e81718678b13d039ce5c0bfeb8107cb2305 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 14:45:57 +0000 Subject: [PATCH 086/154] added env variable entrypoint --- tests/nm_utils/utils_skip.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/nm_utils/utils_skip.py diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py new file mode 100644 index 0000000000000..a0f7ce33954fd --- /dev/null +++ b/tests/nm_utils/utils_skip.py @@ -0,0 +1,37 @@ +"""Checks environment variables to skip various test groups. +The functions here are imported by each test file. +The .github/actions/nm-test-skipping-env-setup sets these + variables in the testing automation. 
+""" + +import os + + +def should_skip_kernel_test_group(): + TEST_KERNELS = os.getenv("TEST_KERNELS", "0") + return TEST_KERNELS != "1" + + +def should_skip_lora_test_group(): + TEST_LORA = os.getenv("TEST_LORA", "0") + return TEST_LORA != "1" + + +def should_skip_spec_decode_test_group(): + TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "0") + return TEST_SPEC_DECODE != "1" + + +def should_skip_models_test_group(): + TEST_ALL_MODELS = os.getenv("TEST_ALL_MODELS", "0") + return TEST_ALL_MODELS != "1" + + +def should_skip_lm_eval_test_group(): + TEST_LM_EVAL = os.getenv("TEST_LM_EVAL", "0") + return TEST_LM_EVAL != "1" + + +def should_skip_tensorizer_test_group(): + TEST_TENSORIZER = os.getenv("TEST_TENSORIZER", "0") + return TEST_TENSORIZER != "1" \ No newline at end of file From 49fdf7da1bf361ab2d8746ec79cc3aee91b93750 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 14:58:43 +0000 Subject: [PATCH 087/154] format --- tests/accuracy/test_lm_eval_correctness.py | 8 +++++--- tests/kernels/test_activation.py | 5 +++++ tests/kernels/test_attention_selector.py | 5 +++++ tests/nm_utils/utils_skip.py | 16 +++++++++++++++- 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 4ee213dde29cd..4665b3c5178e8 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -8,7 +8,11 @@ import yaml from tests.nm_utils.server import ServerContext -from tests.nm_utils.utils_skip import should_skip_lm_eval_test_group +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_LM_EVAL"): + pytest.mark.skip("TEST_LM_EVAL is set to 0, skipping group", + allow_module_level=True) if TYPE_CHECKING: import lm_eval as lm_eval_t @@ -51,8 +55,6 @@ class EvalTaskDefinition(EvalTaskDefinitionOpts): DEFAULT_RTOL = 0.05 -@pytest.mark.skipif(should_skip_lm_eval_test_group(), - reason="Current job configured to skip this test group") @pytest.mark.parametrize("eval_data", TEST_DATA) def test_lm_eval_correctness( eval_data: EvalTaskDefinition, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index a624c4ca9ee62..d60db173483d6 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -3,6 +3,11 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping group", allow_module_level=True) + from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index f439afa9b7d2b..17637b29c5d45 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -4,6 +4,11 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping group", allow_module_level=True) + from vllm.attention.selector import which_attn_to_use diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index a0f7ce33954fd..458316f560e4d 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -34,4 +34,18 @@ def should_skip_lm_eval_test_group(): def should_skip_tensorizer_test_group(): TEST_TENSORIZER = os.getenv("TEST_TENSORIZER", 
"0") - return TEST_TENSORIZER != "1" \ No newline at end of file + return TEST_TENSORIZER != "1" + + +MAP = { + "TEST_KERNELS": should_skip_kernel_test_group, + "TEST_LORA": should_skip_lora_test_group, + "TEST_SPEC_DECODE": should_skip_spec_decode_test_group, + "TEST_ALL_MODELS": should_skip_models_test_group, + "TEST_LM_EVAL": should_skip_lm_eval_test_group, + "TEST_TENSORIZER": should_skip_tensorizer_test_group +} + + +def should_skip_test_group(group_name: str) -> bool: + return MAP[group_name]() From e6ac0513f212898244f490c7296278fb685f0be7 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:00:27 +0000 Subject: [PATCH 088/154] format --- tests/kernels/test_activation.py | 8 ++++---- tests/kernels/test_attention.py | 5 +++++ tests/kernels/test_attention_selector.py | 6 +++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index d60db173483d6..6286de83835e3 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -4,15 +4,15 @@ import torch from tests.nm_utils.utils_skip import should_skip_test_group - -if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping group", allow_module_level=True) - from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fa5c951a7fa7a..46e96d115e657 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,16 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 17637b29c5d45..4e8e5fee36155 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,11 +5,11 @@ import torch from tests.nm_utils.utils_skip import should_skip_test_group +from vllm.attention.selector import which_attn_to_use if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping group", allow_module_level=True) - -from vllm.attention.selector import which_attn_to_use + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) @pytest.mark.parametrize( From 5f83af897ce88a9763da8ba86e9d5a3ae7a59a99 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:01:42 +0000 Subject: [PATCH 089/154] format --- tests/kernels/test_blocksparse_attention.py | 5 +++++ tests/kernels/test_cache.py | 5 +++++ tests/kernels/test_cutlass.py | 5 +++++ tests/kernels/test_flash_attn.py | 6 ++++++ tests/kernels/test_int8_quant.py | 5 +++++ tests/kernels/test_layernorm.py | 5 +++++ tests/kernels/test_marlin_gemm.py | 5 +++++ tests/kernels/test_moe.py | 5 +++++ tests/kernels/test_pos_encoding.py | 5 +++++ tests/kernels/test_prefix_prefill.py | 5 +++++ tests/kernels/test_rand.py | 5 +++++ tests/kernels/test_sampler.py | 5 +++++ 12 files changed, 61 insertions(+) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 12109f8767782..ffe7ac9862693 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -4,6 +4,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) @@ -11,6 +12,10 @@ from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f26eb896105f6..f9f59e99bb6ca 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -4,8 +4,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 2cf0e86e5ca44..ca788d6316fad 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -7,8 +7,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea4422..b0af04fb22b6a 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -4,6 +4,12 @@ import torch from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + NUM_HEADS = [(16, 16), (32, 8), (64, 8)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index b9aa00ce13f56..a0fd931e372ac 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -1,8 +1,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm._C import ops +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 210d59e4f32fa..439e276ce2799 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,8 +1,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.layernorm import RMSNorm +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 1f8d94bad26d9..34e4e7b5db469 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import _custom_ops as ops from 
vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, @@ -20,6 +21,10 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2356b9ec18b0d..e7f28f58e1c75 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,10 +7,15 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + def torch_moe(a, w1, w2, score, topk): B, D = a.shape diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index fbabc02bf9a9d..fb55a87a7decb 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -4,10 +4,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rotary_embedding import get_rope from .allclose_default import get_default_atol, get_default_rtol +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 2bf0ecc1fcb69..142041b9c00d4 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -7,9 +7,14 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128, 96, 24] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a4242d22eb489..edbf841db37f3 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -3,9 +3,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec5..619b8f37aa56d 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -5,12 +5,17 @@ import triton import triton.language as tl +from 
tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.ops.sample import ( MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_KERNELS"): + pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 From 61e8d8a7860104e8622cdd9bc3370c396710994b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:15:18 +0000 Subject: [PATCH 090/154] skip kernels env variable --- tests/accuracy/test_lm_eval_correctness.py | 2 +- tests/kernels/test_activation.py | 5 +++-- tests/kernels/test_attention.py | 4 ++-- tests/kernels/test_attention_selector.py | 4 ++-- tests/kernels/test_blocksparse_attention.py | 4 ++-- tests/kernels/test_cache.py | 4 ++-- tests/kernels/test_cutlass.py | 4 ++-- tests/kernels/test_flash_attn.py | 4 ++-- tests/kernels/test_int8_quant.py | 4 ++-- tests/kernels/test_layernorm.py | 4 ++-- tests/kernels/test_marlin_gemm.py | 4 ++-- tests/kernels/test_moe.py | 4 ++-- tests/kernels/test_pos_encoding.py | 4 ++-- tests/kernels/test_prefix_prefill.py | 4 ++-- tests/kernels/test_rand.py | 5 ++--- tests/kernels/test_sampler.py | 4 ++-- 16 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 4665b3c5178e8..ea86e0209c136 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -11,7 +11,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_LM_EVAL"): - pytest.mark.skip("TEST_LM_EVAL is set to 0, skipping group", + pytest.skip("TEST_LM_EVAL is set to 0, skipping group", allow_module_level=True) if TYPE_CHECKING: diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 6286de83835e3..0dfa26a58596c 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -10,8 +10,9 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) + DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 46e96d115e657..cacd06af072fe 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -13,8 +13,8 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 4e8e5fee36155..03a71949559d1 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -8,8 +8,8 @@ from vllm.attention.selector import which_attn_to_use if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) @pytest.mark.parametrize( diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index ffe7ac9862693..8f2151a958404 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -13,8 +13,8 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9f59e99bb6ca..c0413b0d56ac1 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -8,8 +8,8 @@ from vllm import _custom_ops as ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index ca788d6316fad..a10722fbacff2 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,8 +11,8 @@ from vllm import _custom_ops as ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index b0af04fb22b6a..e218dd3cbd345 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -7,8 +7,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) NUM_HEADS = [(16, 16), (32, 8), (64, 8)] HEAD_SIZES = [128, 256] diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index a0fd931e372ac..87fff0753e9a3 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -5,8 +5,8 @@ from vllm._C import ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing diff --git a/tests/kernels/test_layernorm.py 
b/tests/kernels/test_layernorm.py index 439e276ce2799..98e8f65f0334f 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -5,8 +5,8 @@ from vllm.model_executor.layers.layernorm import RMSNorm if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 34e4e7b5db469..0fa2bf12ea8db 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -22,8 +22,8 @@ gptq_pack, quantize_weights, sort_weights) if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index e7f28f58e1c75..4daa0d073368b 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -13,8 +13,8 @@ from vllm.model_executor.models.mixtral import MixtralMoE if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) def torch_moe(a, w1, w2, score, topk): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index fb55a87a7decb..4b841970eb13e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,8 +10,8 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 142041b9c00d4..02f5afcf6af2c 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -12,8 +12,8 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index edbf841db37f3..f2344901bea7d 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -8,9 +8,8 @@ from vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) - + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 619b8f37aa56d..be08dc2f4aff4 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -13,8 +13,8 @@ from 
vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.mark.skip("TEST_KERNELS=0, skipping kernel group", - allow_module_level=True) + pytest.skip("TEST_KERNELS=0, skipping kernel group", + allow_module_level=True) SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 From fa58955cd453561c3cec1985d384f0bcd2f36f08 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:22:03 +0000 Subject: [PATCH 091/154] skipping lora env variable --- tests/accuracy/test_lm_eval_correctness.py | 2 +- tests/kernels/test_activation.py | 1 - tests/kernels/test_rand.py | 1 + tests/lora/test_baichuan.py | 6 +++++- tests/lora/test_chatglm3.py | 6 ++++++ tests/lora/test_gemma.py | 4 ++++ tests/lora/test_layer_variation.py | 4 ++++ tests/lora/test_layers.py | 4 ++++ tests/lora/test_llama.py | 4 ++++ tests/lora/test_long_context.py | 4 ++++ tests/lora/test_lora.py | 4 ++++ tests/lora/test_lora_checkpoints.py | 4 ++++ tests/lora/test_lora_manager.py | 4 ++++ tests/lora/test_mixtral.py | 4 ++++ tests/lora/test_phi.py | 6 ++++++ tests/lora/test_punica.py | 4 ++++ tests/lora/test_quant_model.py | 4 ++++ tests/lora/test_tokenizer_group.py | 4 ++++ tests/lora/test_utils.py | 5 +++++ tests/lora/test_worker.py | 6 ++++++ 20 files changed, 78 insertions(+), 3 deletions(-) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index ea86e0209c136..0b31a48e7e78e 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -12,7 +12,7 @@ if should_skip_test_group(group_name="TEST_LM_EVAL"): pytest.skip("TEST_LM_EVAL is set to 0, skipping group", - allow_module_level=True) + allow_module_level=True) if TYPE_CHECKING: import lm_eval as lm_eval_t diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 0dfa26a58596c..6ce27d6097d48 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -13,7 +13,6 @@ pytest.skip("TEST_KERNELS=0, skipping kernel group", allow_module_level=True) - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index f2344901bea7d..9137f07e5158e 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -11,6 +11,7 @@ pytest.skip("TEST_KERNELS=0, skipping kernel group", allow_module_level=True) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..4b2b50819970d 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,10 +1,14 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains 
tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -105,4 +109,4 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files): del llm_tp4 cleanup() - assert output_tp1 == output_tp4 \ No newline at end of file + assert output_tp1 == output_tp4 diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..d7406fa8f7307 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,6 +1,12 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index bf96de026ae09..212bfb4548279 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,8 +1,12 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "google/gemma-7b" diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index ace10e389ae6a..deb39aa6fd6c1 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -7,10 +7,14 @@ from transformers import AutoModelForCausalLM import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. 
[/user] [assistant]", # noqa: E501 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 3d868b0b1d5fc..2d922e82e6731 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -7,6 +7,7 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, @@ -37,6 +38,9 @@ from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + TOLERANCES = { torch.float16: (5e-3, 5e-3), torch.float32: (5e-3, 5e-3), diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index f5a571e81acba..667f5fdcfdec6 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -2,10 +2,14 @@ import ray import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index cc1d4d620ff8a..4c6b06ec0b25e 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -5,6 +5,7 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora from vllm.lora.request import LoRARequest @@ -13,6 +14,9 @@ from .data.long_context_test_data import prompts_and_responses +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + context_len_to_scaling_factor = { "16k": 4, "32k": 8, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e341..eae74f6e5e37e 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -1,10 +1,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] QKV_TENSOR_SIZES = [ (8192, 1024, 1024), diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b624ea..35acf0b1a4cab 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,8 +1,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..d58d60145d527 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -6,6 +6,7 @@ from safetensors.torch import load_file from torch import nn +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import LoRAConfig from vllm.lora.layers 
import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -18,6 +19,9 @@ WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 53d49a8dbc813..80500f32f41b5 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -2,8 +2,12 @@ import torch import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4cb96f..258ff193f0db8 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,6 +1,12 @@ +import pytest + import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index f021c003b1322..30bf7dcb80c82 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,6 +4,10 @@ import torch import vllm.lora.punica as punica +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) def assert_close(a, b): diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index feda06b256e04..a1381e9513ef1 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,10 +6,14 @@ import pytest import vllm +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from .conftest import cleanup +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + @dataclass class ModelWithQuantization: diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b547..396bc95c56db1 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,12 +1,16 @@ import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_lora_tokenizer from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from ..conftest import get_tokenizer_pool_config +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 892f6081e2aaa..c05317b7c5925 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,10 +1,15 @@ from collections import OrderedDict +import pytest from torch import nn +from 
tests.nm_utils.utils_skip import should_skip_test_group from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.utils import LRUCache +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + def test_parse_fine_tuned_lora_name(): fixture = { diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..ea82af40cf9c3 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,12 +3,18 @@ import tempfile from unittest.mock import patch +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker +if should_skip_test_group(group_name="TEST_LORA"): + pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): From 22566106611aba668e97c19ddc848f9ad26ba7a8 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:27:21 +0000 Subject: [PATCH 092/154] fix issue with internal method --- neuralmagic/benchmarks/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index fc78bcc10a357..a607b0ad0d9bc 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -28,7 +28,10 @@ def download_model(model: str) -> None: def max_model_length_from_model_id(model: str, trust_remote_code: bool = False) -> int: config = get_config(model, trust_remote_code=trust_remote_code) - return _get_and_verify_max_len(config, max_model_len=None) + return _get_and_verify_max_len(config, + max_model_len=None, + disable_sliding_window=False, + sliding_window_len=None) def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: From 01973f5cd160bbf1a70737fb1c1b46bfd7fe53ed Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:32:39 +0000 Subject: [PATCH 093/154] formatting --- neuralmagic/benchmarks/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index a607b0ad0d9bc..3a1234f1e41d4 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -30,7 +30,7 @@ def max_model_length_from_model_id(model: str, config = get_config(model, trust_remote_code=trust_remote_code) return _get_and_verify_max_len(config, max_model_len=None, - disable_sliding_window=False, + disable_sliding_window=False, sliding_window_len=None) From ac25d3a6f1c922ddfbd759ab8b6349a1e781f28c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 15:35:02 +0000 Subject: [PATCH 094/154] spec decode env variable --- tests/spec_decode/e2e/test_compatibility.py | 5 +++++ tests/spec_decode/e2e/test_integration.py | 6 ++++++ tests/spec_decode/e2e/test_integration_dist.py | 5 +++++ tests/spec_decode/e2e/test_logprobs.py | 5 +++++ tests/spec_decode/e2e/test_multistep_correctness.py | 5 +++++ tests/spec_decode/e2e/test_ngram_correctness.py | 6 ++++++ tests/spec_decode/test_batch_expansion.py | 5 +++++ tests/spec_decode/test_dynamic_spec_decode.py | 5 +++++ tests/spec_decode/test_metrics.py | 5 +++++ tests/spec_decode/test_multi_step_worker.py | 5 +++++ tests/spec_decode/test_ngram_worker.py | 6 ++++++ 
tests/spec_decode/test_spec_decode_worker.py | 5 +++++ tests/spec_decode/test_utils.py | 5 +++++ 13 files changed, 68 insertions(+) diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 81f91c5e10b0d..ceb11ff77792a 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import get_output_from_llm_generator +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index 4a2b62151f8cd..a58873c9379bf 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -4,8 +4,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + from .conftest import run_greedy_equality_correctness_test +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index d444ef24cbfda..4d0f3204dca49 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -5,10 +5,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.utils import is_hip from .conftest import run_greedy_equality_correctness_test +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index c266b4c7ecebd..ee37622ca3648 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import get_logprobs_from_llm_generator +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.skip("Out of CPU Memory in NM Automation") @pytest.mark.parametrize( diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 94d71fb012727..60eeb18806513 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -33,11 +33,16 @@ import pytest from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import (get_output_from_llm_generator, run_greedy_equality_correctness_test) +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 
d475d37af6425..4f57f00dfdc3a 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,8 +26,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + from .conftest import run_greedy_equality_correctness_test +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0cc..e6cf14c5a6784 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,10 +1,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from .utils import create_seq_group_metadata_from_prompts, mock_worker +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize('num_target_seq_ids', [100]) @pytest.mark.skip_global_cleanup diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index 48fa862b2e41a..d9d4dcf1624ab 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -3,6 +3,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.metrics import AsyncMetricsCollector @@ -12,6 +13,10 @@ from .utils import create_batch, mock_worker +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize('queue_size', [4]) @pytest.mark.parametrize('batch_size', [1]) diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 312878804b86e..6c4890fc7232a 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -4,8 +4,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.spec_decode.metrics import AsyncMetricsCollector +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + def test_initial_call_returns_none(): """Expect first call to get metrics to return None. 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index cb2de97a4af94..145d9cf3320d0 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -4,6 +4,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.utils import set_random_seed from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker @@ -14,6 +15,10 @@ create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize('num_steps', list(range(1, 17))) def test_assert_enough_kv_space(num_steps: int): diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 88b40d1eb4674..0578bf1611422 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,11 +1,17 @@ +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer from .utils import create_seq_group_metadata_from_prompts, create_worker +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + def test_ngram_algo_correctness_for_single_no_match(): """Verify our ngram algo find the right candidate in the prompt diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73d668..dc4b2509bbe5c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed from vllm.sequence import ExecuteModelRequest, SamplerOutput @@ -17,6 +18,10 @@ from .utils import create_batch, create_sampler_output_list, mock_worker +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 6b6f35a1a1d05..82e1ee6908894 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -2,9 +2,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.sequence import SequenceGroupMetadata from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len +if should_skip_test_group(group_name="TEST_SPEC_DECODE"): + pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + allow_module_level=True) + def test_get_all_seq_ids(): """Verify get_all_seq_ids extracts all seq ids. 
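Note on the pattern used in this patch and in the LoRA patch above: every module is gated through tests/nm_utils/utils_skip.should_skip_test_group, whose implementation is not included in this series. A minimal sketch of such a helper, assuming each group is controlled by an environment variable of the same name (TEST_KERNELS, TEST_LORA, TEST_SPEC_DECODE, ...) and that an unset variable leaves the group enabled:

import os


def should_skip_test_group(group_name: str) -> bool:
    # Assumed behavior, inferred from the skip messages in the diffs
    # (e.g. "TEST_SPEC_DECODE=0, skipping spec decode group"): the group
    # runs unless its environment variable is explicitly set to "0".
    return os.getenv(group_name, "1") == "0"

Each test module then calls pytest.skip(..., allow_module_level=True) at import time when the helper returns True, so the whole file is skipped during collection rather than failing test by test.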
From 4fbff3524f8a4f631c2bd6330005d857e29ef49f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 17:24:39 +0000 Subject: [PATCH 095/154] stash model changes --- requirements-dev.txt | 2 + .../test_basic_server_correctness.py | 188 ------------------ tests/conftest.py | 4 +- tests/models/README.md | 2 + .../nm_models_core/test_llm_logprobs.py | 53 +++++ .../test_magic_wand.py} | 47 ++++- tests/models/test_aqlm.py | 5 + tests/models/test_big_models.py | 6 + tests/models/test_compressed_memory.py | 62 ------ tests/models/test_embedding.py | 6 + tests/models/test_fp8.py | 5 + tests/models/test_gptq_marlin.py | 5 + tests/models/test_gptq_marlin_24.py | 5 + tests/models/test_llava.py | 5 + tests/models/test_marlin.py | 5 + tests/models/test_mistral.py | 6 + tests/models/test_models.py | 6 + tests/models/test_models_logprobs.py | 5 + tests/models/test_oot_registration.py | 6 + tests/models/test_registry.py | 5 + tests/nm_utils/server.py | 20 +- 21 files changed, 185 insertions(+), 263 deletions(-) delete mode 100644 tests/basic_correctness/test_basic_server_correctness.py create mode 100644 tests/models/README.md create mode 100644 tests/models/nm_models_core/test_llm_logprobs.py rename tests/models/{test_compressed.py => nm_models_core/test_magic_wand.py} (54%) delete mode 100644 tests/models/test_compressed_memory.py diff --git a/requirements-dev.txt b/requirements-dev.txt index 4329a4fd0fbe3..92ca9ee6aeb3c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,6 +31,8 @@ peft requests==2.31 ray sentence-transformers # required for embedding +optimum # required for hf gptq baselines +auto-gptq # required for hf gptq baselines # Benchmarking aiohttp diff --git a/tests/basic_correctness/test_basic_server_correctness.py b/tests/basic_correctness/test_basic_server_correctness.py deleted file mode 100644 index c33d0aa46c8f2..0000000000000 --- a/tests/basic_correctness/test_basic_server_correctness.py +++ /dev/null @@ -1,188 +0,0 @@ -import asyncio -from os import getenv -from typing import Dict, List, Type - -import openai -import pytest -import torch -from datasets import load_dataset -from openai import AsyncOpenAI -from transformers import AutoTokenizer - -from tests.conftest import HfRunnerNM -from tests.models.compare_utils import check_logprobs_close -from tests.nm_utils.logging import make_logger -from tests.nm_utils.server import ServerContext -from vllm.model_executor.layers.quantization import get_quantization_config - - -@pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client - - -@pytest.fixture -def hf_runner_nm() -> Type[HfRunnerNM]: - return HfRunnerNM - - -async def my_chat( - client, - model: str, - messages: List[Dict], - max_tokens: int, - temperature: float, - num_logprobs: int, -): - """ submit a single prompt chat and collect results. 
""" - return await client.chat.completions.create(model=model, - messages=messages, - max_tokens=max_tokens, - temperature=temperature, - logprobs=True, - top_logprobs=num_logprobs) - - -@pytest.mark.parametrize("model, max_model_len, sparsity, gptq_config", [ - ("mistralai/Mistral-7B-Instruct-v0.2", 4096, None, None), - ("neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50", 4096, "sparse_w16a16", - None), - ("NousResearch/Llama-2-7b-chat-hf", 4096, None, None), - ("neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat", 4096, - "sparse_w16a16", None), - ("microsoft/phi-2", 2048, None, None), - ("google/gemma-1.1-2b-it", 2056, None, None), - ("HuggingFaceH4/zephyr-7b-gemma-v0.1", 4096, None, None), -]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tensor_parallel_size", [None]) -# note: repeating the test for 2 values of tensor_parallel_size -# increases the overall execution time by unnecessarily -# collecting the HuggingFace runner data twice. -# Consider refactoring to eliminate that repeat. -def test_models_on_server( - hf_runner_nm: HfRunnerNM, - client: AsyncOpenAI, - model: str, - max_model_len: int, - sparsity: str, - gptq_config: str, - tensor_parallel_size: int, - max_tokens: int, - num_logprobs: int, -) -> None: - """ - This test compares the output of the vllm OpenAI server against that of - a HuggingFace transformer. We expect them to be fairly close. "Close" - is measured by checking that the top 3 logprobs for each token includes - the token of the other inference tool. The first time that there is no - exact match, as long as there is a match to one of the top `num_logprobs` - logprobs, the test will not proceed further, but will pass. - - Parameters to the test identify a model to test, and key arguments - required for that model (see the `max_model_len`, `sparsity` and - `gptq_config` params below). The additional parametrizations expand test - coverage across the functional space of the server. - - :param hf_runner_nm: fixture for the HfRunnerNM - :param client: fixture with an openai.AsyncOpenAI client - :param model: The Hugginface id for a model to test with - :param max_model_len: passed to the vllm Server's --max-model-len option - :param sparsity: passed to the vllm Server's --sparsity option - :param gptq_config: quantization method id for this model. default None - means quantization isn't involved. - :param tensor_parallel_size: passed to the vllm Server's - --tensor_parallel_size option - :param max_tokens: the total number of tokens to consider for closeness - :param num_logprobs: the total number of logprobs included when - calculating closeness - """ - logger = make_logger("vllm_test") - # check that the requested gpu count is available in the test env - gpu_count = torch.cuda.device_count() - if tensor_parallel_size and gpu_count < tensor_parallel_size: - pytest.skip(f"gpu count {gpu_count} is insufficient for " - f"tensor_parallel_size = {tensor_parallel_size}") - - # skip this model if the current device does not have the required - # gpu capability. 
- device_capability = torch.cuda.get_device_capability() - capability = device_capability[0] * 10 + device_capability[1] - if gptq_config and ( - capability < - get_quantization_config(gptq_config).get_min_capability()): - pytest.skip("insufficient system GPU device capability " - f"({capability}) for this model") - - hf_token = getenv("HF_TOKEN", None) - logger.info("loading chat prompts for testing.") - ds = load_dataset("nm-testing/qa-chat-prompts", split="train_sft") - ds = ds.select(range(20)) - - num_chat_turns = 3 - messages_list = [row["messages"][:num_chat_turns] for row in ds] - tokenizer = AutoTokenizer.from_pretrained(model) - chat_prompts = [ - tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - for messages in messages_list - ] - - logger.info("generating chat responses from HuggingFace runner.") - hf_model = hf_runner_nm(model, access_token=hf_token) - hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( - chat_prompts, max_tokens, num_logprobs, ignore_special_tokens=True) - - del hf_model - - logger.info("generating chat responses from vllm server.") - api_server_args = { - "--model": model, - "--max-model-len": max_model_len, - "--disable-log-requests": None, - } - if sparsity: - api_server_args["--sparsity"] = sparsity - if tensor_parallel_size: - api_server_args["--tensor-parallel-size"] = tensor_parallel_size - - # some devices will require a different `dtype` - if device_capability[0] < 8: - api_server_args["--dtype"] = "half" - - asyncio_event_loop = asyncio.get_event_loop() - temperature = 0.0 - with ServerContext(api_server_args, logger=logger) as _: - # submit an asynchronous request to the server for each prompt - chats = [ - my_chat(client, model, messages, max_tokens, temperature, - num_logprobs) - for messages in [query for query in messages_list] - ] - # await for all the requests to return, and gather their results - # in one place - results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) - - logger.info("preparing results from vllm server requests to include " - "tokens and logprobs.") - vllm_outputs = list() - for task_result in results: - for req_output in task_result.choices: - output_str = req_output.message.content - output_tokens = req_output.logprobs.model_extra["tokens"] - output_logprobs = req_output.logprobs.model_extra["top_logprobs"] - vllm_outputs.append((output_tokens, output_str, output_logprobs)) - - logger.info("comparing HuggingFace and vllm Server chat responses") - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf_model", - name_1="vllm_model", - ) diff --git a/tests/conftest.py b/tests/conftest.py index 0b44b1761c9ab..a9c30c6caff01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -147,7 +147,7 @@ def __init__( self, model_name: str, dtype: str = "half", - access_token: Optional[str] = None, + **kwargs, ) -> None: assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -166,7 +166,7 @@ def __init__( model_name, torch_dtype=torch_dtype, trust_remote_code=True, - token=access_token, + **kwargs, ).cuda() self.tokenizer = AutoTokenizer.from_pretrained( diff --git a/tests/models/README.md b/tests/models/README.md new file mode 100644 index 0000000000000..fae6c44501149 --- /dev/null +++ b/tests/models/README.md @@ -0,0 +1,2 @@ +* All the tests in `models/nm_models_core` run by default +* The tests in `models` require settng the `TEST_ALL_MODELS=1` enviornment variable \ No newline at end of file 
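The nm_models_core tests added below compare HuggingFace and vLLM outputs with check_logprobs_close, which tolerates divergence after the first mismatch: at the first position where the sampled tokens differ, each run's token only has to appear in the other run's top-k logprobs, and the comparison for that prompt then stops. The helper itself is not reproduced in this series; a minimal sketch, assuming the (tokens, text, top-logprobs) output tuples produced by the runners used here:

def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
    # Sketch only; the real helper lives in tests/models/utils.py and
    # tests/models/compare_utils.py and may differ in detail.
    for prompt_idx, (outputs_0, outputs_1) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        tokens_0, _, logprobs_0 = outputs_0
        tokens_1, _, logprobs_1 = outputs_1
        for idx, (tok_0, tok_1) in enumerate(zip(tokens_0, tokens_1)):
            if tok_0 == tok_1:
                continue
            # First mismatch: require cross-membership in the other run's
            # top logprobs, then allow the sequences to diverge.
            assert tok_0 in logprobs_1[idx], (
                f"{name_0} token {tok_0!r} not in {name_1} top logprobs "
                f"(prompt {prompt_idx}, position {idx})")
            assert tok_1 in logprobs_0[idx], (
                f"{name_1} token {tok_1!r} not in {name_0} top logprobs "
                f"(prompt {prompt_idx}, position {idx})")
            break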
diff --git a/tests/models/nm_models_core/test_llm_logprobs.py b/tests/models/nm_models_core/test_llm_logprobs.py new file mode 100644 index 0000000000000..1852e9e18c725 --- /dev/null +++ b/tests/models/nm_models_core/test_llm_logprobs.py @@ -0,0 +1,53 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +Because of numerical precision and the fact that we are generating +over so many samples, we look + +Run `pytest tests/models/test_models_logprobs.py`. +""" +import pytest + +from tests.models.utils import check_logprobs_close + +MODEL_MAX_LEN = 1024 + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", + "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", + # Qwen2 (7B param variant) + "Qwen/Qwen2-7B-Instruct", +] + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner_nm, + hf_runner_nm, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + hf_model = hf_runner_nm(model, device_map="auto") + hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts, + max_tokens, num_logprobs) + + del hf_model + + vllm_model = vllm_runner_nm(model, + max_model_len=MODEL_MAX_LEN) + vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + del vllm_model + + # loop through the prompts + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) diff --git a/tests/models/test_compressed.py b/tests/models/nm_models_core/test_magic_wand.py similarity index 54% rename from tests/models/test_compressed.py rename to tests/models/nm_models_core/test_magic_wand.py index cac6addafe5eb..109f20d46d19e 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/nm_models_core/test_magic_wand.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( +def test_correctness( vllm_runner, example_prompts, model_format_pairs, @@ -55,3 +55,48 @@ def test_models( name_0="dense", name_1="sparse", ) + + +MODEL_FORMAT_EXTRABLOCKS = [ + ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 1500), + ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4", + "semi_structured_sparse_w16a16", 1500), +] + + +@pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_memory_consumption( + vllm_runner, + model_format_extrablocks, + dtype: str, +) -> None: + model_name, sparsity, num_extra_blocks = model_format_extrablocks + dense_model = vllm_runner(model_name=model_name, + enforce_eager=True, + sparsity=None, + dtype=dtype, + max_model_len=1024) + dense_gpu_alloc = ( + dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator) + dense_num_kv_blocks = dense_gpu_alloc.num_blocks + + del dense_model + + sparse_model = vllm_runner( + model_name=model_name, + enforce_eager=True, + sparsity=sparsity, + dtype=dtype, + max_model_len=1024, + ) + sparse_gpu_alloc = ( + sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator) + sparse_num_kv_blocks = sparse_gpu_alloc.num_blocks + + del sparse_model + + assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( + f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " + f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " + 
f"expected num_extra_blocks {num_extra_blocks}") \ No newline at end of file diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index a7abc011f57d7..df004ed21a31c 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -6,8 +6,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] aqlm_not_supported = (capability < diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 8116b796287a5..7476ea83d9dee 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -9,6 +9,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1", diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py deleted file mode 100644 index 5d6392f4a9e45..0000000000000 --- a/tests/models/test_compressed_memory.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Checks the memory usage of the sparse model is < memory usage of the -dense model by checking that the number of KV cache blocks is -bigger for the sparse model rather than the dense model. vLLM pre-allocates -the memory for the KV-cache after checking availability once the model -is loaded. This implies that using a compressed model should give more space -for the KV cache and thus more allocated blocks. - -Run `pytest tests/models/test_sparse_memory.py --forked`. 
-""" - -import gc - -import pytest -import torch - -MODEL_FORMAT_EXTRABLOCKS = [ - ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 1500), - ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4", - "semi_structured_sparse_w16a16", 1500), -] - - -@pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - vllm_runner, - model_format_extrablocks, - dtype: str, -) -> None: - model_name, sparsity, num_extra_blocks = model_format_extrablocks - dense_model = vllm_runner(model_name=model_name, - enforce_eager=True, - sparsity=None, - dtype=dtype, - max_model_len=1024) - dense_gpu_alloc = ( - dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator) - dense_num_kv_blocks = dense_gpu_alloc.num_blocks - - del dense_model - torch.cuda.empty_cache() - gc.collect() - - sparse_model = vllm_runner( - model_name=model_name, - enforce_eager=True, - sparsity=sparsity, - dtype=dtype, - max_model_len=1024, - ) - sparse_gpu_alloc = ( - sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator) - sparse_num_kv_blocks = sparse_gpu_alloc.num_blocks - - del sparse_model - torch.cuda.empty_cache() - gc.collect() - - assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( - f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " - f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " - f"expected num_extra_blocks {num_extra_blocks}") diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 59bf054913f7c..498a8085d871a 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -6,6 +6,12 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + MODELS = [ "intfloat/e5-mistral-7b-instruct", ] diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 0a5819ea3f054..da459f9a0812c 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -8,9 +8,14 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 561d4a1756587..28ef3949015aa 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -14,9 +14,14 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 3e6ffb7f90fcc..5fbb28afe00ac 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -12,8 
+12,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] marlin_not_supported = (capability < diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index f86cd3fa88f5d..413859d0eb6de 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -7,8 +7,13 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import VisionLanguageConfig +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + model_and_vl_config = [ ("llava-hf/llava-1.5-7b-hf", VisionLanguageConfig( diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index d3770fa69f6f1..a2b062d2cd53b 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -21,8 +21,13 @@ import torch from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] marlin_not_supported = (capability < diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 290d68501bc5c..593f0d1b92a31 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -4,8 +4,14 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + from .utils import check_logprobs_close +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 934749625d08c..0d69039e714a3 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -9,6 +9,12 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "gpt2", diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 04c172e0a7942..4810e181093a4 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -5,6 +5,11 @@ import pytest from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 50ab06631500b..6ae72e89b60bc 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,9 +1,15 @@ +import pytest import torch 
+from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, ModelRegistry, SamplingParams from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + class MyOPTForCausalLM(OPTForCausalLM): diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 547ab10051f1b..2a0ded20c7d30 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,7 +1,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.models import _MODELS, ModelRegistry +if should_skip_test_group(group_name="TEST_ALL_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + allow_module_level=True) + @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): diff --git a/tests/nm_utils/server.py b/tests/nm_utils/server.py index 1cff5a42176dc..89b9d8b8f5dd3 100644 --- a/tests/nm_utils/server.py +++ b/tests/nm_utils/server.py @@ -10,7 +10,7 @@ import requests import torch -from tests.nm_utils.logging import log_banner +# from tests.nm_utils.logging import log_banner MAX_SERVER_START_WAIT = 15 * 60 # time (seconds) to wait for server to start @@ -31,13 +31,13 @@ def __init__(self, *args, ] - if logger: - log_banner( - logger, - "server startup command", - shlex.join(self.startup_command), - logging.DEBUG, - ) + # if logger: + # log_banner( + # logger, + # "server startup command", + # shlex.join(self.startup_command), + # logging.DEBUG, + # ) self.proc = subprocess.Popen( [ @@ -95,8 +95,8 @@ def __init__(self, args: Dict[str, str], *, def __enter__(self): """Executes the server process and waits for it to become ready.""" ray.init(ignore_reinit_error=True) - log_banner(self._logger, "server startup command args", - shlex.join(self._args)) + # log_banner(self._logger, "server startup command args", + # shlex.join(self._args)) try: self.server_runner = ServerRunner.remote(self._args, From 977edffd193b99eb07674cdd137d98013f38fb5a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 18:14:44 +0000 Subject: [PATCH 096/154] fixed basic server correctness --- tests/conftest.py | 12 +- tests/models/compare_utils.py | 1 + .../nm_models_core/test_server_logprobs.py | 191 ++++++++++++++++++ 3 files changed, 198 insertions(+), 6 deletions(-) create mode 100644 tests/models/nm_models_core/test_server_logprobs.py diff --git a/tests/conftest.py b/tests/conftest.py index a9c30c6caff01..aba119273629a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -472,21 +472,21 @@ def _decode_token_by_position_index( def generate_greedy_logprobs_nm_use_tokens( self, - prompts: List[str], + input_ids_lst: List[torch.Tensor], max_tokens: int, topk_logprobs_count: int, - ignore_special_tokens: bool = False ) -> List[Tuple[List[int], str, List[Dict]]]: all_logprobs = [] all_output_tokens = [] all_output_strs = [] - for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + for input_ids in input_ids_lst: output = self.model.generate( - input_ids.cuda(), - use_cache=True, + input_ids, do_sample=False, + temperature=None, # Explicitly set to avoid warning + top_p=None, # Explicitly set to avoid warning + top_k=None, # Explicitly set to avoid warning max_new_tokens=max_tokens, output_hidden_states=True, return_dict_in_generate=True, diff --git 
a/tests/models/compare_utils.py b/tests/models/compare_utils.py index 051cbf1547b21..337428c6c6535 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -34,4 +34,5 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): # Break out since sequences will now diverge. # as long as we got this far with the output tokens being the # same, or close, the responses are close enough + print(f"INFO: BREAK IN CHECK LOGPROBS CLOSE AT IDX: {idx}\n\n") break diff --git a/tests/models/nm_models_core/test_server_logprobs.py b/tests/models/nm_models_core/test_server_logprobs.py new file mode 100644 index 0000000000000..1923469ba6b40 --- /dev/null +++ b/tests/models/nm_models_core/test_server_logprobs.py @@ -0,0 +1,191 @@ +import asyncio +from typing import Dict, List, Type + +import time +import gc +import os +import openai +import pytest +import torch +from datasets import load_dataset +from openai import AsyncOpenAI +from transformers import AutoTokenizer + +from tests.conftest import HfRunnerNM +from tests.models.compare_utils import check_logprobs_close +from tests.nm_utils.logging import make_logger +from tests.nm_utils.server import ServerContext + +# Silence warning. +os.environ["TOKENIZERS_PARALLELISM"] = "True" + +NUM_SAMPLES_TO_RUN = 20 +NUM_CHAT_TURNS = 3 # << Should be an odd number. +REQUEST_RATE = 2.5 +GPU_COUNT = torch.cuda.device_count() +device_capability = torch.cuda.get_device_capability() +DEVICE_CAPABILITY = device_capability[0] * 10 + device_capability[1] + +MODELS = [ + # Llama (8B param variant) + "meta-llama/Meta-Llama-3-8B-Instruct", +] + +@pytest.fixture(scope="session") +def client(): + client = openai.AsyncOpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) + yield client + + +@pytest.fixture +def hf_runner_nm() -> Type[HfRunnerNM]: + return HfRunnerNM + + +async def my_chat( + client, + model: str, + messages: List[Dict], + max_tokens: int, + num_logprobs: int, +): + """ submit a single prompt chat and collect results. """ + return await client.chat.completions.create(model=model, + messages=messages, + max_tokens=max_tokens, + temperature=0, + logprobs=True, + top_logprobs=num_logprobs) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +@pytest.mark.parametrize("tensor_parallel_size", [1]) +# note: repeating the test for 2 values of tensor_parallel_size +# increases the overall execution time by unnecessarily +# collecting the HuggingFace runner data twice. +# Consider refactoring to eliminate that repeat. +def test_models_on_server( + hf_runner_nm: HfRunnerNM, + client: AsyncOpenAI, + model: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, +) -> None: + """ + This test compares the output of the vllm OpenAI server against that of + a HuggingFace transformer. We expect them to be fairly close. "Close" + is measured by checking that the top N logprobs for each token includes + the token of the other inference tool. The first time that there is no + exact match, as long as there is a match to one of the top `num_logprobs` + logprobs, the test will not proceed further, but will pass. 
+
+    :param hf_runner_nm: fixture for the HfRunnerNM
+    :param client: fixture with an openai.AsyncOpenAI client
+    :param model: The HuggingFace id for a model to test with
+    :param max_tokens: the maximum number of tokens to generate
+    :param num_logprobs: the total number of logprobs checked for "close enough"
+    :param tensor_parallel_size: passed to the vllm Server launch
+    """
+    logger = make_logger("vllm_test")
+    
+    # Check that we have enough GPUs to run the test.
+    if tensor_parallel_size > 1 and GPU_COUNT < tensor_parallel_size:
+        pytest.skip(f"gpu count {GPU_COUNT} is insufficient for "
+                    f"tensor_parallel_size = {tensor_parallel_size}")
+
+    # Load dataset.
+    logger.info("Loading dataset and converting to chat format.")
+    ds = load_dataset("nm-testing/qa-chat-prompts", 
+                      split="train_sft").select(range(NUM_SAMPLES_TO_RUN))
+    messages_list = [row["messages"][:NUM_CHAT_TURNS] for row in ds]
+    tokenizer = AutoTokenizer.from_pretrained(model)
+
+    # Note: it's very important to tokenize here due to silliness
+    # around how the tokenizer works.
+    #
+    # The following examples are not equivalent:
+    #
+    # -----
+    # prompt = tokenizer.apply_chat_template(message)
+    # -----
+    # prompt = tokenizer.apply_chat_template( 
+    #   message, tokenize=False) << adds bos
+    # input_ids = tokenizer(prompt).input_ids << also adds bos
+    # -----
+    input_ids_lst = [
+        tokenizer.apply_chat_template(
+            messages, return_tensors="pt",
+            add_generation_prompt=True).to("cuda")
+        for messages in messages_list
+    ]
+    
+    logger.info("Generating chat responses from HF transformers.")
+    hf_model = hf_runner_nm(model)
+    hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens(
+        input_ids_lst, max_tokens, num_logprobs)
+    # Make sure all the memory is cleaned up.
+    del hf_model
+    torch.cuda.empty_cache()
+    gc.collect()
+    time.sleep(1.0)
+
+    logger.info("Generating chat responses from vLLM server.")
+    api_server_args = {
+        "--model": model,
+        "--max-model-len": 4096,
+        "--tensor-parallel-size": tensor_parallel_size,
+    }
+
+    # bfloat16 requires at least Ampere. Set to float16 otherwise.
+    if DEVICE_CAPABILITY < 80:
+        api_server_args["--dtype"] = "half"
+
+    # TODO: Update this to work like the benchmark script.
+    asyncio_event_loop = asyncio.get_event_loop()
+    with ServerContext(api_server_args, logger=logger) as _:
+        chats = []
+        for messages in messages_list:
+            chats.append(my_chat(client, model, messages, max_tokens, num_logprobs))
+        # Gather results.
+        results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats))
+
+    logger.info("Processing raw data from vLLM server.")
+    vllm_outputs = []
+
+    # See https://platform.openai.com/docs/api-reference/chat/create
+    for result in results:
+        req_output = result.choices[0]
+        output_str = req_output.message.content
+
+        # Unpack from req_output.logprobs.content
+        # logprobs.content < list of list of token data
+        # logprobs.content[i].token < sampled token
+        # logprobs.content[i].top_logprobs < top logprobs
+        # logprobs.content[i].top_logprobs[j].token
+        # logprobs.content[i].top_logprobs[j].logprob
+
+        output_tokens = []
+        output_logprobs = []
+        for token_data in req_output.logprobs.content:
+            # Actual sampled token.
+ output_tokens.append(token_data.token) + # Convert TopLogProb --> List[Dict[token, logprob]] + top_logprobs = {} + for top_logprob in token_data.top_logprobs: + top_logprobs[top_logprob.token] = top_logprob.logprob + output_logprobs.append(top_logprobs) + vllm_outputs.append((output_tokens, output_str, output_logprobs)) + + logger.info("Comparing results.") + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf_model", + name_1="vllm_model", + ) From 0266f28424e31a1d31ba5b2f0556432d7b007884 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 18:16:01 +0000 Subject: [PATCH 097/154] format --- tests/conftest.py | 6 ++-- .../nm_models_core/test_llm_logprobs.py | 4 +-- .../nm_models_core/test_server_logprobs.py | 28 ++++++++++--------- tests/nm_utils/server.py | 13 --------- 4 files changed, 20 insertions(+), 31 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index aba119273629a..5bba01dbed4fa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -484,9 +484,9 @@ def generate_greedy_logprobs_nm_use_tokens( output = self.model.generate( input_ids, do_sample=False, - temperature=None, # Explicitly set to avoid warning - top_p=None, # Explicitly set to avoid warning - top_k=None, # Explicitly set to avoid warning + temperature=None, # Explicitly set to avoid warning + top_p=None, # Explicitly set to avoid warning + top_k=None, # Explicitly set to avoid warning max_new_tokens=max_tokens, output_hidden_states=True, return_dict_in_generate=True, diff --git a/tests/models/nm_models_core/test_llm_logprobs.py b/tests/models/nm_models_core/test_llm_logprobs.py index 1852e9e18c725..bbf649961315e 100644 --- a/tests/models/nm_models_core/test_llm_logprobs.py +++ b/tests/models/nm_models_core/test_llm_logprobs.py @@ -19,6 +19,7 @@ "Qwen/Qwen2-7B-Instruct", ] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -36,8 +37,7 @@ def test_models( del hf_model - vllm_model = vllm_runner_nm(model, - max_model_len=MODEL_MAX_LEN) + vllm_model = vllm_runner_nm(model, max_model_len=MODEL_MAX_LEN) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/nm_models_core/test_server_logprobs.py b/tests/models/nm_models_core/test_server_logprobs.py index 1923469ba6b40..5c529f9d62f30 100644 --- a/tests/models/nm_models_core/test_server_logprobs.py +++ b/tests/models/nm_models_core/test_server_logprobs.py @@ -1,9 +1,9 @@ import asyncio -from typing import Dict, List, Type - -import time import gc import os +import time +from typing import Dict, List, Type + import openai import pytest import torch @@ -20,7 +20,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "True" NUM_SAMPLES_TO_RUN = 20 -NUM_CHAT_TURNS = 3 # << Should be an odd number. +NUM_CHAT_TURNS = 3 # << Should be an odd number. REQUEST_RATE = 2.5 GPU_COUNT = torch.cuda.device_count() device_capability = torch.cuda.get_device_capability() @@ -31,6 +31,7 @@ "meta-llama/Meta-Llama-3-8B-Instruct", ] + @pytest.fixture(scope="session") def client(): client = openai.AsyncOpenAI( @@ -93,15 +94,15 @@ def test_models_on_server( :param tensor_parallel_size: passed to the vllm Server launch """ logger = make_logger("vllm_test") - + # Check that we have enough GPUs to run the test. 
- if tensor_parallel_size > 1 and GPU_COUNT < tensor_parallel_size: + if tensor_parallel_size > 1 and tensor_parallel_size > GPU_COUNT: pytest.skip(f"gpu count {GPU_COUNT} is insufficient for " f"tensor_parallel_size = {tensor_parallel_size}") # Load dataset. logger.info("Loading dataset and converting to chat format.") - ds = load_dataset("nm-testing/qa-chat-prompts", + ds = load_dataset("nm-testing/qa-chat-prompts", split="train_sft").select(range(NUM_SAMPLES_TO_RUN)) messages_list = [row["messages"][:NUM_CHAT_TURNS] for row in ds] tokenizer = AutoTokenizer.from_pretrained(model) @@ -114,17 +115,17 @@ def test_models_on_server( # ----- # prompt = tokenizer.apply_chat_template(message) # ----- - # prompt = tokenizer.apply_chat_template( + # prompt = tokenizer.apply_chat_template( # message, tokenize=False) << adds bos # input_ids = tokenizer(prompt).input_ids << also adds bos # ----- input_ids_lst = [ - tokenizer.apply_chat_template( - messages, return_tensors="pt", - add_generation_prompt=True).to("cuda") + tokenizer.apply_chat_template(messages, + return_tensors="pt", + add_generation_prompt=True).to("cuda") for messages in messages_list ] - + logger.info("Generating chat responses from HF transformers.") hf_model = hf_runner_nm(model) hf_outputs = hf_model.generate_greedy_logprobs_nm_use_tokens( @@ -151,7 +152,8 @@ def test_models_on_server( with ServerContext(api_server_args, logger=logger) as _: chats = [] for messages in messages_list: - chats.append(my_chat(client, model, messages, max_tokens, num_logprobs)) + chats.append( + my_chat(client, model, messages, max_tokens, num_logprobs)) # Gather results. results = asyncio_event_loop.run_until_complete(asyncio.gather(*chats)) diff --git a/tests/nm_utils/server.py b/tests/nm_utils/server.py index 89b9d8b8f5dd3..989e9c053740a 100644 --- a/tests/nm_utils/server.py +++ b/tests/nm_utils/server.py @@ -1,6 +1,5 @@ import logging import os -import shlex import subprocess import sys import time @@ -10,8 +9,6 @@ import requests import torch -# from tests.nm_utils.logging import log_banner - MAX_SERVER_START_WAIT = 15 * 60 # time (seconds) to wait for server to start @@ -31,14 +28,6 @@ def __init__(self, *args, ] - # if logger: - # log_banner( - # logger, - # "server startup command", - # shlex.join(self.startup_command), - # logging.DEBUG, - # ) - self.proc = subprocess.Popen( [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", @@ -95,8 +84,6 @@ def __init__(self, args: Dict[str, str], *, def __enter__(self): """Executes the server process and waits for it to become ready.""" ray.init(ignore_reinit_error=True) - # log_banner(self._logger, "server startup command args", - # shlex.join(self._args)) try: self.server_runner = ServerRunner.remote(self._args, From 51dff179f200158cea12ae46d271e4e9e146ec39 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 19:51:22 +0000 Subject: [PATCH 098/154] tensorizer, cleanup comment --- tests/models/nm_models_core/test_server_logprobs.py | 4 ---- tests/models/test_aqlm.py | 2 +- tests/models/test_big_models.py | 2 +- tests/models/test_embedding.py | 2 +- tests/models/test_fp8.py | 2 +- tests/models/test_gptq_marlin.py | 2 +- tests/models/test_gptq_marlin_24.py | 2 +- tests/models/test_llava.py | 2 +- tests/models/test_marlin.py | 2 +- tests/models/test_mistral.py | 2 +- tests/models/test_models.py | 2 +- tests/models/test_models_logprobs.py | 2 +- tests/models/test_oot_registration.py | 2 +- tests/models/test_registry.py | 2 +- tests/tensorizer_loader/test_tensorizer.py | 5 +++++ 15 files 
changed, 18 insertions(+), 17 deletions(-) diff --git a/tests/models/nm_models_core/test_server_logprobs.py b/tests/models/nm_models_core/test_server_logprobs.py index 5c529f9d62f30..afb375351574e 100644 --- a/tests/models/nm_models_core/test_server_logprobs.py +++ b/tests/models/nm_models_core/test_server_logprobs.py @@ -66,10 +66,6 @@ async def my_chat( @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [3]) @pytest.mark.parametrize("tensor_parallel_size", [1]) -# note: repeating the test for 2 values of tensor_parallel_size -# increases the overall execution time by unnecessarily -# collecting the HuggingFace runner data twice. -# Consider refactoring to eliminate that repeat. def test_models_on_server( hf_runner_nm: HfRunnerNM, client: AsyncOpenAI, diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index df004ed21a31c..82844eb048818 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 7476ea83d9dee..1c9a5b20631cb 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 498a8085d871a..cb3416239ad8b 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -9,7 +9,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index da459f9a0812c..a677d0ba6c208 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 28ef3949015aa..4f3b97fbef463 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 5fbb28afe00ac..f762b87d93a72 100644 --- a/tests/models/test_gptq_marlin_24.py +++ 
b/tests/models/test_gptq_marlin_24.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 413859d0eb6de..eca8a85509a51 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -11,7 +11,7 @@ from vllm.config import VisionLanguageConfig if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) model_and_vl_config = [ diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index a2b062d2cd53b..2c0faedf5530e 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 593f0d1b92a31..521bc058836d2 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -9,7 +9,7 @@ from .utils import check_logprobs_close if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 0d69039e714a3..23df98bf6beb3 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 4810e181093a4..d9ec2a2d0d2f6 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -8,7 +8,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 6ae72e89b60bc..4a40eeb21edd6 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -7,7 +7,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 2a0ded20c7d30..48471092876a2 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -4,7 +4,7 @@ from 
vllm.model_executor.models import _MODELS, ModelRegistry if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping kernel group", + pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", allow_module_level=True) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b63fcf23af09a..a0a183eb8dbe1 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -21,6 +21,11 @@ # yapf conflicts with isort for this docstring +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_TENSORIZER"): + pytest.skip("TEST_TENSORIZER=0, skipping tensorizer group", + allow_module_level=True) prompts = [ "Hello, my name is", From 775f6d403f0ee7ab46adfb79ea41c5bdf62d09a6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 19:51:48 +0000 Subject: [PATCH 099/154] cleanup README --- tests/models/README.md | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/models/README.md diff --git a/tests/models/README.md b/tests/models/README.md deleted file mode 100644 index fae6c44501149..0000000000000 --- a/tests/models/README.md +++ /dev/null @@ -1,2 +0,0 @@ -* All the tests in `models/nm_models_core` run by default -* The tests in `models` require settng the `TEST_ALL_MODELS=1` enviornment variable \ No newline at end of file From 88e3a556d02544cea70e1d1510325baf1d3e92e7 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 19:54:30 +0000 Subject: [PATCH 100/154] newline nits --- tests/models/nm_models_core/test_magic_wand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/nm_models_core/test_magic_wand.py b/tests/models/nm_models_core/test_magic_wand.py index 109f20d46d19e..758a7d9873be9 100644 --- a/tests/models/nm_models_core/test_magic_wand.py +++ b/tests/models/nm_models_core/test_magic_wand.py @@ -99,4 +99,4 @@ def test_memory_consumption( assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " - f"expected num_extra_blocks {num_extra_blocks}") \ No newline at end of file + f"expected num_extra_blocks {num_extra_blocks}") From a1a659dde6799a90df7af75706926b65b5487c63 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:04:58 +0000 Subject: [PATCH 101/154] disabled more kernel tests that use triton --- tests/kernels/test_moe.py | 10 ++++++++++ tests/kernels/test_prefix_prefill.py | 4 ++++ tests/kernels/test_rand.py | 4 ++++ tests/kernels/test_sampler.py | 5 +++++ 4 files changed, 23 insertions(+) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2356b9ec18b0d..d08410a8bdf65 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -29,6 +29,11 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. 
Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -53,6 +58,11 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 2bf0ecc1fcb69..3534468355d9d 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -220,6 +220,10 @@ def test_contexted_kv_attention( assert torch.allclose(output_ref, output, atol=1e-6, rtol=0) +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a4242d22eb489..1e38253937ed5 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -7,6 +7,10 @@ from vllm.model_executor.utils import set_random_seed +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec5..9ba7b7e56017f 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -42,6 +42,11 @@ def test_uniform_to_exponential(): assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output)) +# UPSTREAM SYNC: breaks NM automation. +@pytest.mark.skip("C compiler not installed in NM automation. " + "This codepath follows a triton pathway, which " + "JITs using clang or gcc. Since neither are installed " + "in our test instances, we need to skip this for now.") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) From c50784ca5a832dcb02f1e4a8cd7b01180bbcb980 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:09:00 +0000 Subject: [PATCH 102/154] updated cutlass skipping. 
We need cuda 12.4 in automation --- tests/kernels/test_cutlass.py | 36 +++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 2cf0e86e5ca44..04897029b93f8 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -87,8 +87,13 @@ def cutlass_int8_gemm_helper(m: int, @pytest.mark.parametrize("k", [128, 496, 1024]) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) -@pytest.mark.skipif(capability < 89, - reason="FP8 is not supported on this GPU type.") +# UPSTREAM SYNC: This is currently 90, because we need CUDA 12.4 +# to use the cutlass fp8 kernels + we do not have this in our +# automation system yet. +@pytest.mark.skipif(capability < 90, + reason="FP8 cutlass is not supported on this GPU " + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, per_out_ch: bool): cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch) @@ -116,8 +121,13 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) -@pytest.mark.skipif(capability < 89, - reason="FP8 is not supported on this GPU type.") +# UPSTREAM SYNC: This is currently 90, because we need CUDA 12.4 +# to use the cutlass fp8 kernels + we do not have this in our +# automation system yet. +@pytest.mark.skipif(capability < 90, + reason="FP8 cutlass is not supported on this GPU " + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, out_dtype: Type[torch.dtype]): cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, @@ -127,8 +137,13 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(capability < 89, - reason="FP8 is not supported on this GPU type.") +# UPSTREAM SYNC: This is currently 90, because we need CUDA 12.4 +# to use the cutlass fp8 kernels + we do not have this in our +# automation system yet. +@pytest.mark.skipif(capability < 90, + reason="FP8 cutlass is not supported on this GPU " + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool, device: str): cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, @@ -151,8 +166,13 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool, # kernel must handle any M thrown at it. @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) -@pytest.mark.skipif(capability < 89, - reason="FP8 is not supported on this GPU type.") +# UPSTREAM SYNC: This is currently 90, because we need CUDA 12.4 +# to use the cutlass fp8 kernels + we do not have this in our +# automation system yet. 
+@pytest.mark.skipif(capability < 90, + reason="FP8 cutlass is not supported on this GPU " + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool): for nk in range(32, 128, 32): for m in range(1, 128): From 99fa9f857308e520d168a22c08e923df23898c0d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:10:26 +0000 Subject: [PATCH 103/154] trigger kernel tests in automation --- neuralmagic/tests/skip-for-remote-push-tmp.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/neuralmagic/tests/skip-for-remote-push-tmp.txt b/neuralmagic/tests/skip-for-remote-push-tmp.txt index dd39bb8e64d19..860b4d025d196 100644 --- a/neuralmagic/tests/skip-for-remote-push-tmp.txt +++ b/neuralmagic/tests/skip-for-remote-push-tmp.txt @@ -1,16 +1,6 @@ tests/test_sharded_state_loader.py tests/test_sequence.py tests/metrics/test_metrics.py -tests/kernels/test_prefix_prefill.py -tests/kernels/test_pos_encoding.py -tests/kernels/test_activation.py -tests/kernels/test_moe.py -tests/kernels/test_layernorm.py -tests/kernels/test_attention.py -tests/kernels/test_rand.py -tests/kernels/test_cache.py -tests/kernels/test_sampler.py -tests/kernels/test_cutlass.py tests/core/test_block_manager.py tests/core/test_chunked_prefill_scheduler.py tests/core/test_scheduler.py From cdc9f49d1d97535e24f66c3b97afec23450f00b1 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:18:13 +0000 Subject: [PATCH 104/154] clean up magic_wand test so that we only load the model once --- .../models/nm_models_core/test_magic_wand.py | 68 ++++++------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/tests/models/nm_models_core/test_magic_wand.py b/tests/models/nm_models_core/test_magic_wand.py index 758a7d9873be9..d7aef3f8e5bc4 100644 --- a/tests/models/nm_models_core/test_magic_wand.py +++ b/tests/models/nm_models_core/test_magic_wand.py @@ -11,52 +11,6 @@ from tests.models.utils import check_logprobs_close MAX_MODEL_LEN = 1024 -MODEL_FORMAT_PAIRS = [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", - "semi_structured_sparse_w16a16"), - ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16"), -] - - -@pytest.mark.parametrize("model_format_pairs", MODEL_FORMAT_PAIRS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_correctness( - vllm_runner, - example_prompts, - model_format_pairs, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - model_name, sparsity = model_format_pairs - - sparse_model = vllm_runner(model_name=model_name, - sparsity=sparsity, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - sparse_outputs = sparse_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del sparse_model - - dense_model = vllm_runner(model_name=model_name, - sparsity=None, - dtype=dtype, - max_model_len=MAX_MODEL_LEN) - dense_outputs = dense_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del dense_model - - # loop through the prompts - check_logprobs_close( - outputs_0_lst=dense_outputs, - outputs_1_lst=sparse_outputs, - name_0="dense", - name_1="sparse", - ) - - MODEL_FORMAT_EXTRABLOCKS = [ ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 1500), ("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4", @@ -66,10 +20,15 @@ def test_correctness( @pytest.mark.parametrize("model_format_extrablocks", 
MODEL_FORMAT_EXTRABLOCKS) @pytest.mark.parametrize("dtype", ["half"]) -def test_memory_consumption( +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_magic_wand( vllm_runner, + example_prompts, model_format_extrablocks, dtype: str, + max_tokens: int, + num_logprobs: int, ) -> None: model_name, sparsity, num_extra_blocks = model_format_extrablocks dense_model = vllm_runner(model_name=model_name, @@ -80,7 +39,8 @@ def test_memory_consumption( dense_gpu_alloc = ( dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator) dense_num_kv_blocks = dense_gpu_alloc.num_blocks - + dense_outputs = dense_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del dense_model sparse_model = vllm_runner( @@ -93,10 +53,20 @@ def test_memory_consumption( sparse_gpu_alloc = ( sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator) sparse_num_kv_blocks = sparse_gpu_alloc.num_blocks - + sparse_outputs = sparse_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) del sparse_model + # Confirm the memory is saved. assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, ( f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " f"expected num_extra_blocks {num_extra_blocks}") + + # Confirm the generations are similar. + check_logprobs_close( + outputs_0_lst=dense_outputs, + outputs_1_lst=sparse_outputs, + name_0="dense", + name_1="sparse", + ) From b08194a7bc2be1552bab9d432087cbc159ed0103 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:23:10 +0000 Subject: [PATCH 105/154] format --- tests/nm_utils/utils_skip.py | 14 +++++++++++++- tests/samplers/test_beam_search.py | 6 ++++++ tests/samplers/test_ignore_eos.py | 5 +++++ tests/samplers/test_logits_processor.py | 5 +++++ tests/samplers/test_logprobs.py | 5 +++++ tests/samplers/test_ranks.py | 5 +++++ tests/samplers/test_rejection_sampler.py | 5 +++++ tests/samplers/test_sampler.py | 5 +++++ tests/samplers/test_seeded_generate.py | 5 +++++ tests/tensorizer_loader/test_tensorizer.py | 2 +- 10 files changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index 458316f560e4d..89bff73365fb1 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -37,13 +37,25 @@ def should_skip_tensorizer_test_group(): return TEST_TENSORIZER != "1" +def should_skip_sampler_test_group(): + TEST_SAMPLER = os.getenv("TEST_SAMPLER", "0") + return TEST_SAMPLER != "1" + + +def should_skip_entrypoints_group(): + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "0") + return TEST_ENTRYPOINTS != "1" + + MAP = { "TEST_KERNELS": should_skip_kernel_test_group, "TEST_LORA": should_skip_lora_test_group, "TEST_SPEC_DECODE": should_skip_spec_decode_test_group, "TEST_ALL_MODELS": should_skip_models_test_group, "TEST_LM_EVAL": should_skip_lm_eval_test_group, - "TEST_TENSORIZER": should_skip_tensorizer_test_group + "TEST_TENSORIZER": should_skip_tensorizer_test_group, + "TEST_SAMPLER": should_skip_sampler_test_group, + "TEST_ENTRYPOINTS": should_skip_entrypoints_group, } diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2682f284505bd..2b61e6f80a7d1 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -7,6 +7,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if 
should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 864657a3c2b28..6c8ff3540f783 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -5,8 +5,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 0ccbabfff6403..659b36fc52b15 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -1,8 +1,13 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 40d054cd472b8..c42511e57e9d0 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,10 +1,15 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from ..conftest import VllmRunner +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 5e93238d709ec..cc6edb6274c9d 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,7 +1,12 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 273df509568d6..3446bc1109b99 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -5,9 +5,14 @@ import torch import torch.nn.functional as F +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index f4a5eb621b573..0e9bfbaeed85b 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -8,12 +8,17 @@ import torch from transformers import GenerationConfig, GenerationMixin +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from 
vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import Counter, is_pin_memory_available +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + class MockLogitsSampler(Sampler): diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index fef5ff3fb9e8e..1961c6e03c28b 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -8,9 +8,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.model_executor.utils import set_random_seed +if should_skip_test_group(group_name="TEST_SAMPLER"): + pytest.skip("TEST_SAMPLER=0, skipping sampler group", + allow_module_level=True) + MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index a0a183eb8dbe1..b9f6404f2b387 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -9,6 +9,7 @@ import ray import torch +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner from vllm import SamplingParams # yapf: disable @@ -21,7 +22,6 @@ # yapf conflicts with isort for this docstring -from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_TENSORIZER"): pytest.skip("TEST_TENSORIZER=0, skipping tensorizer group", From ccda2e72319c3e9a93085dfbb7049dc0c0bc9832 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:45:45 +0000 Subject: [PATCH 106/154] format --- tests/accuracy/test_lm_eval_correctness.py | 4 +- tests/async_engine/api_server_async_engine.py | 6 + tests/async_engine/test_api_server.py | 6 + tests/async_engine/test_async_llm_engine.py | 5 + tests/async_engine/test_chat_template.py | 5 + tests/async_engine/test_openapi_server_ray.py | 5 + tests/async_engine/test_request_tracker.py | 5 + .../test_basic_correctness.py | 6 + .../basic_correctness/test_chunked_prefill.py | 7 ++ tests/basic_correctness/test_preemption.py | 6 + tests/entrypoints/openai/test_serving_chat.py | 5 + tests/entrypoints/test_guided_processors.py | 5 + tests/entrypoints/test_llm_encode.py | 5 + tests/entrypoints/test_llm_generate.py | 5 + tests/entrypoints/test_openai_run_batch.py | 7 ++ tests/entrypoints/test_openai_server.py | 5 + .../test_server_oot_registration.py | 5 + .../test_llm_logprobs.py | 0 .../test_magic_wand.py | 0 .../test_server_logprobs.py | 0 tests/nm_utils/utils_skip.py | 119 ++++++++++++++---- 21 files changed, 185 insertions(+), 26 deletions(-) rename tests/{models/nm_models_core => models_core}/test_llm_logprobs.py (100%) rename tests/{models/nm_models_core => models_core}/test_magic_wand.py (100%) rename tests/{models/nm_models_core => models_core}/test_server_logprobs.py (100%) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 0b31a48e7e78e..4539011916051 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -10,8 +10,8 @@ from tests.nm_utils.server import ServerContext from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_LM_EVAL"): - pytest.skip("TEST_LM_EVAL is set to 0, skipping group", +if should_skip_test_group(group_name="TEST_ACCURACY"): + pytest.skip("TEST_ACCURACY=0, 
skipping accuracy test group", allow_module_level=True) if TYPE_CHECKING: diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8d868..ff97b971da3d4 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -2,13 +2,19 @@ import argparse from typing import Any, Dict +import pytest import uvicorn from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + app = vllm.entrypoints.api_server.app diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..3a194f5f218ba 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -7,6 +7,12 @@ import pytest import requests +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + def _query_server(prompt: str, max_tokens: int = 5) -> dict: response = requests.post("http://localhost:8000/generate", diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 10a46422887e3..eeda3041ffb2c 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import AsyncLLMEngine +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + @dataclass class RequestOutput: diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 55b730812ea94..7b8db5a43e6d0 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -4,10 +4,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( __file__))).parent.parent / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index c4434301201ce..15bcb85900166 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,8 +4,13 @@ # and debugging. 
import ray +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 7b1f4a9e1eb2f..9d8aa37d83652 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.async_llm_engine import RequestTracker from vllm.outputs import RequestOutput +if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): + pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + allow_module_level=True) + @pytest.mark.asyncio async def test_request_tracker(): diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 7d8117447ca0a..74c03455a9ec1 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -7,8 +7,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 8d7e88d151369..afa9a6a89dcce 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -8,6 +8,13 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 29a4c39cd25a1..5ef2d5ddd9b37 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -8,10 +8,16 @@ import pytest from prometheus_client import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) +if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): + pytest.skip( + "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index c45f02fe564a3..1033f02bea771 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "openai-community/gpt2" CHAT_TEMPLATE = "Dummy chat template for 
testing {}" diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5d4163e96fd87..e4e596898c940 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -4,12 +4,17 @@ import torch from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import CompletionRequest from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( JSONLogitsProcessor, RegexLogitsProcessor) +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + TEST_SCHEMA = { "type": "object", "properties": { diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/test_llm_encode.py index 7c3fbe43a8384..c142b242bba1a 100644 --- a/tests/entrypoints/test_llm_encode.py +++ b/tests/entrypoints/test_llm_encode.py @@ -3,8 +3,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, EmbeddingRequestOutput, PoolingParams +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + from ..conftest import cleanup MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index a00fff91a310e..1cea00a96e950 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, RequestOutput, SamplingParams from ..conftest import cleanup +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + MODEL_NAME = "facebook/opt-125m" PROMPTS = [ diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/test_openai_run_batch.py index 5de28513ca391..2068d5e878623 100644 --- a/tests/entrypoints/test_openai_run_batch.py +++ b/tests/entrypoints/test_openai_run_batch.py @@ -2,8 +2,15 @@ import sys import tempfile +import pytest + +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.openai.protocol import BatchRequestOutput +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + # ruff: noqa: E501 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c4c1f8fe3afec..e554bd10ef3a2 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -13,9 +13,14 @@ from huggingface_hub import snapshot_download from openai import 
BadRequestError +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import ServerRunner from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 3e55d7f4297fb..1d8e69b4b3aec 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -5,11 +5,16 @@ import torch from openai import OpenAI, OpenAIError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): + pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + allow_module_level=True) + pytestmark = pytest.mark.openai diff --git a/tests/models/nm_models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py similarity index 100% rename from tests/models/nm_models_core/test_llm_logprobs.py rename to tests/models_core/test_llm_logprobs.py diff --git a/tests/models/nm_models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py similarity index 100% rename from tests/models/nm_models_core/test_magic_wand.py rename to tests/models_core/test_magic_wand.py diff --git a/tests/models/nm_models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py similarity index 100% rename from tests/models/nm_models_core/test_server_logprobs.py rename to tests/models_core/test_server_logprobs.py diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index 89bff73365fb1..658783a5aa519 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -7,7 +7,42 @@ import os -def should_skip_kernel_test_group(): +def should_skip_accuracy_test_group(): + TEST_ACCURACY = os.getenv("TEST_ACCURACY", "0") + return TEST_ACCURACY != "1" + + +def should_skip_async_engine_test_group(): + TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "0") + return TEST_ASYNC_ENGINE != "1" + + +def should_skip_basic_correctness_test_group(): + TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "0") + return TEST_BASIC_CORRECTNESS != "1" + + +def should_skip_core_test_group(): + TEST_CORE = os.getenv("TEST_CORE", "0") + return TEST_CORE != "1" + + +def should_skip_distributed_test_group(): + TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "0") + return TEST_DISTRIBUTED != "1" + + +def should_skip_engine_test_group(): + TEST_ENGINE = os.getenv("TEST_ENGINE", "0") + return TEST_ENGINE != "1" + + +def should_skip_entrypoints_test_group(): + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "0") + return TEST_ENTRYPOINTS != "1" + + +def should_skip_kernels_test_groups(): TEST_KERNELS = os.getenv("TEST_KERNELS", "0") return TEST_KERNELS != "1" @@ -17,45 +52,81 @@ def should_skip_lora_test_group(): return TEST_LORA != "1" -def should_skip_spec_decode_test_group(): - TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "0") - return TEST_SPEC_DECODE != "1" +def should_skip_metrics_test_group(): + TEST_METRICS = os.getenv("TEST_METRICS", "0") + return 
TEST_METRICS != "1"
+
+
+def should_skip_model_executor_test_group():
+    TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "0")
+    return TEST_MODEL_EXECUTOR != "1"
 
 
 def should_skip_models_test_group():
-    TEST_ALL_MODELS = os.getenv("TEST_ALL_MODELS", "0")
-    return TEST_ALL_MODELS != "1"
+    TEST_MODELS = os.getenv("TEST_MODELS", "0")
+    return TEST_MODELS != "1"
 
 
-def should_skip_lm_eval_test_group():
-    TEST_LM_EVAL = os.getenv("TEST_LM_EVAL", "0")
-    return TEST_LM_EVAL != "1"
+def should_skip_models_core_test_group():
+    TEST_MODELS_CORE = os.getenv("TEST_MODELS_CORE", "0")
+    return TEST_MODELS_CORE != "1"
 
 
-def should_skip_tensorizer_test_group():
-    TEST_TENSORIZER = os.getenv("TEST_TENSORIZER", "0")
-    return TEST_TENSORIZER != "1"
+def should_skip_prefix_caching_test_group():
+    TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "0")
+    return TEST_PREFIX_CACHING != "1"
 
 
-def should_skip_sampler_test_group():
-    TEST_SAMPLER = os.getenv("TEST_SAMPLER", "0")
-    return TEST_SAMPLER != "1"
+def should_skip_quantization_test_group():
+    TEST_QUANTIZATION = os.getenv("TEST_QUANTIZATION", "0")
+    return TEST_QUANTIZATION != "1"
 
 
-def should_skip_entrypoints_group():
-    TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "0")
-    return TEST_ENTRYPOINTS != "1"
+def should_skip_samplers_test_group():
+    TEST_SAMPLERS = os.getenv("TEST_SAMPLERS", "0")
+    return TEST_SAMPLERS != "1"
+
+
+def should_skip_spec_decode_test_group():
+    TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "0")
+    return TEST_SPEC_DECODE != "1"
+
+
+def should_skip_tensorizer_loader_test_group():
+    TEST_TENSORIZER_LOADER = os.getenv("TEST_TENSORIZER_LOADER", "0")
+    return TEST_TENSORIZER_LOADER != "1"
+
+
+def should_skip_tokenization_test_group():
+    TEST_TOKENIZATION = os.getenv("TEST_TOKENIZATION", "0")
+    return TEST_TOKENIZATION != "1"
+
+
+def should_skip_worker_test_group():
+    TEST_WORKER = os.getenv("TEST_WORKER", "0")
+    return TEST_WORKER != "1"
 
 
 MAP = {
-    "TEST_KERNELS": should_skip_kernel_test_group,
+    "TEST_ACCURACY": should_skip_accuracy_test_group,
+    "TEST_ASYNC_ENGINE": should_skip_async_engine_test_group,
+    "TEST_BASIC_CORRECTNESS": should_skip_basic_correctness_test_group,
+    "TEST_CORE": should_skip_core_test_group,
+    "TEST_DISTRIBUTED": should_skip_distributed_test_group,
+    "TEST_ENGINE": should_skip_engine_test_group,
+    "TEST_ENTRYPOINTS": should_skip_entrypoints_test_group,
+    "TEST_KERNELS": should_skip_kernels_test_groups,
     "TEST_LORA": should_skip_lora_test_group,
+    "TEST_METRICS": should_skip_metrics_test_group,
+    "TEST_MODELS": should_skip_models_test_group,
+    "TEST_MODELS_CORE": should_skip_models_core_test_group,
+    "TEST_PREFIX_CACHING": should_skip_prefix_caching_test_group,
+    "TEST_QUANTIZATION": should_skip_quantization_test_group,
+    "TEST_SAMPLERS": should_skip_samplers_test_group,
     "TEST_SPEC_DECODE": should_skip_spec_decode_test_group,
-    "TEST_ALL_MODELS": should_skip_models_test_group,
-    "TEST_LM_EVAL": should_skip_lm_eval_test_group,
-    "TEST_TENSORIZER": should_skip_tensorizer_test_group,
-    "TEST_SAMPLER": should_skip_sampler_test_group,
-    "TEST_ENTRYPOINTS": should_skip_entrypoints_group,
+    "TEST_TENSORIZER_LOADER": should_skip_tensorizer_loader_test_group,
+    "TEST_TOKENIZATION": should_skip_tokenization_test_group,
+    "TEST_WORKER": should_skip_worker_test_group,
 }
 

From 51a76857152b558e6d0b105a2363851332667a47 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sun, 9 Jun 2024 20:57:56 +0000
Subject: [PATCH 107/154] core, correctness

---
 tests/async_engine/api_server_async_engine.py | 6 ------
tests/async_engine/test_api_server.py | 2 +- tests/async_engine/test_async_llm_engine.py | 2 +- tests/async_engine/test_chat_template.py | 2 +- tests/async_engine/test_openapi_server_ray.py | 2 +- tests/async_engine/test_request_tracker.py | 2 +- tests/core/block/e2e/test_correctness.py | 5 +++++ tests/core/block/e2e/test_correctness_sliding_window.py | 5 +++++ tests/core/block/test_block_manager_v2.py | 5 +++++ tests/core/block/test_block_table.py | 5 +++++ tests/core/block/test_common.py | 5 +++++ tests/core/block/test_cpu_gpu_block_allocator.py | 5 +++++ tests/core/block/test_naive_block.py | 5 +++++ tests/core/block/test_prefix_caching_block.py | 5 +++++ tests/core/test_block_manager.py | 5 +++++ tests/core/test_chunked_prefill_scheduler.py | 5 +++++ tests/core/test_scheduler.py | 5 +++++ tests/nm_utils/utils_skip.py | 3 ++- 18 files changed, 62 insertions(+), 12 deletions(-) diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index ff97b971da3d4..1be76fdc8d868 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -2,19 +2,13 @@ import argparse from typing import Any, Dict -import pytest import uvicorn from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server -from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", - allow_module_level=True) - app = vllm.entrypoints.api_server.app diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 3a194f5f218ba..e08956836fbbc 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -10,7 +10,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", allow_module_level=True) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index eeda3041ffb2c..efa0e62a0e218 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -7,7 +7,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", allow_module_level=True) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 7b8db5a43e6d0..0eaba87444bfb 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", allow_module_level=True) chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 15bcb85900166..3de088f798da3 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ 
b/tests/async_engine/test_openapi_server_ray.py @@ -8,7 +8,7 @@ from tests.utils import ServerRunner if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", allow_module_level=True) # any model with a chat template should work here diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 9d8aa37d83652..010962f4c293e 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -5,7 +5,7 @@ from vllm.outputs import RequestOutput if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", allow_module_level=True) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 3713ef2fed4d1..469f8ec90bcd1 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -2,10 +2,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from .conftest import get_token_ids_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index d77d6a1dbb741..0f17d8a9e7b3d 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,10 +3,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + # relatively small model with 4k sliding window MODEL = "bigcode/starcoder2-3b" BLOCK_SIZE = 16 diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index f98fc0e217278..11a74d902676a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,5 +1,6 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager_v2 import BlockSpaceManagerV2 @@ -9,6 +10,10 @@ from ..utils import create_seq_group, create_seq_group_encoder_decoder +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdfab81..a68fafabda16f 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, cdiv, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + 
pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("sequence_len", [1, 16, 129]) diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index cfdd3582ed2ef..71dfffbe2e350 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.common import RefCounter +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("seed", list(range(20))) @pytest.mark.parametrize("num_incrs", [1, 100]) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index 44a5be6c181a0..000d4fa4eab33 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.utils import Device, chunk_list +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + @pytest.mark.parametrize("num_cpu_blocks", [0, 512]) @pytest.mark.parametrize("num_gpu_blocks", [1024]) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index edcdc0c7d4f98..821c8f67c1eff 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -2,9 +2,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + class TestNaiveBlockAllocator: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda09f46..5b8425fe32ab4 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -5,10 +5,15 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.prefix_caching_block import (PrefixCachingBlock, PrefixCachingBlockAllocator) +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + class TestPrefixCachingBlock: diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index cd306b9e4d3cc..ee96b88286f2c 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,6 +4,7 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams from vllm.block import PhysicalTokenBlock from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, @@ -16,6 +17,10 @@ from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + def test_block_allocator_allocate(): block_size = 4 diff --git a/tests/core/test_chunked_prefill_scheduler.py 
b/tests/core/test_chunked_prefill_scheduler.py index 3649e6b003a5d..e01efaa1f376d 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -3,6 +3,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler @@ -10,6 +11,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e1847..aa47bc89b7b90 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -5,6 +5,7 @@ import pytest # noqa +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.policy import PolicyFactory @@ -14,6 +15,10 @@ from .utils import create_dummy_prompt +if should_skip_test_group(group_name="TEST_CORE"): + pytest.skip("TEST_CORE=0, skipping core test group", + allow_module_level=True) + def get_sequence_groups(scheduler_output): return [s.seq_group for s in scheduler_output.scheduled_seq_groups] diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index 658783a5aa519..3953d35839765 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -14,6 +14,7 @@ def should_skip_accuracy_test_group(): def should_skip_async_engine_test_group(): TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "0") + print(TEST_ASYNC_ENGINE) return TEST_ASYNC_ENGINE != "1" @@ -109,7 +110,7 @@ def should_skip_worker_test_group(): MAP = { "TEST_ACCURACY": should_skip_accuracy_test_group, - "TEST_ASYNC_ENGINE": should_skip_accuracy_test_group, + "TEST_ASYNC_ENGINE": should_skip_async_engine_test_group, "TEST_BASIC_CORRECTNESS": should_skip_basic_correctness_test_group, "TEST_CORE": should_skip_core_test_group, "TEST_DISTRIBUTED": should_skip_distributed_test_group, From c42b18f2e8fc1208840f8118990b52f367bc8df0 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 20:59:11 +0000 Subject: [PATCH 108/154] distributed --- tests/distributed/test_basic_distributed_correctness.py | 6 ++++++ tests/distributed/test_chunked_prefill_distributed.py | 6 ++++++ tests/distributed/test_comm_ops.py | 5 +++++ tests/distributed/test_custom_all_reduce.py | 5 +++++ tests/distributed/test_pynccl.py | 5 +++++ 5 files changed, 27 insertions(+) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 5178bc5dae566..7ff61e4520583 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -21,6 +21,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index a15d0f8766556..bf4921c1b6a14 100644 --- 
a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -20,6 +20,12 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + allow_module_level=True) + MODELS = [ "meta-llama/Llama-2-7b-hf", ] diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 2b597bb63c031..b285c0853cf14 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,17 @@ import ray import torch +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed import (broadcast_tensor_dict, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + allow_module_level=True) + @ray.remote(num_gpus=1, max_calls=1) def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index e4bfda8425344..bf40654efe8e9 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist +from tests.nm_utils.utils_skip import should_skip_test_group from tests.utils import (init_test_distributed_environment, multi_process_tensor_parallel) from vllm.distributed.communication_op import ( # noqa @@ -13,6 +14,10 @@ from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, get_tp_ca_communicator) +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + allow_module_level=True) + random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] for i, v in enumerate(test_sizes): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 0218295a3e3f9..41f20fcb98c65 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -5,6 +5,7 @@ import torch import torch.distributed +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.distributed.communication_op import ( # noqa graph_capture, tensor_model_parallel_all_reduce) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator @@ -13,6 +14,10 @@ init_distributed_environment) from vllm.utils import update_environment_variables +if should_skip_test_group(group_name="TEST_DISTRIBUTED"): + pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + allow_module_level=True) + def distributed_run(fn, world_size): number_of_processes = world_size From 765aff07938a39483f5f6fac4825f3f3da182ad5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:01:04 +0000 Subject: [PATCH 109/154] format --- tests/engine/output_processor/test_multi_step.py | 5 +++++ tests/engine/output_processor/test_stop_checker.py | 5 +++++ tests/engine/test_computed_prefix_blocks.py | 5 +++++ tests/engine/test_detokenization.py | 5 +++++ tests/engine/test_multiproc_workers.py | 5 +++++ tests/engine/test_skip_tokenizer_init.py | 5 +++++ tests/engine/test_stop_reason.py | 5 +++++ tests/engine/test_stop_strings.py | 5 +++++ 8 files changed, 40 insertions(+) diff 
--git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 4f32a622546f0..2b419842f40d2 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -4,6 +4,7 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker @@ -15,6 +16,10 @@ from ...core.utils import create_seq_group +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [1, 12]) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index f795403e3d8ad..7837fd2cec84b 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -3,10 +3,15 @@ import pytest from transformers import PreTrainedTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + def sequence_with_eos(text: str, eos_token: str, eos_token_id: int) -> Sequence: diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc3f11..bbcc07bfb54f5 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,9 +1,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index f77f6d0725b6b..8b55d711b0975 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_computed_prefix_blocks(model: str): diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 610ad9732fb91..a103a5fee4477 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -6,9 +6,14 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) +if should_skip_test_group(group_name="TEST_ENGINE"): + 
pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + class DummyWorker: """Dummy version of vllm.worker.worker.Worker""" diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 338b208723ba9..418c9e9566a06 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -1,8 +1,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index 7b886507c04f2..2420111f6329e 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -9,8 +9,13 @@ import pytest import transformers +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + MODEL = "facebook/opt-350m" STOP_STR = "." SEED = 42 diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 6b747beb4b543..c6456a4c5090c 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -2,8 +2,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import CompletionOutput, LLMEngine, SamplingParams +if should_skip_test_group(group_name="TEST_ENGINE"): + pytest.skip("TEST_ENGINE=0, skipping distributed test group", + allow_module_level=True) + MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 From e18bd8a72336da0e91d4b4bd31d0f580c915ec65 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:14:23 +0000 Subject: [PATCH 110/154] format --- tests/metrics/test_metrics.py | 5 +++++ tests/model_executor/weight_utils.py | 5 +++++ tests/models/test_aqlm.py | 4 ++-- tests/models/test_big_models.py | 4 ++-- tests/models/test_embedding.py | 4 ++-- tests/models/test_fp8.py | 4 ++-- tests/models/test_gptq_marlin.py | 4 ++-- tests/models/test_gptq_marlin_24.py | 4 ++-- tests/models/test_llava.py | 4 ++-- tests/models/test_marlin.py | 4 ++-- tests/models/test_mistral.py | 4 ++-- tests/models/test_models.py | 4 ++-- tests/models/test_models_logprobs.py | 4 ++-- tests/models/test_oot_registration.py | 4 ++-- tests/models/test_registry.py | 4 ++-- tests/models_core/test_llm_logprobs.py | 8 ++++++-- tests/models_core/test_magic_wand.py | 5 +++++ tests/models_core/test_server_logprobs.py | 5 +++++ tests/prefix_caching/test_disable_sliding_window.py | 5 +++++ tests/prefix_caching/test_prefix_caching.py | 5 +++++ tests/quantization/test_compressed_tensors.py | 6 ++++++ tests/quantization/test_configs.py | 5 +++++ tests/quantization/test_fp8.py | 5 +++++ 23 files changed, 78 insertions(+), 28 deletions(-) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e0aa14f165c2d..2b04c94690040 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -3,11 +3,16 @@ import pytest from prometheus_client import REGISTRY +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import EngineArgs, LLMEngine from vllm.engine.arg_utils import 
AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +if should_skip_test_group(group_name="TEST_METRICS"): + pytest.skip("TEST_METRICS=0, skipping metrics test group", + allow_module_level=True) + MODELS = [ "facebook/opt-125m", ] diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index c8b9bed691bba..1958186b2e111 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -5,9 +5,14 @@ import pytest from huggingface_hub.utils import LocalEntryNotFoundError +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, enable_hf_transfer) +if should_skip_test_group(group_name="TEST_MODEL_EXECUTOR"): + pytest.skip("TEST_MODEL_EXECUTOR=0, skipping model executor test group", + allow_module_level=True) + def test_hf_transfer_auto_activation(): if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 82844eb048818..b322fbf15b561 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -9,8 +9,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 1c9a5b20631cb..31782f4ff432d 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -11,8 +11,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index cb3416239ad8b..8db5881ea6887 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -8,8 +8,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index a677d0ba6c208..f83d9cfb7adc1 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -12,8 +12,8 @@ from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 4f3b97fbef463..870be32f5442b 100644 --- a/tests/models/test_gptq_marlin.py +++ 
b/tests/models/test_gptq_marlin.py @@ -18,8 +18,8 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index f762b87d93a72..b9c6650492861 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -15,8 +15,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index eca8a85509a51..1669bed43dd24 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -10,8 +10,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import VisionLanguageConfig -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) model_and_vl_config = [ diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 2c0faedf5530e..6475cdd97cea0 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -24,8 +24,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 521bc058836d2..57cd9720519ce 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -8,8 +8,8 @@ from .utils import check_logprobs_close -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 23df98bf6beb3..330ab094b8406 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -11,8 +11,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models_logprobs.py 
b/tests/models/test_models_logprobs.py index d9ec2a2d0d2f6..0e3e5eb6dbdee 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -7,8 +7,8 @@ from tests.models.utils import check_logprobs_close from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 4a40eeb21edd6..0c81b244e334c 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -6,8 +6,8 @@ from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 48471092876a2..f563e264f9280 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -3,8 +3,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.models import _MODELS, ModelRegistry -if should_skip_test_group(group_name="TEST_ALL_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping non core model group", +if should_skip_test_group(group_name="TEST_MODELS"): + pytest.skip("TEST_ALL_MODELS=0, skipping model test group", allow_module_level=True) diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py index bbf649961315e..1cc349084ea05 100644 --- a/tests/models_core/test_llm_logprobs.py +++ b/tests/models_core/test_llm_logprobs.py @@ -8,13 +8,17 @@ import pytest from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + allow_module_level=True) MODEL_MAX_LEN = 1024 MODELS = [ # Llama (8B param variant) "meta-llama/Meta-Llama-3-8B-Instruct", - "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", # Qwen2 (7B param variant) "Qwen/Qwen2-7B-Instruct", ] @@ -31,7 +35,7 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - hf_model = hf_runner_nm(model, device_map="auto") + hf_model = hf_runner_nm(model) hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts, max_tokens, num_logprobs) diff --git a/tests/models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py index d7aef3f8e5bc4..116fc2f815813 100644 --- a/tests/models_core/test_magic_wand.py +++ b/tests/models_core/test_magic_wand.py @@ -9,6 +9,11 @@ import pytest from tests.models.utils import check_logprobs_close +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + allow_module_level=True) MAX_MODEL_LEN = 1024 MODEL_FORMAT_EXTRABLOCKS = [ diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py index afb375351574e..385629f2afb84 100644 --- a/tests/models_core/test_server_logprobs.py +++ 
b/tests/models_core/test_server_logprobs.py @@ -15,6 +15,11 @@ from tests.models.compare_utils import check_logprobs_close from tests.nm_utils.logging import make_logger from tests.nm_utils.server import ServerContext +from tests.nm_utils.utils_skip import should_skip_test_group + +if should_skip_test_group(group_name="TEST_MODELS_CORE"): + pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + allow_module_level=True) # Silence warning. os.environ["TOKENIZERS_PARALLELISM"] = "True" diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index eeac6ab43c05f..181a504f0f1a4 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -5,8 +5,13 @@ import pytest from tests.conftest import cleanup +from tests.nm_utils.utils_skip import should_skip_test_group from vllm import LLM +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip("TEST_PREFIX_CACHING=0, skipping prefix caching test group", + allow_module_level=True) + MODEL_LEN_LEN = [ # Example models with sliding window. ("bigcode/starcoder2-3b", 4096, 16384), diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16ef1c..a290a75d96aa4 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,9 +4,14 @@ """ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device +if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): + pytest.skip("TEST_PREFIX_CACHING=0, skipping prefix caching test group", + allow_module_level=True) + @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_blocks", [16]) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b83286992da3d..73ac6ca947d49 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,11 +3,17 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor) +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + allow_module_level=True) + def test_compressed_tensors_w8a8_static_setup(vllm_runner): model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..da02d3d631b46 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -7,8 +7,13 @@ import pytest +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import ModelConfig +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + allow_module_level=True) + @dataclass class ModelPair: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 607544a1c8394..8c10768e42142 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,9 +5,14 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +if should_skip_test_group(group_name="TEST_QUANTIZATION"): + pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + allow_module_level=True) + capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] From 495488b0af5dfb8ce8662c39327f51853aea345a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:18:38 +0000 Subject: [PATCH 111/154] added tokenization group --- tests/samplers/test_beam_search.py | 4 ++-- tests/samplers/test_ignore_eos.py | 4 ++-- tests/samplers/test_logits_processor.py | 4 ++-- tests/samplers/test_logprobs.py | 4 ++-- tests/samplers/test_ranks.py | 4 ++-- tests/samplers/test_rejection_sampler.py | 4 ++-- tests/samplers/test_sampler.py | 4 ++-- tests/samplers/test_seeded_generate.py | 4 ++-- tests/tensorizer_loader/test_tensorizer.py | 2 +- tests/tokenization/test_cached_tokenizer.py | 6 ++++++ tests/tokenization/test_detokenize.py | 5 +++++ tests/tokenization/test_tokenizer.py | 5 +++++ tests/tokenization/test_tokenizer_group.py | 5 +++++ 13 files changed, 38 insertions(+), 17 deletions(-) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2b61e6f80a7d1..257a2668c63b1 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -9,8 +9,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) # FIXME(zhuohan): The test can not pass if we: diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 6c8ff3540f783..74f8e62ea7369 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -8,8 +8,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams -if 
should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 659b36fc52b15..a30567553c1e7 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -4,8 +4,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index c42511e57e9d0..128326123f855 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -6,8 +6,8 @@ from ..conftest import VllmRunner -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index cc6edb6274c9d..8fc88af774b80 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -3,8 +3,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group from vllm import SamplingParams -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 3446bc1109b99..1fec93400269e 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -9,8 +9,8 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) CUDA_DEVICES = [ diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 0e9bfbaeed85b..5e65bfdfa7755 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -15,8 +15,8 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import Counter, is_pin_memory_available -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 1961c6e03c28b..d2aa9f7848c19 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -12,8 +12,8 @@ from vllm import SamplingParams from vllm.model_executor.utils import 
set_random_seed -if should_skip_test_group(group_name="TEST_SAMPLER"): - pytest.skip("TEST_SAMPLER=0, skipping sampler group", +if should_skip_test_group(group_name="TEST_SAMPLERS"): + pytest.skip("TEST_SAMPLERS=0, skipping sampler group", allow_module_level=True) MODEL = "facebook/opt-125m" diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b9f6404f2b387..8c87c624fdd6e 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -23,7 +23,7 @@ # yapf conflicts with isort for this docstring -if should_skip_test_group(group_name="TEST_TENSORIZER"): +if should_skip_test_group(group_name="TEST_TENSORIZER_LOADER"): pytest.skip("TEST_TENSORIZER=0, skipping tensorizer group", allow_module_level=True) diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index 4c8238fd8d113..549a8ea8ebe36 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,9 +1,15 @@ from copy import deepcopy +import pytest from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.transformers_utils.tokenizer import get_cached_tokenizer +if should_skip_test_group(group_name="TEST_TOKENIZATION"): + pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + allow_module_level=True) + def test_cached_tokenizer(): reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 8d019fe5f38ca..67d8abb31513b 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -3,11 +3,16 @@ import pytest from transformers import AutoTokenizer +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +if should_skip_test_group(group_name="TEST_TOKENIZATION"): + pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + allow_module_level=True) + TRUTH = [ "Hello here, this is a simple test", "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", # noqa diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index 8db7204f15d4e..3273e2fa55e53 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -1,8 +1,13 @@ import pytest from transformers import PreTrainedTokenizerBase +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.transformers_utils.tokenizer import get_tokenizer +if should_skip_test_group(group_name="TEST_TOKENIZATION"): + pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + allow_module_level=True) + TOKENIZER_NAMES = [ "facebook/opt-125m", "gpt2", diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 31571dbfff6f6..801fb86644030 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -5,6 +5,7 @@ import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( RayTokenizerGroupPool) @@ -13,6 +14,10 @@ from ..conftest import get_tokenizer_pool_config +if should_skip_test_group(group_name="TEST_TOKENIZATION"): + pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + allow_module_level=True) + @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) From d64bda5111fd76866449acb1060e9dbc331e6cc2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:19:53 +0000 Subject: [PATCH 112/154] worker --- tests/worker/test_model_runner.py | 5 +++++ tests/worker/test_swap.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 92de545acd53d..bec7b9f91c74c 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,6 +1,7 @@ import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.distributed.parallel_state import init_distributed_environment from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -8,6 +9,10 @@ from vllm.utils import get_open_port from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size +if should_skip_test_group(group_name="TEST_WORKER"): + pytest.skip("TEST_WORKER=0, skipping worker test group", + allow_module_level=True) + def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: engine_args = EngineArgs(model, *args, **kwargs) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index d941ffdb5588a..a822372913e0a 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,10 +1,16 @@ +import pytest import torch +from tests.nm_utils.utils_skip import should_skip_test_group from vllm.engine.arg_utils import EngineArgs from vllm.sequence import ExecuteModelRequest from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.worker import Worker +if should_skip_test_group(group_name="TEST_WORKER"): + pytest.skip("TEST_WORKER=0, skipping worker test group", + allow_module_level=True) + def test_swap() -> None: # Configure the engine. 
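The commits above all converge on one skipping pattern: every test group reads a TEST_<GROUP> environment variable (defaulting to "0") through tests/nm_utils/utils_skip.py, and each module in that group calls pytest.skip(..., allow_module_level=True) at import time unless the variable is set to "1". The should_skip_test_group dispatcher itself is not shown in these hunks, so the sketch below is only a minimal reconstruction of how the MAP of per-group predicates is plausibly wired to the module-level guards; the dispatcher body is an assumption, while the predicate names and the guard mirror the diffs above.

import os

import pytest


def should_skip_kernels_test_groups():
    # One predicate per group: run only when TEST_KERNELS=1 is exported.
    return os.getenv("TEST_KERNELS", "0") != "1"


# Maps the env var name used by CI to the predicate for that group.
MAP = {
    "TEST_KERNELS": should_skip_kernels_test_groups,
}


def should_skip_test_group(group_name: str) -> bool:
    # Assumed dispatcher: look up the group's predicate and evaluate it.
    return MAP[group_name]()


# Guard placed at the top of a test module, as in the diffs above.
if should_skip_test_group(group_name="TEST_KERNELS"):
    pytest.skip("TEST_KERNELS=0, skipping kernels test group",
                allow_module_level=True)

With this in place, a CI job only has to append one of the env-var files introduced later in this series (remote-push.txt, nightly.txt, release.txt, weekly.txt) to $GITHUB_ENV, as the nm-set-env-test-skip composite action does, to select which groups actually run.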
From c6c6994f37b1b344a651adef0a2b1461b0fb0b9d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:24:59 +0000 Subject: [PATCH 113/154] added models core --- tests/models_core/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/models_core/__init__.py diff --git a/tests/models_core/__init__.py b/tests/models_core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From c9a2d026d402b8744f19d9b197a3c5ea90c6dce6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:46:08 +0000 Subject: [PATCH 114/154] added remote push --- .github/workflows/nm-build-test.yml | 8 ++++---- .github/workflows/nm-test.yml | 15 ++++++++++----- .../tests/test_skip_env_vars/remote-push.txt | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 neuralmagic/tests/test_skip_env_vars/remote-push.txt diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 74dc1db48b5f2..11c621d044dca 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -45,8 +45,8 @@ on: description: "git commit hash or branch name" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file with list of env vars controlling which tests to run' type: string required: true # benchmark related parameters @@ -91,7 +91,7 @@ jobs: gitref: ${{ github.ref }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} - test_skip_list: ${{ inputs.test_skip_list }} + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} secrets: inherit # TODO: re-enable @@ -105,7 +105,7 @@ jobs: # gitref: ${{ github.ref }} # python: ${{ inputs.python }} # whl: ${{ needs.BUILD.outputs.whl }} - # test_skip_list: ${{ inputs.test_skip_list }} + # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} # secrets: inherit UPLOAD: diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index 3e3f3adef3ef3..87860bcc356bb 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -23,8 +23,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -51,8 +51,8 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true - test_skip_list: - description: 'file containing tests to skip' + test_skip_env_vars: + description: 'file containing tests env vars for test skipping' type: string required: true @@ -131,12 +131,17 @@ jobs: - name: run buildkite script run: | cd tests && sudo bash ../.buildkite/download-images.sh + + - name: setenv test skip + id: setenv_test_skip + uses: ./.github/actions/nm-set-env-test-skip + with: + test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - name: run tests id: test uses: ./.github/actions/nm-test-whl/ with: - test_skip_list: ${{ inputs.test_skip_list }} test_directory: tests test_results: test-results diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt new file mode 100644 index 0000000000000..0b2d25d0910ee --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=0 +TEST_ASYNC_ENGINE=0 
+TEST_BASIC_CORRECTNESS=0 +TEST_CORE=0 +TEST_DISTRIBUTED=0 +TEST_ENGINE=0 +TEST_ENTRYPOINTS=0 +TEST_KERNELS=0 +TEST_LORA=0 +TEST_METRICS=0 +TEST_MODELS=0 +TEST_MODELS_CORE=1 +TEST_PREFIX_CACHING=0 +TEST_QUANTIZATION=0 +TEST_SAMPLERS=0 +TEST_SPEC_DECODE=0 +TEST_TENSORIZER_LOADER=0 +TEST_TOKENIZATION=0 +TEST_WORKER=0 From 9b452a752fc5cf54e1d2675087caa5d00c3fe74f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:46:29 +0000 Subject: [PATCH 115/154] added action --- .github/actions/nm-set-env-test-skip/action.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/actions/nm-set-env-test-skip/action.yml diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml new file mode 100644 index 0000000000000..2f62d8135154a --- /dev/null +++ b/.github/actions/nm-set-env-test-skip/action.yml @@ -0,0 +1,17 @@ +name: set test skip env vars +description: 'sets env variables for test skipping. See tests/utils_skip.py' +inputs: + test_skip_env_vars: + description: 'List of env vars controlling which tests to run.' + required: true + +runs: + using: composite + steps: + - run: | + cat "${ENV_VAR_FILE}" + cat "${ENV_VAR_FILE}" >> $GITHUB_ENV + env: + ENV_VAR_FILE: ${{ inputs.test_skip_env_vars }} + shell: bash + \ No newline at end of file From bbe29060968cad6d0c3638c9474234004c3f800f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:47:34 +0000 Subject: [PATCH 116/154] updated remote push workflow --- .github/workflows/nm-remote-push.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 93950e498fd23..7142128539dd9 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -21,7 +21,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -37,7 +37,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -53,7 +53,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -69,7 +69,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt From b2bb2bcbfe124332ed876d5e5c10a18187a63ccc Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:51:00 +0000 Subject: [PATCH 117/154] make sure action was saved --- .github/actions/nm-set-env-test-skip/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml index 2f62d8135154a..7a1675e6bc0c6 100644 --- a/.github/actions/nm-set-env-test-skip/action.yml +++ b/.github/actions/nm-set-env-test-skip/action.yml @@ -2,7 +2,7 @@ name: set test skip env vars description: 'sets env variables for test skipping. See tests/utils_skip.py' inputs: test_skip_env_vars: - description: 'List of env vars controlling which tests to run.' + description: 'file with list of env vars controlling which tests to run.' required: true runs: From e629449737ac84b6554c3aee401f1cf17316582a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:53:14 +0000 Subject: [PATCH 118/154] added action to build to just the action works --- .github/workflows/nm-build.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml index 077d0a147f24c..5c88f6675bb85 100644 --- a/.github/workflows/nm-build.yml +++ b/.github/workflows/nm-build.yml @@ -95,6 +95,13 @@ jobs: hf_token: ${{ secrets.NM_HF_TOKEN }} Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} + + # TEST: to remove + - name: setenv test skip + id: setenv_test_skip + uses: ./.github/actions/nm-set-env-test-skip + with: + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt - name: set python id: set_python From 668e17236a7899961102ef87771bbf23483b0073 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 21:56:56 +0000 Subject: [PATCH 119/154] updated to tab these in --- .../tests/test_skip_env_vars/remote-push.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt index 0b2d25d0910ee..551a1bcc4ed5d 100644 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -1,19 +1,19 @@ -TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=0 -TEST_BASIC_CORRECTNESS=0 -TEST_CORE=0 -TEST_DISTRIBUTED=0 -TEST_ENGINE=0 -TEST_ENTRYPOINTS=0 -TEST_KERNELS=0 -TEST_LORA=0 -TEST_METRICS=0 -TEST_MODELS=0 -TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=0 -TEST_QUANTIZATION=0 -TEST_SAMPLERS=0 -TEST_SPEC_DECODE=0 -TEST_TENSORIZER_LOADER=0 -TEST_TOKENIZATION=0 -TEST_WORKER=0 + TEST_ACCURACY=0 + TEST_ASYNC_ENGINE=0 + TEST_BASIC_CORRECTNESS=0 + TEST_CORE=0 + TEST_DISTRIBUTED=0 + TEST_ENGINE=0 + TEST_ENTRYPOINTS=0 + TEST_KERNELS=0 + TEST_LORA=0 + TEST_METRICS=0 + TEST_MODELS=0 + TEST_MODELS_CORE=1 + TEST_PREFIX_CACHING=0 + TEST_QUANTIZATION=0 + TEST_SAMPLERS=0 + TEST_SPEC_DECODE=0 + TEST_TENSORIZER_LOADER=0 + TEST_TOKENIZATION=0 + TEST_WORKER=0 From 95d6fd7f32c8a5bb1f186caac36bab1584f23e66 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:00:31 +0000 Subject: [PATCH 120/154] undo indent --- .../tests/test_skip_env_vars/remote-push.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt index 551a1bcc4ed5d..0b2d25d0910ee 100644 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -1,19 +1,19 @@ - TEST_ACCURACY=0 - TEST_ASYNC_ENGINE=0 - TEST_BASIC_CORRECTNESS=0 - TEST_CORE=0 - TEST_DISTRIBUTED=0 - TEST_ENGINE=0 - TEST_ENTRYPOINTS=0 - TEST_KERNELS=0 - TEST_LORA=0 - TEST_METRICS=0 - TEST_MODELS=0 - 
TEST_MODELS_CORE=1 - TEST_PREFIX_CACHING=0 - TEST_QUANTIZATION=0 - TEST_SAMPLERS=0 - TEST_SPEC_DECODE=0 - TEST_TENSORIZER_LOADER=0 - TEST_TOKENIZATION=0 - TEST_WORKER=0 +TEST_ACCURACY=0 +TEST_ASYNC_ENGINE=0 +TEST_BASIC_CORRECTNESS=0 +TEST_CORE=0 +TEST_DISTRIBUTED=0 +TEST_ENGINE=0 +TEST_ENTRYPOINTS=0 +TEST_KERNELS=0 +TEST_LORA=0 +TEST_METRICS=0 +TEST_MODELS=0 +TEST_MODELS_CORE=1 +TEST_PREFIX_CACHING=0 +TEST_QUANTIZATION=0 +TEST_SAMPLERS=0 +TEST_SPEC_DECODE=0 +TEST_TENSORIZER_LOADER=0 +TEST_TOKENIZATION=0 +TEST_WORKER=0 From a64fdaa93e675120143a6fba838cd0c15222640b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:01:03 +0000 Subject: [PATCH 121/154] cleanup action --- .github/actions/nm-set-env-test-skip/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml index 7a1675e6bc0c6..c36f857c097f4 100644 --- a/.github/actions/nm-set-env-test-skip/action.yml +++ b/.github/actions/nm-set-env-test-skip/action.yml @@ -9,7 +9,6 @@ runs: using: composite steps: - run: | - cat "${ENV_VAR_FILE}" cat "${ENV_VAR_FILE}" >> $GITHUB_ENV env: ENV_VAR_FILE: ${{ inputs.test_skip_env_vars }} From 9e6a4e9585b51f9aefa60ff70987acd71e9f27f9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:04:51 +0000 Subject: [PATCH 122/154] removed example --- .github/workflows/nm-build.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml index 5c88f6675bb85..077d0a147f24c 100644 --- a/.github/workflows/nm-build.yml +++ b/.github/workflows/nm-build.yml @@ -95,13 +95,6 @@ jobs: hf_token: ${{ secrets.NM_HF_TOKEN }} Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} - - # TEST: to remove - - name: setenv test skip - id: setenv_test_skip - uses: ./.github/actions/nm-set-env-test-skip - with: - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt - name: set python id: set_python From 352493e0215683f229ef991b3f7b5e9d9790e3ac Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:08:16 +0000 Subject: [PATCH 123/154] added env var configs for all groups --- .../tests/test_skip_env_vars/nightly.txt | 19 +++++++++++++++++++ .../tests/test_skip_env_vars/release.txt | 19 +++++++++++++++++++ .../tests/test_skip_env_vars/weekly.txt | 19 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 neuralmagic/tests/test_skip_env_vars/nightly.txt create mode 100644 neuralmagic/tests/test_skip_env_vars/release.txt create mode 100644 neuralmagic/tests/test_skip_env_vars/weekly.txt diff --git a/neuralmagic/tests/test_skip_env_vars/nightly.txt b/neuralmagic/tests/test_skip_env_vars/nightly.txt new file mode 100644 index 0000000000000..121e15653211e --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/nightly.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=0 +TEST_ASYNC_ENGINE=1 +TEST_BASIC_CORRECTNESS=1 +TEST_CORE=1 +TEST_DISTRIBUTED=0 +TEST_ENGINE=1 +TEST_ENTRYPOINTS=1 +TEST_KERNELS=0 +TEST_LORA=0 +TEST_METRICS=1 +TEST_MODELS=0 +TEST_MODELS_CORE=1 +TEST_PREFIX_CACHING=1 +TEST_QUANTIZATION=1 +TEST_SAMPLERS=1 +TEST_SPEC_DECODE=0 +TEST_TENSORIZER_LOADER=0 +TEST_TOKENIZATION=1 +TEST_WORKER=1 diff --git a/neuralmagic/tests/test_skip_env_vars/release.txt b/neuralmagic/tests/test_skip_env_vars/release.txt new file mode 100644 index 0000000000000..89d5e9eb5b2e4 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/release.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=0 
+TEST_ASYNC_ENGINE=1 +TEST_BASIC_CORRECTNESS=1 +TEST_CORE=1 +TEST_DISTRIBUTED=0 +TEST_ENGINE=1 +TEST_ENTRYPOINTS=1 +TEST_KERNELS=0 +TEST_LORA=1 +TEST_METRICS=1 +TEST_MODELS=1 +TEST_MODELS_CORE=1 +TEST_PREFIX_CACHING=1 +TEST_QUANTIZATION=1 +TEST_SAMPLERS=1 +TEST_SPEC_DECODE=0 +TEST_TENSORIZER_LOADER=1 +TEST_TOKENIZATION=1 +TEST_WORKER=1 diff --git a/neuralmagic/tests/test_skip_env_vars/weekly.txt b/neuralmagic/tests/test_skip_env_vars/weekly.txt new file mode 100644 index 0000000000000..89d5e9eb5b2e4 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/weekly.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=0 +TEST_ASYNC_ENGINE=1 +TEST_BASIC_CORRECTNESS=1 +TEST_CORE=1 +TEST_DISTRIBUTED=0 +TEST_ENGINE=1 +TEST_ENTRYPOINTS=1 +TEST_KERNELS=0 +TEST_LORA=1 +TEST_METRICS=1 +TEST_MODELS=1 +TEST_MODELS_CORE=1 +TEST_PREFIX_CACHING=1 +TEST_QUANTIZATION=1 +TEST_SAMPLERS=1 +TEST_SPEC_DECODE=0 +TEST_TENSORIZER_LOADER=1 +TEST_TOKENIZATION=1 +TEST_WORKER=1 From 8897dd14491ee92d5413bf2512a47b22dc5ee9d1 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:10:02 +0000 Subject: [PATCH 124/154] updated other workflows --- .github/workflows/nm-nightly.yml | 10 ++++----- .github/workflows/nm-release.yml | 8 +++---- .github/workflows/nm-weekly.yml | 2 +- .../tests/test_skip_env_vars/remote-push.txt | 22 +++++++++---------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index d6c84077869c0..67c99e3a86ed8 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -1,4 +1,4 @@ -name: nm Nightly +name: nm nightly run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: @@ -27,7 +27,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -45,7 +45,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -63,7 +63,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -81,7 +81,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-remote-push-tmp.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index b52b9046d0daf..9db1a402678a1 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -23,7 +23,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: 
neuralmagic/tests/test_skip_env_vars/release.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -41,7 +41,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -59,7 +59,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -77,7 +77,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_list: neuralmagic/tests/skip-for-release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml index c385e0c6d8510..d56e9687cb461 100644 --- a/.github/workflows/nm-weekly.yml +++ b/.github/workflows/nm-weekly.yml @@ -27,7 +27,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: aws-avx2-192G-4-a10g-96G test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-weekly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/weekly.txt benchmark_label: aws-avx2-32G-a10g-24G benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt index 0b2d25d0910ee..121e15653211e 100644 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -1,19 +1,19 @@ TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=0 -TEST_BASIC_CORRECTNESS=0 -TEST_CORE=0 +TEST_ASYNC_ENGINE=1 +TEST_BASIC_CORRECTNESS=1 +TEST_CORE=1 TEST_DISTRIBUTED=0 -TEST_ENGINE=0 -TEST_ENTRYPOINTS=0 +TEST_ENGINE=1 +TEST_ENTRYPOINTS=1 TEST_KERNELS=0 TEST_LORA=0 -TEST_METRICS=0 +TEST_METRICS=1 TEST_MODELS=0 TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=0 -TEST_QUANTIZATION=0 -TEST_SAMPLERS=0 +TEST_PREFIX_CACHING=1 +TEST_QUANTIZATION=1 +TEST_SAMPLERS=1 TEST_SPEC_DECODE=0 TEST_TENSORIZER_LOADER=0 -TEST_TOKENIZATION=0 -TEST_WORKER=0 +TEST_TOKENIZATION=1 +TEST_WORKER=1 From e1a1a590dc040fb9e113fde3893b67f6d64a5bc9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:16:00 +0000 Subject: [PATCH 125/154] switched for whitelist to blacklist --- tests/nm_utils/utils_skip.py | 45 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index 3953d35839765..d6a5bfb618830 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -8,59 +8,58 @@ def should_skip_accuracy_test_group(): - TEST_ACCURACY = os.getenv("TEST_ACCURACY", "0") - return TEST_ACCURACY != "1" + TEST_ACCURACY = os.getenv("TEST_ACCURACY", "1") + return TEST_ACCURACY == "0" def should_skip_async_engine_test_group(): - TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "0") - print(TEST_ASYNC_ENGINE) - return TEST_ASYNC_ENGINE != "1" + TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "1") + return 
TEST_ASYNC_ENGINE == "0" def should_skip_basic_correctness_test_group(): - TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "0") - return TEST_BASIC_CORRECTNESS != "1" + TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "1") + return TEST_BASIC_CORRECTNESS == "0" def should_skip_core_test_group(): - TEST_CORE = os.getenv("TEST_CORE", "0") - return TEST_CORE != "1" + TEST_CORE = os.getenv("TEST_CORE", "1") + return TEST_CORE == "0" def should_skip_distributed_test_group(): - TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "0") - return TEST_DISTRIBUTED != "1" + TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "1") + return TEST_DISTRIBUTED == "0" def should_skip_engine_test_group(): - TEST_ENGINE = os.getenv("TEST_ENGINE", "0") - return TEST_ENGINE != "1" + TEST_ENGINE = os.getenv("TEST_ENGINE", "1") + return TEST_ENGINE == "0" def should_skip_entrypoints_test_group(): - TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "0") - return TEST_ENTRYPOINTS != "1" + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "1") + return TEST_ENTRYPOINTS == "0" def should_skip_kernels_test_groups(): - TEST_KERNELS = os.getenv("TEST_KERNELS", "0") - return TEST_KERNELS != "1" + TEST_KERNELS = os.getenv("TEST_KERNELS", "1") + return TEST_KERNELS == "0" def should_skip_lora_test_group(): - TEST_LORA = os.getenv("TEST_LORA", "0") - return TEST_LORA != "1" + TEST_LORA = os.getenv("TEST_LORA", "1") + return TEST_LORA == "0" def should_skip_metrics_test_group(): - TEST_METRICS = os.getenv("TEST_METRICS", "0") - return TEST_METRICS != "1" + TEST_METRICS = os.getenv("TEST_METRICS", "1") + return TEST_METRICS == "0" def should_skip_model_executor_test_group(): - TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "0") - return TEST_MODEL_EXECUTOR != "1" + TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "1") + return TEST_MODEL_EXECUTOR == "0" def should_skip_models_test_group(): From 2ec6643843222ecfb8d1a1a5eab8c44130b00870 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:26:31 +0000 Subject: [PATCH 126/154] cleanup spurious setup.py change --- setup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.py b/setup.py index 7814af2b9e1b7..dfc88e784b4ea 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,3 @@ -# flake8: noqa -# UPSTREAM SYNC: noqa is required for passing ruff. -# This file has been modified by Neural Magic - import datetime import importlib.util import io @@ -98,7 +94,6 @@ def compute_num_jobs(self): # when it is set, we reduce `num_jobs` to avoid # overloading the system. 
nvcc_threads = envs.NVCC_THREADS - print(f"NVCC THREADS {nvcc_threads}") if nvcc_threads is not None: nvcc_threads = int(nvcc_threads) logger.info( From 0bb099c5140bc63134b1b299020a36461a3a428d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:28:10 +0000 Subject: [PATCH 127/154] readded the missing images --- docs/source/assets/kernel/v_vec.png | Bin 0 -> 42452 bytes docs/source/assets/kernel/value.png | Bin 0 -> 171134 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/source/assets/kernel/v_vec.png create mode 100644 docs/source/assets/kernel/value.png diff --git a/docs/source/assets/kernel/v_vec.png b/docs/source/assets/kernel/v_vec.png new file mode 100644 index 0000000000000000000000000000000000000000..75d344ab933f2db650f6cb361f306790612bbf37 GIT binary patch literal 42452 zcmeFY1yo#HwSEmLlr02{zWBLzUH9vbQepwR-DfA#_35gOgU_Mf70{_`>D01#;h zVEpqjCaBLp1a;*<&i?05P%hfvu9%DdFGpht<%0gDk52fMk(R{**sYL%pG)gD^0$z}Psrc&Gy!hyipo5C|Ougo*jb-=GDceg`l}FiGzT zDq@l8z69TQBNqxu%)@3;s_CY9Iu2(Qe&rsDgG)(8O+(AZ&cXSBOGH#mTtZSx`Jsxc zn!3g#y=VFchDOFFmR8oUZEWrAJ>GbFc|&}B!`_BRL`Fr&Bqe`HNlp9sDLucSu;_Df zNoiSaU427iQ*+CguRXne{R4wT-zFxfeooKK&cRmK);BhPZEf%DB92c^&(1H9zc2sL z3uVuLQVaF|Pn!KjFA|hq=olCv4DcU%p`m;Kp*RT!<~>0yQbk?xOETQ!@iEad@%<0W-6F~@AKt+L zd=MJSnLs3fJaG9rhBFua|F(a%0ny#q|Fy|q{DJ6h{C|T@j+r8uNDuvkyvybnU`1c% zq^KW}K33ffUd&me^K%>3;+`o1@;j6eYH@-55QlzZyDxof^Ye=fAl!%Vg+$mj+Zc)l z=>2;|1N;;)$#wsIqsXWTvbzJU7Vm&p`&ZsccYw`_+)+mT{%E6{g%E_Lf0=Z&56=P{ z(gDv&*Oj4T9-utpx*msK6^EVW5|!IRxsi1Uv}iIn;opwux=0K%z8XxSVCT6r< zHOHBd(Xwmr@zb}6%FQ2lfTdHdQ0Ip(7DhbfgcPj+G{z~k^Kgnj2&iA|RsudnyI{|f z#?BqpJ*Z(w7kthf!Psl)@bB;WKhooW!*@(~6aTdX-nO--7I|&C#=wW?0=5sDn^e1J zr~MH@Lq!iDLVLhX%I0w@KwyBsBz!dPTE#7ZDXf94cnDCIq$2HZ6Lf0pC)_9Yotfw) zY@=K^!YB&JH|S8C^o0Aq)Bw&tOYlpfo01FZgC;S%5dj~3K%kd#aN`9!ZOuRRxWWscTn$m^Ps-4Z;gYT-w(dw7_vEjAx{SV?2G{JDqn( zv#s$+IM#2gNT&@MgoyaAlGAtbOFNEYAEgMz*Mq~X0Rwd;a7wr?5BEs|=TN)egF$q1gw4QHr zHUx(n0kXDeonU{P66EODN{{mQGhQBa-tX~!g1Ks2K*3tBZae9Sqgb&{QW?&TH9A#%Thd+5{^!T zFfHLs+;YU^KnFR3GK?~;z#K25)%47VLicO9`lf163a;x9Fvb{|JGTcBv=&#d;SeN3 zNm_V^Lv;eIgDr!;=8sg7iVeYSz__?u~(Yi(`y2u(hM61=J zKES39mE7i*@9w0QW87gRpQtaNKacTqP=A&3ko0|38OXgvajO}sgg8&XSY9nMt9wTTXJfpk1v0JH)z3}JWzkzobzW9 ze7WmJ1}V2|SGi>!cJQQ?&B#hf*b4tZI%0%~W@RQ3;?Q#k;INOS=^_j|-P5XY9Q+{P zKFBZr;#dlTVc-Hsx1`9?-c&isJK!yo!Bm<-L#BGokJ_6&P(BSv?ZHMV5K5#xOYATa zD0r>j3fGV8_>^yX^$1B?W#}3s*uO%A8NKR`y|UHd|9vIE|+g$H4Ke!w@! zNA)Jp1gv2E3S7?EQ{)>tK*DNr2Sj!-&q86RC*;!9Op7(~%0lRVJ~0b!vzI!U>VeeP zww<-K8?YaVi?v5atZkoO(`-4b4Hll$1v4O?5A<-izrI)(jlI2R;|ISW8LrG1MU8kd5|r&8jVs+Z@lIQ~)7Ii`%w{2BVg>ZYs8+q5y9S+-SZEuD+>ZKU_NT2r zICDsu^^|Pt$LH46+_sCZi*=Nh~^|F*4WGUbpsw030p!BZ(>`V%;>+UfsyH}eZEMW#(g@(%T2a0NvX&Ui8qcFx#8hl0)mLNEb9~RFobh2{Pi$MWmMqMu3OG84mVm|yx(tjUcQe0cv=0FBB2FiS|zQUD3VgE zRxEo4zb!$^jyjNw`oXn|?@w~H4EDIf??qXpkJaO%6TV%^b#Lx7kn<~m?T=63EUQD! 
[... remainder of the base85-encoded binary patch data for docs/source/assets/kernel/v_vec.png (42452 bytes) omitted ...]
z#Fg*9`O~CWQ~T2+7;j2^e=x0iG-1-X(s(E5?Q~%}2g_0ex2En3Txy?qovHno>i~9N zf(739M^w!Z6Ao|Y0LgY0CrQ%Ozr+X##&hV(cWmdyw^3}=H75CX^6kdeO~}6aeLHA$ zx3{wFnjZFCuZJ(DuypOKz?cZU{iKD@6pR}F+$VS6FVjLG6Q&7cV%qM*v_6oW`GopP zVd}W`8ODO)+1Qzv5~pJsf?OuXnXiIz3&K-y_&VO3e7-OxU51(t)HlwLX@H)m{K}wq zQHh!F5uW`1Lwm!%Fh^W4F#AJO{i_9zlN7?ZwD>~4lvUM$P&WMPyU<;>C0e!2&_=dV zzzr~%u5ohFdA2weV?UEpdssz;N}0#TrUHh6FHTsM+O7*;46tciOJ+Zgzv*xBJFY1( zDh7DN&e0Fa#e%WHp7!vXWPNhdQtwJh_g8ziY`$))KXHZD-x86q??&LqoFv@=9B z6|L$Y`j0K1yX^4fZdYrR)Xhn7w=~Yge^WA~m|Q=i79A(otPyV*>>pN9z$^?7^DH5e zHP-L)d2&98v=0ht%=Q=ZKe^lCN%>`}DW`4k9r7Eq&}POgMsJ|Ejo=#37+M2Uq)`G= zA_Ysdj;Pk;;`~{*b$NZ8BMh?9Qh|@&n^l||uU`eYW*riiiBac+?Y1+0t*ptb*aJoD zB|sskFmg^2dzYEVVVmYv4fx{J{XDT+_H0FSGSrjL1_({Z^H%yyqWFwzE(q0jO>sp$ z69V~kW9Kt>sJ!=|>UfvTNhQa)zduAK83}?bv^6&Rl(lmy#tt8+js2ohlAue>)OOO0 z8Ee4|9tODy7+9aw2q1K$xo&+6T24`ulC$FUfA(MfW$hZGyt=)8)O)|Z6y%CQX84(z z?jpFqGw-@EZgSVnSKmGH^fyh+@K`Oep7Wt!Y7rwFgEr_2CI(sK@Z0rCCF3N`)2fJv zIs(5leiPodIzu-v)e)1HX9M*;=BBg6hH=H>RRs@qyxRd|*vHdX{; zeBg?|m(L(iwkgheK6;Kl@Qy7RDtIGr>OJ>I*K4w8&8LU?`SMl9V{XxqT!&A`}G4a16HJZmxe?M1K_$C;aK$o8>4sgtZS|F^=BAz#x3sSA%ec;epiN$}R`s9G^;QNY0+s*)06X z9(XfBMVR3bG50`+y7&vh*yko8hVmbNz*Xp1gD(W6qaqnpw9W3mC3z*}7^kv8xj56D_K#ujIc%c~7%=O$hr% zUd$w=LA0cw#p6wzGJg$(0O8rfd|z#Fns$yh*3lmWbrSX@GbTlGA9kzi=e;E=1HYCw z*9&~n{Ys){sg(tu9UG4`(yY_x4{25%PQ|9gjh`twq`nEpS0`%2*1<5!D@j<*#${_R zq%js?3I;k8ygmxiJ9Yk~H)6-h_oqtFr}BHupXjgh!rr^_pL_22mlq8&+A~zB_aL&A z6+Ka?G|44a>8AD_53zF3jhs6qFn<=}Z@ zDs?~XtGJifLXst}v>!KvuEMzCd)p;ibDMRS1IPzD754d3Truu@szEv#^l|%P7SH`x zXvm8I;PBFkoetEeiJVP#zu9Lz3k-wQx(H%7>-7g7wh3gVh;!5g9amYG`qGh9y=1|_ zlj`mK!fmq#*x-Si?UK<-pvI~zp(Fpyp}A8W>W6}HagimfIfhnAmS+`7LO@>?Ps8>l zAuOFdhVH{TAXo-p^FG7x?_6LuO3eAFZ#y^R!r!`}TxhCsDbFyiv$;W6E6r%|QMza4 zCX+hni!{b=fWp%&Smcn+Y(MIq1V-WV>Ye9ZY@8WvmmBcSWCfDzsVw7c_H7qFuYsUt z2V7VTzTa)KA&JuR%ztYuWx|}ve%B!}`xsK#`b2TIA*NLe&q)*3@Yo^cPJAIIB2og~ z!PE8`3y(rRqj^FJl(8EoI?hpetylL<6S71ZXF zKO=aIp595QlbN`kgO=EER`&JP_3Ku0Z*J*|5@`+%D|3ZOizgdob1)5dVCA>sAMM_( zSG05^oQN@hZI? z+M7+9K%Mm>!&6PyV|(S#Bm)k%P*47j4@d0g(V7~n0XX3w2N!g^M(O<~a4bOV9Y&co ztIF!*GW0Q>dEr~p0!$kmZ8jDP(}D3MYgN*>x4DnRk52#NV6(w^hw8ub62?TGZTqm* z%7Jk(3wEF7!9lTIPWrjbHXpHjGC0+d%)YCXmMh45+$D)i<_sdwoJCyPh?i|LW7ZOca}NDwjk62B&j41D7l~;}*8|ZriN%5oeydV5Fx9 znsr5?N8DqzU5kIxl{(wn`e_$ho42N-_DSjTveGqPmiv*AzATtrVmCNtf`{LAn$Fr~ ze$;4WnZcG3LCR32TD(C6fqqrW8Yp1CETBMpJdr%=`1ess%XbsSBZ8|&w$?a3+2&|R z)>_^fw!I}_T=v!II{j-;5r$SSZ7sb|^NNM>1vk@gc<}b2zj3m+-H=-$%?R`&ASz&= z;7qRyn9*oe>$Kn_*Tq5h!;S!kN62)FT{T7nRDd;=RYv>g*G_Mi^Rs{8?^AM83iq~` z2*%<1^&0?k2KvO^9^#-(73136OO2X(`<6nfd<2KW!T2WYQYgyDSJ)iztK`Q(b8+5k z!M-1T_2`KQ(>-*^mU@ZR=B4_=N;QhGQS#~r>r=S@eWZ71EH(BZ^1JEg_>N9+i1@?3VWDub$0DuT2!mu1MOTJS}Nf|SOfQ58Okbu+EE(Dv%2={E>X zqQ(Nv3N9?uPlRXzjhbG!UBgHHHE2ui+}!)231#L#1FO1I7rYJ2p9MfGs*Zw+yR4Vf(8&F@MZnFJOf-y%|=uqVi zc{;~%D-ZpGzNd=!lQ$hIq?fJ}4rn~mUL#t%*N0T!WK*60g#3f$Eob+Pw|*qbAzU;~ z^;3x%R9cJ4IWbHTQnOSR;y`I>=?b%W*vY1dKXvghAj;0;OslO1hMyKbbmkx2MrrbW zJJ*f;1zQVM-ZLDAj~Oa)x@TeD?21Vv`Fh--ffrnamYEr1Gfy2Ln);l!vbB)+ChH4H zA20Mx13%7b0)ICN(H4Y;JHAV+H*iMSkxjpu*L{AJUC6R52}-?;9$*;1-6gV0eMu^m zXktI_ZB74BF=Zs*7S-!(J_&Dd`b)!e_u6OgT+WV;(9|N!>x?M5YL36Vj6$*VLL&d{ z&T}7nYk`wgwIe~QTwCl`x@)SU)z~yVf~ubZ^@=|4TH$}eHT_pJCpOZVDZ^aYqsTc> zx4)MmlD)&jWu1O@S#ajtT=kryOP)5O3GceE-=6C7M83+HroTbgq`Fv7&Aw=*Y8)A7 zzih=~mC5qZ(qB%f*vJFjg2m#+<~atiTfRn+r8T@UYGlArH3Gv7@bzC!{kjf!=_O{! 
zRS)~~0)nM}eukxt|5ofo0IVW|cp^X{ZJ4;>(j`jUovEB@B}ZAqZ0!bRImH(PXC1(b zMfS9zYT#F!t-)Xk?>!pTt*~*uKv9%4OZrq3b#9bu0k0brJNwR}W7b42%R=+@$E#4~ z@;!;?4v=U3)oKu87Cfj0tnbN;sn7$VjV$a*9lH?q0Wl#YO`2@iII^Lqm}QBaI~k%Z z>Y2Lb*!iigHEPoEdhPVLvs7m%nZ>W+OCfvXY(USaf))m) z<{*bzc^5<5_3&TxYBlxCL9d&!Ud{q1x!OU3R2@n0RpDr=qK9hI+#5ycj(VzvBSnmW zZILO;g#b$Y86YzgmAE1Eh^}Y4w$7$VHNK0Dm{=cl$`~+=)5fa1pAde^{miIOtX*@T ztel_i4MYLz&M3B!kM<)>uu%$2;E2r8%(6$8g`^8)q^-%t+$iYGXQ zM2LY4Z}*y{KRHkgT+lKs$>Cd(s%Dk&)kCc*(H}t>1UN;A4BMk71b2lChLy9{Yiq#* zBY2Z55JlvgQy=fqejYE7t*f8t(EBtQkUj_j62Hm*sc2#$VT%rvn>~42b0BJ}eDhE4 zluc@Q3x@uX2zNjj=wtky$z@$+Wa#3lNx3xV*T5lNf_Qrh;owiI-^PV{q?4&m)3pkJ za$GnvTRNr>1hh-1@DP8(A)HDGv+K=Xu8z}*26?HLTa}vF1Acn$(5gN8w9}fGM~yI6 zadMg-L>@1RH2rYS2@=Y~;YZ{NqtBt*bi$ zq3wZ)^hyxA3q%}yxCUKjC(X4wi53-g0!T|ZUJpK_WLTArwN4jLE93Kot^@iW1^hEn zQBq;|SvlP%O|>f@m7{546{j@{lJB!AdFAZq7#&)amyOIYx~Kb+R-dxbiLH{N$IrJ4 zXLt%EXXa~+_c6$X2}hRGB!w!SCB;TL)v7pq^gF9rj>A^vZ5H-7gR$E77>bL>GqdNk zvCXrX=7okPSfepCFQwX5Si)NkTGy;4)CPtO@d8-!_kh~9PUB3!2Gh}6K!!J+Sv-87 zFRnOf=&ZWeI%*x$Ri>WPeEav&YhR87xqLdpTv9FJ!05py_iW1rFPuhU+K#&utu(D<2df-g>K;(!JSHJ-jWHg@VK6`*=~A{+?JM1d2bm7#3gu&*eUJWLYz|Uu~n7DB(-KeROEVY}#H%5V7it-(py;aQT zv`}l%BdF9Q7lXys<7C)mu1>-ySM4fP5fy2#%T&obtBDG#)Pp-4CC;L) zw8x}bl0>7Ag)qrnU4sTx@M)*A@r$$TnDU-6sA6m=s5fVbf zX^2w2&!oa)X$a*Dt;|WRJO*vws(i`8haimm`Cj8zr#+j5AZb?ziCvUHU+2G%GJLZJ zOOp(3heM;?Bg^WS)xp66NhY8FzRC0dI8&$gu5pyM;A6P&yQ=zfC!if!<(n;C@F-;s z%bFx^>hm~7uJNz9G}a^>l!Dqk-iP#_pHJ(k2<1K+X}AA$bXCCxDV+@pRufTeIkSp1@r?tZTqj$uO`Yh*z+6 zso{hvG-E4BaZiYh1nLjDRp!*TCdTOB#Z1{9?i z;y)xl>D9bDGf^s}KYMkE++(S95BsB|y2Ty`wg2#VtAfg0ZC_-JS*WvYCg)b|Qz?ZM zq(qZrT45|})n?H$wcmqJ^)0|L3ir*wkMOtnBsPC`NO;D7zQ*cn$eC^HHJF}P!&(fX zL~u7QJ%p!PExJ0FDLRa4CNneA|6K*uyKKb_{8Eo>ym<0SiXViw?j>56_G zX1ZOkroW(m$j(`&`Ap4;>9fY=I9$Xm3IpoX#!>=fpttKE9-mHtW;$me)s~=M;@q0m z#vhoN*k(WGdUhF>Y)msm%g9>$FsxdTZ~#5^leG%C791CF|udL~`&LecWGevj4; z0`pm}Pxj@m7UHSLoYu^30nYOnqV(X35=lclEyFnO7FJmRk1-zO(y}f9Qr-WhySj<> z2g1muZ|8bHa*PIpCR#HNtvuqu!($7Ly`qDHZQz(yvT|-t0sM>8@+K7u4qN_gcDy@2 ztS@XpX;rlSL+Z423NvieH{n_u`a`xuQOU${M8zJFQ{A95tWc$Eh;06u%9lDA?Y

0j1XC(j{+wo9R-}AtY^T@FfnCAT=jGD2D#m=yfF%~EgD13*DItl zY%W=IgY|nfOZ(b;qlMA!E*;JvXS2eG7rWLjfYl40MA^J)J&9H)QZ)3@_usppN{Vz{ z2AO{?Xv55!e>}2w)R{G^aHzfD7y-V6pjQXg628P*>oTd_^XQ-0)(APIeX`C^pJ+D3 zXZ(}a6?sK2iH#mJOLip$5KPdK{RD+7(0%s|YrH?VQlYL>irTu4$I?9H$o!LRtxp?C zt3yh*;`WfgCIj_{3m$>bH?Ftj@-CFAv&Zx1+Diu~>YY;j?SCXi5I%6%BV4DBHJD+Go=F}&k+Z3pq7t8Yk6$OQUeh54oWwXS zCO64GF_?X?cKj;ke>}WDZzXG z0^l1?OfN}4WPOMC_4YrvGVi1YkT`p|C%9Q{EG@5<96LHu*v z-SLk=Y@-s5yUZH5@8k)FNrmsKZ5Y(=VE~%kyPKp{{R5~Lii-mJ)aLR7Zo;S+D`Iv` zo+bYG)aZZbh(w9lZrh<#UhUpp5ZA67JCElwzL01`Hme6l@X~nH*4|xKtN8uTN%C<$ zFQYfk8Lu==Ns~c;&@#!rKWk~#`|r)x!dKMD8x{f zw5yt*=t*_^y6@Kb?AzhSb|ar-Dsz8zxM-ZNM}B*asbJ!K#6tQ}Zf;mwkoD>EMsprH|qU#IUQi^xbMR@au``o_PF)c*HWMJ1^@hjWl)xs-R;!PJ~U-- zE;?%8)UPph$;7MwU@S`fW~+6L#_?_uRFttt%Gm&nYt!-^RF4m8Mr z$}KfwLU+=NgXW-rV|CT46Jsoqx$o<0>`GN{98O%}aCpO@*z7CJK9mEi&*wGlP@)5u4x&GfSTK1rB2_m$O5+kT}Wj@|AddAE?N!yCTH8yZ0AuAZhP_U42e7>9br(sG z7Eo`36bz3zj2HfPOtT^umuQ`^O+Xd@KI->$w>I^bjez)dCGphTsh2E>d*qS?5~Fq9 zBfb`gGVn~CD*OnjR5)IK-{C##t%r7rHswQFf`b{XrfX>rXrfmq(g04kEuW?xMYW&5 zc2{8z+iW}6pNB(SF2rf--Zps^Q4T17AAq;5w8lX%r2@TtAssOz zQ)cc{(9MC4143xsJQ94##Vb+(_Stqyn1EP zPAiNu6vm9pD`a-}w@ zE887;zAnk9YyrwbH%UnV%4C^h)9rIc+j63N-oDP!e{I#$x~?{({n+J0fJaYz?hXDObOd&O;p_%8r-%to#ou z`Kye;h>CX|s|8b|+I(Sfy~c-5Tw z-xNVBY$7yzUUDo3>W|il_y?VYRf}SaazYYGpTHt2?i53vv0E>*_eja8wl$FOwj|p+ zmhE!g{i(BsC)}yDw$_U(szNUHq=Bl>rP($<;-&2>&xfxoN3ekIyOjFOiXeH)Ndp*yi$0mSq#>(zp>S)5@rGW zA17y$uj)5(u@F!rt+68OB*1t|_^3ZcD4ZUW5WTH{ZmhBzqc? z(^B?Y=_+RfRkCDT5|ETQ&3YpWlPwcgPfq#v-|+u`>G$_nGOm4CX^L+m8Q1>LJF-ht zCl>_VL=jCvP~%)YS9g`%7T)K}K)o@oL^DG+7Zk&moK=zS`=Dg(Uq=S5G+LBeB;Rjxp0$*(plALto#8ZLMw!hgze+`KwvZ{Z427K4{zf^S;I^?y2Wwl}qJ12xgm z=qoE%weXqb8v9~)j1o;colv!2NZ-xN-taCu)7ly=p{2EuePqHUeZney`Wc)qFl;7i;bJovk`AebJ8Q(nVq zp0qekckMnUIl0^9?CPvJE#AmP-HMry&W7sxlB)Se9>h8f;)Uf1nK__*z&ha|nrCE5 zumvVy78Arb!#+i+Nl&h*0^%D}Mdp4Zivc~;V(psRs@|M6t*oBjYZ)GPbHv=vHkOH1 zS8^TeyRbiK&8I%o6Rh-IFsY#t0Z;BawTcars#SU;QZwn}Zr6*P|AJ7ViHAb#ll|_B zyEFx(OJY?`M}zw`-6fnkZmc;V5FrPzCB~&nw?c)H`E^e&n7Bf~bXYxx{!wpxpaN%$ zuyOe3`>9sVK)ysTrw2S4e|4Cb|BUKwMPUrU`D_XWZ5)Im6Ika{4a z=*Q}gQPo0|5~R|zzt^>=NrU++jjl0n#(cVh0g1hLV`c^w4uV;=`sO)&Z2s)B z1VmCty|ycTjQTHmel;r&mu9~e;V(Qr%pX3vP(E!OZ4axx2xK>Z*T$-9a1fyF#$Nnx z+t$fw8XU-@tX47x2G}_VTP7rAe6M2zd)|oKg%HEQMe@e;9wl@m?j`4W_6a@;97`*6 zxV7M#kDA~`9~W0QTSXLAzah*f?7R3~{ie#JWk)#bdHAP9aOj4}k>_vi4MnjV9KX8io3chy;7kg8F+*#LRz`!Hr_-21d5SGV=NlN#Q%K~Po7g_PYe^8nFeFT+ zl*kDgrrt7I^-=yE>EcZ~;mnYwLPsg70r);Bg0Jo=Lb*~iP6I4!_y4G6J~VgHk=SNd zx;#F4l9Ry_B00axII}`U{(n*TmQigsU%Y52HQE9N3KTC|q(Ff}kOD34?q1v}UL?@B zw79#c#VxoLZ}AY^f(42f*C0uN^St-2`|bSC`Jemct~(!+weqZIW@l#4-ZNW%+w$jo z84`g46S4PX6%j;2=&F5Q?s+ z6X;yU(qF?_5IE=aK#9|XKv5JvmJJl zv}TJNs*ur%MQju1-jr7k9z=4mMLNfs`p|Xy)kgq-JFIB(RgDZxr+jbH;$J6s!9y@_JCM+wRz zljlQ=<0_T8#m%iMxl;aKMrtdF^b>FBr$@M%*w4nG=q5udQy&GE1!ke*;h+EaG>TGD zH&|LK3E3Jo<8$y4xp~P{M9Tzhv*%kc;W7nJQ+HY{-GcYg3cc5z^$!9;_{N>QORoY` ze1mmQ4vsfnTesT5((B5!6o8>YJgj1a)wuz(+V?&Q?i5&1=ZJ2R*doPMyI;yi^^Dv9 zL&C?uxXG6)Y<3HJ`|beKkuc+Zkl3kno_Y~pR&7`BLrz`csH-&Rq`rJQG3FgzR>Wk) zX%gSe@iF0ckZ1fjFKH}HnZ5n8819s2~z7)|RFYIC#@5|@!W$S_BVQZ`srix(4 ztmzZWEa6e0xI^DNe+ys;T08omBa*GrYuG=G#4$Hq(Z{G>f6161jriBaG=)|ND%3#H zk>Q=ViswyxPNChpy1PIT8{>YP$eA@ko=5v`;X1Da*PDfuG_wAHK+O6Y=+GbQ)VUok zuKGzs-_LD#r|0if6b2FcZE^0Y{uV#q;H#Sf5-2lzV60eL(Gd?F0_qR08l|Uc=ZNt~P42GZkA8y4Zg7HnG?3Zrj5kKfidg_pL--tm3|rfP{4 z+cDX2j1WtyDU(!449nbZ_loK-5g7D=dL*bkf4=;*Q%L0&w6`=?a!sg>8=-OIe!b@J zyd!qa6grDkb`|1&{%MoBxCO=)C~{6F*ALsB(PI+!)Z*$Qdqp98@WeR~KgHaknx+;s zjX<#4ow&yDmV$G+8tEj6lk|{h&_Y?I+BI@$LwgbGL|;9&2%SG<7ITauA%A=esxKM6 zEAcw5N#Z2=El>_gX>@~<16QQ 
zLHB;`O@R?2s?#bo3sp}cn&5*CGiP`?cuYC$Nj_l^$JB5=~y+%#Qn;9TJ-pytdiC_AUN)4oEu z3W$~;bNeB*k$MYaew!Rqet4L9L$@n9hmRZbuL&A03$%7W8a{%A4(9%8aXE{f^33-Y zgHY_Z5-xTW&z5ZWk02iTR5a!_5PWe$x++>bpSSqz!-{M@S1_ElaAjz$H0uG6v|?vn zu<+<51K+{e>GAoZ(-U|%JnQaQ#PcGDfW4}@+HCF zk^z5?AbjnLjLviGCF}JO-3zN(s|VjtQV|1p!>G;YE_+ksLiSiIo)K3U+GtN9UOU~% z8Duox>R0!XQ*k;C^!KA@GtY3&n-w`u7B34>(U?}=TOJX>?E!!KhGtFR7DNbgjY6;= zQY`k5sA9Yqe6QIk)YDWK7jZ0zF zxU~7D+n*WkCP<1l#k&QuA#8%G?A$D78LA{rT}IR-{0R(ps%9m*a58!on(J2%cQ+un zAmvLgyH(1y0L0hp9{`w3@l{#loaj{_Vs6S8v+}rC)#Y<|H#dZp(?=Bv3ca$t; zs2JB&`DsIW(7{&FQCN9%C(|d6xCU_~)chwIJegg*BV|V6xG?86%FV9v3@W|2O39DYvU9^fYydVF6cY^V$&4zGcRd(vFg zQmC(((Tvi{mIvR0M%3oa>-A{$(YK(#;IS+ZL~uf`lHByg#c(hYI&<%z8(VRMPP=Id ze&I{oONnk9^^aykBXS2@#?EY*;RPQWMt|`dx}d8$gPd}%mTF$Oj?a3cOKx9ItHT;t z+6X|kziLi(Fw+?wRbNarnRr9(jSXzcz-2NUz#tX_|#%^O4K*XL#h60<*F z_2QVQKirRStdr4I)UyBaMuzqt4G&qGn#R?Lrogh}x65L95S0>@6+G>@Kb8p7OCiStR6L#&Vq%QigxFxBP(`Og~g>;3&|Mp7*Xv-F>n9@YTC1LiNE1 zgNL!Z^F|K_mt!!y*O6tX3dmP`6{CLe$Br*kqf`r~bQefyl9Zes3nBVE8N7T(hR=tR zhmXoOOv?-DzV3rJ2wPUM=GsTBlYObJd58P$p3_tI{BKXBVZ3MTemSGzA%KmTw?!U$ z-T-NKjxaFb%p{!hI4fy?cMHNFx32zJPs~elACRB{Ug_^r}4O`8~H)+p2PnrvYvL(2~r`#nMOf%3u`WTkEb{&}*x^cE=lr zxSD({uWZ{1UiJ2AwtvhQH*^apN80_o>#0Y{3rreNJXL-9u-SW;l=%JZZbc%h&hZ() z+u_(ji-r38S*B^)xK0zao0Ra{b(VbGM#Fh8+~w}?1!9IRAB1i>&!4mNSoLc3?gD%f z9L{eoVWVewpc;*nPG4@1uMU3lbUFTs+|=UoX;h>BJ~h0r^V>BYhRjvf?F3w3VS`y7 z4kq@_-2Z$F5-``lkO*EA=*Qlf|QeHb_F6JBE8UX%#i4C#?&c}7PAYzEf=!9gK zB(@XBcytRoyai?4f`Cjj->Li2!L~Q<&@DUjThP%m74*^`-^PvbI9HdcIreNBu@Op- zm~P+6YF-06;9Cc9d2$mgPsF5juW{t=Tr2T6cq!v-06w7#aDP>0R3|_JeXD12bM?1hLsEn$W76l!N!W^es#zHOKv*)v1s#7EwIND1zLAZLqgdd zSO4vN0}%pjKUP zLq`gJsyyqz3L=;TS`HU2;Y-9brHlEDcJ;6A8?T*`{wH*_{eR9?P}3D{+)rBcIo7?q zRW!Cc5!v>tG178B>|V|!r&LW%zbhaNQ`-ZA#X?7inu#+6Q_b0UAVk*Jp##9Nt8XaE; zJG!3<3E|~2%ynklUCkkHK{#-q-3>KpcA@)?T^c)#rheh-YjyXWulsqQ-VE=Lr(+KV zm31?Jm@pc#s|AF7uH5MInOaxR?vNRUlu?ddDbH$sMIOYh=F>)X>aH>rXH{f5T|H=F zZL69KW@HSoT*>d#rgmqE0u)s=Q!KQ#`j6^;Va?as7QQhGdD|{%rz4h zY7MNOwWLf(o@mx-8d3CJu}4_xZWbDuvd3e?)SV-nratJ2JSL%sTfRCZdVQ>?EFh2P zY9ug9oJ0hj4S*VY_J^<;D~*0Pa^xWVp6r&~;T#_GHD>87WY0yDna(nMI@+;ulk$0k@_a39iFUKmZIEa2UN1;2SiY3~a#XeNZ zo8$q(uC57(r6j?Y@8AlGf!@X&BI-qfEzdb)$^J!2ynngReQ4AUKli;GjF-wzFi`X? 
zvh!FOaAKkFkT2X@R8ohy=}LFkltCCztlds32fa-K)x%Z(6pn!8dob8R!hpOqgRgc?}z8^F1T@A`>q;aGEdu2+S+EiO0FT^c|k`O zG8Cj5!wYV5cRAN}V>)MMB|dHK$mw$%2gY@ac-TZ_KfvxSRh?KBG!8+ea6>2w8RP?>eQv;RM&6uz*#*bdlMx+c5bMnVq z05MqP6C1YyT_wvme5YB!Bw`1wVChYhh`6|6T#J6E@&5RseNE?w_k)TGf%CAF=&Fg1 zJHP^Jjo?!ro#Xan4j)uAoQDCz3Qt>i&Ymqks^>?5sDzJ4vMgXxm+~VJ7iZA0!XOmmJ}4z&SBb=<<9ZdJPzdZW;01exb7X&&S<9_I z@oe!q-fnkWmAqu1zE?V|oa7lyzla^b!Aet-M08I{s-bgSIS0%uzi$shH3mC*kQAC`|S0D5tX+AR`HQ1y@5vG>HOEU@x=um8mp^mJu$fY_`uU-C>9!_<_xTRP)90{)KvWBK6( zDqDV&?PN4%nxbBXvk5+5LWcgFwDHuxWZlFW2#^Y2mG2@bNf|#hx#fYQF~O zY42){^*GzXqYm;B@w^8$&1=*4zr6HImvb^Ys4pYrBFC5glS1qTrHB(Co4 z&6{vdQ|qJ~D|~E>^LBzOi7D<%kx2CUPk^w;?s}!V8v!>*;!Y@!khdII`5_G zwRa@(?_I`BjYz3G|GRLx`tFV1hp%pmz2^OQ3T>nl0k*5}%$Oy6_&awRCTUj8yy$cm zG@^^0fLITAC`qr2YjS*S3Hm2Y*zQMbF@(%6t0B;?A7u8jjI2pA}EzE3~Lm392-YO3rwNyD3VbW~CgtLg;mr%J(eUz!F3 z;J{lXPrNf5j;N#Z3tFV0<_o9v2 zQbnw^19D^2?uJ%CsOqL*al-|geJRY_&b-CQdVrPBw`f*a@W=pab9&tfx){xx$pG7J z`g(!iNQi1UOFO7zupd_Cex4kHj0tK#Sym4|ktAWVC?|XVgsWSe429h`%~qQ}9>(1% z)KklJajiP`D;9;Kbx!5#R~Mn(Q%CWe`H)0|^dU^=WqKl}=~QAlMStq`CIS$Xx>KdyLH_M|Ibw!s-X<}iw2>ueBbzWv{}x0_J(_-lEQH}%#QQ?93l6f%zlU+j>z)q} z3twcImZEuMwF8BAtQfbi9y^}pu1`8wwwn%`|4v1j{T8P^RSas;XnDDAVTDu_R%X~y zYl@AkXvY3SmZ6@+X<%4fH(%tL{pYUmMi9VpjlLCv4~lT&!33hJyUL;ySiqhGP~XuJ zzQf6SI=CP?0Sh*aX@o{EMMV|R=egE;D8*x>iU|{2j#$lrfJ2J8BH=;6Kj!Dm- zFmyv`mRdIs;L`OtH(;~v?Giec?Ka(l`iL@j+@>!05e5Wy4ks;nM07^_^BnT8C3_$# zk8VLb5)uA3ExA9!8-de_QZf4RwRRa!JgDYWJD@?7AP$&%V8b;WPI1(!?(P`71^>_! zMUX4W?3k`B%Z~jCOTrkUI#VipYY4Zut6BeOeF1mfvEo=KVlVVWymtMOP$~TXF z#w#Wh939+{kq_hoIoF;D3~`sEX^i@MXX4O%m5mbe1c5%C_*X+)OBzpsW8J@7ZsIQh zxG{c1-}w#l^m1lV#mf%&TV&xAg!t)E;$d0QHB+9R)d7F-r=dE>?s3-RGr(4Pi3kW! zIF28Hv`z*|d+Bzi5%vYW?u>hsz2UH!)3uFrw7)UOQujpX!HlMF7U!!;PxRcfV>nE4 zyV?nNua_SSU#yAXF>M0UqHew3_pbV@Yp2qI z7VW=6FCX__BSJfsy-$|;Ge0Rxt>JwU_AC%P3{-ZO%gJA}x+I8#_3P7)vEG8BO?juF zrRwY1h(e1POi;;}Fn2YqO{-8O&aPP_F35L2moHg;=Xt(2`XadutTwnaEV8nTMrLIi zm1^?k=mKU9HJ=$ty$Us1B%O&%=a0rrMC%%HO+mprb>Np+{#EsDDEX7*ShBu5dOoph zJHw1+mfVYwzRrI7F+cx^rR&Xzn^!@-z6-x^zHMqTh_uj89%3sW(KqWQM7$+;r?N#a}kqa$8P^{k+L@5}qxa zllZx@klX=H?yzYZ42X4J(el45yCnf^z1kKbVk0lgu&Na()V!kwoUskP`r_HRiFwQ} zdi_NX%7_tAtf+2Pf}D!cc8RU9bP!Xa8Pd(!R9n&qltt65ooLc*T@GlXzjQD*dNpPu zY5FEV1XGzSn|7A&U1g*TKfENcJrC2<*JF$QGcC}QEhFfhY0IWlmSCOHzIH-|J3BE6)K7g# zh#qcK{|$POiF*c}S>4bW4l>+2b$gva=9%1iFwGWhPw)kBecWM_K9{S>+HbA8SO2Bs zWz*m6EuC$60CV`@m0(>adDj(J6*CjFEm=L6z0uOwDXFqbl!2Ead1<7jsA$HZ4T0@{ z>plU2@E@F^Zl+7Gn!casP!LHC`^>aRI-74#|4jFzc?SD3t@>khwtUPz<$q}-kG3xu< zH}>V|XL_0L%uV*m1SRA}o2HALV~6yO;jkA2J%QJ~C9tF4_H{TP{#lK1QRhGD93V%} z%fic(R>^tPxAmmjdPrA7$L8S{QZ%T%5pC0v?I`B3$2Yq`ajFnASD~BKKTI!0ai0XM zjhlMw92g>*t_vP~_M^@IKhpN__spWrFx_PN+8d;Xr08jmyenEy2-b^|$^v3FhTfdAnk6 zUy>rrb^fe5RiiOkOEQ+Lb!1c|Pp1n4ow<1B9U5FC9!83A=5|pcC4K~d|BsaYFCQ?W zjN{R&(v|0(PQAj7S?;dlvQ_o4hPviZWo^=8Cq+NMvYqOquMx6XkraAW?l7*L9e5d# zZl{vb=NX^;83Ym@f$(JmR|~4qi_g-;^bn}Q31z>iag9TL^S+-E>IwJg8b_nU!z;xB zEgc{7CbMQ$>}6g`NXT11EzRHjx#Eu!dwGHbsPNtZWa_G?QA?_rzZn(m_gHXA-B}<8 z-0}9aGSib2f>+jO(L_C0?FH@s3|SXwRXK3HyP1=Vq?0jW#1<=DUVr8`0D8D<9I-C0 z{dkU&Xw!CcA!pKe3j=uP8jL<9Y~MwuaP<`ZiZo3ou~dBVEqbzcXGNba&?|QN9vcZq zP0~8+3!sDlwh{5+(r-cK4A<{yKmW&fdgurmZb9$QD2|pS-($Q`0f1JFBm##V6ukrW;D24vJQ!ycT-mhEr!O(I|mPW!f>NL zqubEGfKF@us*16@fbK9XFci-l+wPjHx7|$TL!5SQL5_ikm#oDIT>tMI&$(kY8u$M^ zIscK!e;<@XjEAv1O&bg4{)v8J-CL>|1B=-523#B}q4LJgavV0ZtsLT3rP2sfe!zFld64J z<50ZaV{3$ZZ~vlFKhCAPK8)QJ0P`Tit=%esh@DKicEpkMAo-=h!<8C zbaq;(j`WdtPEew`UjpaI=6`W6p%g~uc&GFIp3Ptxd z%7nY;E*#lw!$iXdbC*q?7IH{keFkRgf8L~dnVS>(<0OuUq`h;Mt)g4P_*`h3G#A3O zgeNGWiLN0sk=`a1txyN04EH43xjsH1W38~Nr>Rt^U+uD(nY-kKaGrLuo%PC1f0cs# 
zxz(E~%$odPMGQhDi7785JzGf*aB^^oOV;l`?cs5cvjsV?|IY~le#P(J+l+#d$)k!o;8Z(L z{Sh%c1aoDtJ+vz;<91vOAibbU&v1B{4vu?hpsei7mDC_V+d*VvG0XGBqV6*YM6y-D zjvEauAGc~rsBG}o3R@JNAFooZe`)1(%yICSsxxak3b_b}_p+StB?Fd}DEo{lL%_e* zB9jb&nll1Dfb>(G>R+AaY6%5{%9Z|+f=aFKL#9;D@3^fKTE|11U0Uxqd+K>Ux4h?KsBmm;V;j`78?+ksC9w1%7@rUACVg95GN7*;Hz~%xDr**0+&L z?%Hc0n{rW+^^Gv``Q4Ng!DTL=rZCv}QqW7W+Km#X=Dcp}dSmXCYw(K+D=Qi^Bmh9p zb5%L_tz2DOX~noerqp&8{3-Th=+i-`71`I}eo$IyVt-6~M)}YFTwirvHSXzi7v}G! z0a{yuGrsftLQeS%yO<&G8y?G>9T=T~9n+zu%{(WZBXmqzM6&sToXmFiZx0A+y)`P( zS&Ivq`wK;4G~Nzh`->6$P@%KYi2dYEp;w$bY?h=Oms%|u8`^z-8HoE^NINj1!L(cC zJAmMdN$UWN`FHfiM$2;IUf(kSoPoP0<940&a>#8bJi0|jF-3{-R{Ng zMKD3l>+DQ#{+Hu^v{gQhfsbps5*}toMNYclt4!cml-9zt`bd>Y#Z=ZOTil7V9a;+i z2$o8`7kO-X&NJ~x$#pd@bC-@!#V%;J1ty9bF8#i%{*UY&AGmNABMFvx^kvi%;ZouvYO?(vuk6C~W#&5SOEXu~(DoQ?*hf?DfuRNcNrPdH75BAH%ByoydlxROsf31Hb z$;3>hjE#~y5%~?e+Xo2VFn>i9e-sy9O4AOp@d;G%Y zJbXv*tV?b|A{{)?V{GiMq=t;lX{7NZE3Nuk0}f+~JJaHIKbQ@MI`ehC+hD4dzZ%tw zAG3ult_q1vu-JOO%KNIyI+PyR#L zatB&UC;+EBv72GE6?v|(1NHIb`B0+h?u$$rkwN zBBtvH+tWTQcPwez7z^cOgqFz-FzQCce?(~s7#aGo23qT0xkVl6of&2=W6ve&xV}(OPWMWaJr9= zAK@F=W;u;)E*|fD;XZSAyo-^Ji^^(t!Gy*6VD`V}#)04L?kBI7f@#>UhVlqn{G+{n z-uIs#m#IjExg8_;QKxNzPf>Bn;61ReC$YF!dk~+!0_3U?UAbg)cf<0hH%K)^BfPE@ zwB7ZI)2SuIOko&+o!A!*-x9vg_(cE+Bz&^Nvacgu<5{^OCd}T>;28B9$EO^vCNQH? z0h{6>son%czD2L|;$d#m18w1vhvp_g5Z9gPU@9p6wc?u9SD}&UU4I>MGOKya(p4iP z6C;neF+%@M1WM$G!*Q*WvJR`wn(!ik+e~2>vxEXnGp?&^syqE@<@X~X zluh5uNgIlU8ig)J?#wbxx~KNPf74D&#xuYE+AgLFUe(Wm^iI!t>!RNchgVSkgcf-KQ0# zCJngi;*|vKYn?)Kc#*WFsO`86UAiDKE(>eZk;8dRea(+VP|B$YgWq!MJowOb1LVi zHRa`xT*tLv#(qS}g_`PwCy+jA*xGgUYuQk25GBo-7md^TxF1#YCO2yulcX?15xaDf zpkIPE9fa@gwkMhWObGupqW2R3Eul*u&S~_>%#g*mT6qR$!h3>Pr-@gecl>n1vlcsQ zUMSr+psfjM^1SRVw|$XH%K#>B3hKfj7a}R-x1l*5nrs# zd0+5v@PJ)U)2rn_=-j{C4*vUPzuzN`-$(gSgCVmSsWA;Z_YoNa3B%BV!M;P zG?`q~UsdEDee;Xt5Qf=$cQg{-cH6obG%Nzi@lMd#a{Y{^s)--3 zC>}(@N0q^DA#Ag7=3@z_epROIjYn%$H_}0@d$^5iOeJ$ck=`VAo#lkL?2NZ5UB}a; z==ME(_*JGgzu<;o2OA4nW^}nid6)uAzTBm~Wo4ecD658eq2wrWWNQ_dnV1B)bd~As ze*3J!)XZbn%7{8nfrCbmhMDtCFQpvnUbWcfsylForrYf}bGcJ=? 
zC^fAT52jyf)t@Nib3E?$PI|XPM_&Ez@{{(E8=l|mtY%GNq%W`UTyiEKub!xPzJbdl zm2W=eQ#DS*1!mL>hE>w)LJs`frL;h0kQr^R`kJJ)m)B!?Fq?)zByjB&qX+x-5AfPM zqLL~xdhOF`4fRkyy*kZFcXw*QG3cv2h*McBHT6|!H6EGhr5;pmwnyz~N(xuJHyGzpdjVysat*Mun&oSgc)@N4C0 zf7*_W5eVeY+2?;>{XcK=nR{{ErITXxJQcaA{icmGUd6_S1_ot`#*6b2ej@z4tEf$C z0oHNU)7o|4vjQ_7AH}L;j|e5_ED?eyfbb2m8*3iD_}5|*0OB(4d0@47_(DQNq`az$ zg`B9Nwuwcr3t|^CqnY~9QQ%%#K!C*PgX?+a6t8tvM!B( zRRW}_d-(CKi6LDb+X%6OKWq})) zTyiFt+N$f-(zNr>HPyfrkZcOc1Ia3>pGuO?8pT+-xy>OTb)$6^a`1A4wH98)SR|dY z*Kwm%N918SW=a(3P8K4%%0u@3I)N z|M>W&<0S0&I`1g)6YtJ&;ulBr^15fkltz+O?@n{8=Tb+4hxOH62+JGiO>Jl2j~DHd zc@D@inmbgQm%gr;{N$FR+pTk?OW_qkMDDOIRxIq`*4y9TS2Z**Ld}M#%S?*1oct8ytD_Rd%B9ifw}AP{L2H)TX8UTlI~=;$icfxYz}W zo+UJaxL>sxxU@;$hO4S*zOu_U$9z88L9ncwgMuha&M0}3vQQleS;+J6l$&E_%p5Q*$ZyC> zJy+gmy+f6R)Ph$%eyZ5V*Q2Q2p7oFQ;Op!@nPPk`EL1G1+Uba>)hbijxE$7=g!`CeNfNP7Lv@l%pGm3J7b92 z#*IBD#TAKA+=9xu>3P9H-@~e%B-qz|)X@c~EQ9TI(>a1cx2L)J?Pbm!5a~A>EM`O1Ugc#Gsc%(>2pPAlKnwL zjjgdMNw&Yr`)VWW8eTdml}4!PtQL%UFdqV5k+i&IX+FI{s}Gu3K@xg&QADF=3UA#$ z$~)~p=HE79EXwy}i5&Pn?h*x-Ws2WXQ2@KH6sgi5^3sH&+wgw8ys`_uBF=UxulIJF zc)L<8SlH|HcxA|==}`(15!;MC^7-$ew|0x%*|?f(ah0$pk?ZfQvmnqDycsc0c#)r7 zVbeV^lB*`i7`$d7U47DF6%3b<5>HFb{EK6@fhZxdfm={%Z{cxbSHyk05{Vby z=5*+E3sOI#q(76JcBeSBK8OvR37|gOA>|Xc|F$=iImw}DV`Z5j8X;dN)-asNq-}SX zcdzfA>MuY@QhWFrFJPoUcfL3fJD{)mq1^2u8281M+Sb!iZYK>n-@i-ZL$Q^PpqkN0 zAXPIT5sa*tv7Y1Pq5nWxnq4p`qX7>?%`N%=6ethWzIyd#4AWFtmf749Tl06`buM$O zu$h>W=Ga%|;hB z7c_>=A-RNVXP!W7CK5j?v}{AfY>~b}VRap?nRxY zATG_vG}ftT(U^CGjn-TPRMl3;TdL^zlpK5xC)`3eW%JcPsdWzeB8VC0(UTH)Lb7Lh zM7iY9Bmd0P++o~os+HEum@e1FK3UJJ@iDAt&Kwr$rqSm8cG4rtl_C8>knjtQFk#8t zKtJ}4&X=Ms$fjm3*Ilq$p-FO{0cDGZ^BX&bS?x$Gfg^)en;JvqrYnMH(KggPOY^PGxGV0rz^zMe~Y`@z+1&g(`aNP@Nyv0&G&b!(uUN5}xZv z&Rsa?K;pR5pI0BBhmEGifhFGk#J7-h3a_{Yk>PL~#Zsqla9a0Ri_v68c_*sFY^*1e z_vVJHxs*C$>=zN_rM4*+`zm(airM$5KdDvkjf!?i1A$M&lNby&gTYZpDDv%OfkTgT z$Y8~mIzo#iX_1DY)g(`0BkmEbOi!003Ora-Vo=_zYC|4*Swy(}&RbyYwNJXC5-O9P z=eJq5685o5D=2nET!6T;b(3&PBv9z*rAVHxcR*aE5HxCwKTWw8wpsrF96 z;0FoGmue51>`lR|UwTIxu%_mbm$K`H#1_Nj4av6a-uYTx^AMwvqqbcUN|KKOFXAlG zYeVqn9Md+Zxi{f+r+L)I3{N>F8fpZI^?e`U8KbTMplnFHcjMJmC0fVq=dnUNDN>V< zcM3V!V_2bql{AN!YuKFWC>?SuevlF@?$>PfOa=h!+PQ1SMQM5&*69`?n(GS&D1mZJ zn!bb1&#H313hPF1p>I@LNc7-l?~k4zPv%|Knm%|k-?de9r=2|sFQof6{a#4>xv55u z+3Qy3o@GNn@HS+6Z7u7f410ffWGJe=WjwU9TGJ0cAxJ~0QSsMauYk5XfU4eQc+wIr zy-8T!?6p$s@j;tpu;`KYZk)uDm=9&PB(-?HLBqa0HCKWlQb*I5Hvd}DHnff}cTTj) zUUO%=Yl<~;#6yv_*f~aNZ7K0l17l3eLzDR4Uv9?3IfNCMZkb*Bn8`R`5<^iHO^p-F z2IZcDRB0Lq`X8qS&Gq*Psgz#xr8x%MszJLAB^^uYWYjW4pDY3ASOjVlrB~{MrFphc zhUwIYX4xjyvghlX%F46?S1%S>9SSkeiqO}APuGQ%voSPO*Wcgl;Z!jFZoYDhC(AQ5 z$BTDEmlyPwxCMa{t0dHmq*sNzs;yFAx*rN5UZG-1t7aC+)g9`N$uAfr>zC3ANB)(D z|1a7*>IdZTiMIGNUO%qt=L#~1L?0*p=_(u5+E&_PyqC&rX1B5pX$@m|>KrIeS#r9p zo%IwlXp56#ZGmfm z>Ky;U>x%=CA088;zZ$j(;Bt~hE)M(gFu1Xr&B*j5_^-;!55o@;L>Wng83wZea)spQ z|9;Y>h}jwux{v#o!hTWlrAUsD5^O`Q0sNV?2o0?)<^~14vDKRXAE=wu;$jF^gikZXfA;fsiGxh#TOZ#Sz<PoU%)yrAqrc!FC!;=|DxPlG~&>DbEf)r+Z15#uQ$W zP=hPv^Njv{L-P9g=_{Jv_p<#`z4bf_GeWFU_M3ze+3R!6KF$-eEv)dm-M?CE^xq{X z|NexlC--D~641W>6dlEoYmZun4+vcZwr)aSXF_?7iGv`KNn;!6(IWR9{6josF&Ra4 zQbuyV)kM1gbRA_Pyv|kerE>Uy06}ctt&>;5SQrq|^}EpXGigW+4(2ne^LW zH-$+wO%o@Ac5}qT%1UI!GaK{yW-{fH=8b=ikb*+gU9CBe)adLOJ8t^Yh~0hu3TWO+ zZ3b^8L?Cf%a_Yf26*=WY`14u&84{A=sKllZ&xc#@t0J;7>m?12nF=f>KtRzq9uOhr zb3c|Mb=qGif8S_+;V-gZoKUDX)T$_Ff!e%nQOzqSd%*um=pj%wcSQUn{vp78cu(Jc z{XH~yN%_;sd&qy*`RzXwV;>E!5^K>JsS_7^B%iLjbNK6TK?~VrSBvRC)o+lM;vlEf zOE+g!Qf;8#Ua}9*?>6%Uq&NVSVT9&_)%s865F6{)KX@ZqwYvjkd3SP~#ofm^tPZUj ztlTF|@u`_qDAKul?GgzLjt@%<4X>6L3D1p;$%^=-MZRd#N>Qsh5}EnckiXgbm^F(> 
zi!(zt`g!vifD{|F2*NA~Im27HshZyB>%mDV_OnqE%QJO&_oF5k;yiFZwFm28rj003RZ-ht?XCoDTk0sfZ1c3KZg zL&qh#pn;5+;7p9~lmx>tE)KX5iyQ&jsl=;67sXZkc(b5WeGxckuLROMv_jd`W_P-D zZlKKhQN`2jMz~CGOHg0UleY3AO?!m8sH+}qXv5kV0Ax!clJd{OgVlC;;^Go?A3@h@ zmikr7XbVJOVH~k4G{PLwI%k9Mz0MnkK9sd~4Emr$PJhde9d zQF6BE72?-^HruCUxkaDR=MJ6AwgZW>D>Ya)oSM7pzp#+-w)z=xrVxHvm1QcuAxr6f z!aa8NE!NX|j_+DEdkZOohgSLBHM@q4sP66^=@sdFy~%GYUQiGaZH1hCTpJj%P zn%%*?l(C0Wgsm!4^r243?9d8(5AWDGg%(Rwt`vO9)GrCw^5 zh}#=V|6B6V;e@V$Ch9R2ket9?Ue1}@o4 z{vIs1+CTri13|OEAmM!?XCv&RI~te1^b^kBsHS@T`H0(+v-=UpxdP1$T1HcP@#%S; z9FZ4<;$mmh>}9J#Q~K$zKY}GQlo*fVx%yC?|8%@z!=b}C7s5&h&BhmU834HRHWp{! zo9sd$>wZx)>eEf$gqNMb9ZQoB}2(cNzh zSd%CZ#-R0Mj#ElS<$G*%wMh#86W&zwO0E3Du(n5xe#YIgW!7}kjv1ePUajSH{9Giv=dp^sof$(9% zvY^+@D|md7>gj2kiC%M5wJD0gBKsAMKuM2eM$%nn>(3)&)n9=7tgD#bggZ^ zvxH}x$%hfXeRJ)g18{_jFYHnHL=&yxCj4njkEu_kYB5G1_<7B(wg;e`bh{m4VQd=iFe)poOOd;@R z(p3G^u02JcOqRqKJChSU;(a{mcFs{ce#>!nV>(puZ6aK@L?$tTv^$;zOU);wcr-^; zq|ec&AozQq{;gERMDq8RId@K{_G^M0q~5kujRmfMp`q3wH%^zcy>qKke4PJOM{PRgbPX$#ZNDZBhs}t;JrS zRKJy+*WgVssAi^hs)~FM(kv%G)Qd8Xve|>OR`Z!cua~Ph1f?8$NiY))*u>HF4o>y z3IYt8_0a-4`R&W{2n<0W+@W}Fn|0oOCVi&qOkXo=RZ@0;+*cFoE2G6nrV}S8-gY3D zJSR0n&?6~I3lLqPAI6wuOp108H#md?qCGz`*)XDYX~I)X&5<@oMpl;fi=1BGMP~+s zO1Zd15i+jwO_COfIkRuE$)gZehj|%FvFwa{#Sg#Be|(XQ%P%DVM@#Lm8?G5e@aDU6 z?HctL8*SU-T~ux$){7_ki|n&2DGrn?DCYLtrvm`0k-S`+M`eHf|NpMTUfY`PqLa6f zx!d=lTe+-kzc>KX`5ZsAtlvlHeKU)bYQ`q2%ymw(EMR{vf-2K)r5G*PX`z~30Hqwr zS4`a;G={gCXGm;9AH(kz_*k&KpzzHdNlS@2JG80rbXwY+W)MTees0ayX&*}o>se)G z;HP(PM6NI4`o$Us5ok+@PB3T}vFsdzUV0BK$9M{_Nu|*=6E?6z`)1kAOLt0lDWADD z#rlj?rI(LE=?{x5ht#)=%9q82cr_YC<@B~R)*0e6Sae*2@a}%VSZ00p&I||W1)%%@ zOt$+Oe%hxDVmh4&PL8YKMZA4Ut7iH}sN&MV)UG9Upr^VzUZk{MO!X|SQD_u>#k{7I z1RQTEp{5}zZJL5&YqO6&7c9nA<7wyG!_1x*&45Fg!|sh=qsszDHpC(h5UP;ekSU!7fPN{lE7+x;?EocDa$>m4Z8+SW=Ihd*KMsM0M?iH_!g_OW@?)97_XBm2C zcq9R%oZ6|nqJf5#Jct!i0s6(T9}b<-;oQuG$gEn!JC?v?F2-CNI6;@}yRWX{gVw?e z^}SNN^v5M7Y$c3(i=b_eFX~qjXe+PdLmfpNUyQIzr3Q)Zqx2p47f~Gx=HFprUGSqG z(D>-;;#sPCvn@RhDzCU(*(zJTAv zEgtv&6T@v+Yy2?&=9z|NXk@ly&CGO2HT)WBq;BY3=pGVJq$4+++7@HRbpD)0e&IrX ztjIJikp0ueEERS8-QpI`{=!&9$(1bWGF^6w!~&Zq%zx?q2(=g940g2 z>0G)84KieO=mZb9DO)854rkI76LxgVec%qyXpISc9e`F8M5jr*Dj&;^5|RwTg(bc# zxfYw`C$C>i9-Gh7u?US;Za4}$iu(AZs9*KXt;8*$C$Tg)_@=wmt&#BDfvmbQTM^1& z*ut{whCy+ekO3Nj&T-+~P+$C{iO}B-zxx1NPw!?~5(kIY)6I#*TqfKjQz#m$SAr-BNlr!tm$RxaGT z&J)QAh{hAgM+P_Ftdi;WFqXntym=%??{}SOGhN-P%|3Zd=jdMF0N*hvnc(sm=l6)X=6x6*z{>`aH+?AWoB z3)|Kz9eWK3tB9D&>Y{^&JaMDbVHt;TO^4!6q0pVaCM{v?gjcD%y;zCXXdTr}VNs-J z=slhMQ#$77--CDpy2mRi#|qMV-|=ZtCTI@JllI>P$ZeAK8x^0(4cjjA)=~35(pNO# z@Y7LVd!&Bfm`o~PQ)e#SNkbPk#`@@`3v{E1TGaOJr2?*EJ&nw61Rj7;{->GIe}Aw_ zdEnazC>w?=LPY$Hr_tflX*J3|$(o>Qp^DUSYf2wZ>p8Ml!D1VPHrM9zu2P84 zg;omi7c7jq+K^Yae!vVK2fFEGoHKGwaVHn7Sn|O(uscPS$f~}dto!7bJ_q?qOU_oc zrSHVMOCa1UK0;ZU9{<*`1qtjGZ&< z;<@ihv+9jowS^w+9ef9UssdWzGT**EyG{=f&rBwF4&_t8V_PAm7DrG1u7ySswG8Oiit(S&7tjkmw~%1;#_<=tU2lN1FjwK+Cy|SVP;$q z(lk)cSXN8Sy_(DmmQ?pH?Hy43;$oTh$df+1R+OC-AEC8C#$k$QB~z z8=QqCvfxkFp#+=|ZW+J3_W88L!ADla(NnIUX+&)ky=y70h`47sbrD-|i$&7^p?qen z(}{XYv7=sdHqCI-v!4bdC*~%4deL-Aou;DcY^pT6JXyV@Z2wH zinFssFTLR7KM~1UHe!4hV$O?7jal+k>w^^r+ci9)CqOytE%=g9BoVX5-6ROlu*M)`U8`DAuF+^3ldoG3se5{_HExIdV zN3hy>*(hJ@nHcxBGp5TFFK>acY_s>Tro_MpL5kCRGnCDksKVX)j zz5%Dz?O}H@Hb*NP35WG2-zFZQEY&7m(odl_|?Y;LKOTu_>% zvX{q1X}w9OU>Bz&3RRND2l}r|89I_Tx8r; zoatrWCH1Ow&e4_XISrHM<5<8i^w{GQw8ce7(=_yRsBvn}P2*I`{ZP7i_uxT#ZL5|Q zv6<)b8Ib3s@lXP-@lL<7Znr#JhVq-X-j`w20eyj2ROCMXmgO;SrJWg7(khie_rS+) zlu$g6YEultJGO&d=48$bAf#fw#;e(4#8=#SraCSqO5U!Qy!yOnAOQDg1y;lh$-CW4s}*G4Pjxjf3hNe{#BPunX*Am?BD*?Y 
znLUd+Dt(mRpmw%RNh(Sry}G|mOILmtZWolVpUe%2vIWF9nhzih1X;tE=bzu+kx4wx zN$S)pVyel{9na@*8b;zcIV(=DdiNGP93mc!nee`#IKwvGbU4JH@d3kC|Ax)}2h0r~ zL}z0K3FZ;bVwHc~tbEC~6Gz6><#kQuu1Ixa<9ph;0_8DrZl@E(o>Po+#>Fh6;M)B{ z5lIh9C2T-4y>1IEoCiAd=AGQqFLty<$=O$a8Sv5>gHLCw%EdIHR+J`4g?T)3R3@Fp zMn0XZeS%vi#8{vT!35UtfM6L6?N7uf%oEymr}x+D0Z6@7*)55Tj6RG7$tMnlWQSnH zk7&Nk?qP96(>Hzzh+8knJNU;f#RG6jeO-@{TUp)Ij z_sePsk=8-lBiHb`^lWdPANUb948Tj*VNK`#E(};A3#;$nrNBl}-}GvjQ-TuqE#1W+ z#fAiwM-r(-d5;v#Np~?HkY#l^KkK&RSiKPA??{z?*$n_aJYn2W;N9S6QOPu`+B;{> z#}7}()e+iX4jAEmC(X zM*b>O^_Qi>GsT^RYv}>f#L946loW*bu%8!CN0WupV-kM*u%H ze|US-@Yu}Si;Uh|LJ+@F+mn>!au&kuwydI@W9E>sRX_G~z@?++o+LB%i>yuC7 z1Gi)?KgEWme%Wp&M&U$xHK2}XM5QpAm7`~-wj+Sl^-etRs zL`bbsTl)uzka!|%>ch_b4?gp@qif@Y^ai`@OKuZqP+FbZd~etVT8rK8d>$>XF`8zb zdXrYg{Y6_m^V(w<`?z9AjZwy81#OdaeGQ@lU!voO zS5%oq=Y1nb%JLKtn5Ax+(*=n#)hTJcXl0R|SoqCenDHG*53yBJjx)=T$=q`TD(F)Hg2%u@0I8|`9uEle-lLj>PHhl$ixl^&Ot9GjdB0Y< zV$nOu#OqiSlfApA&fq3n(#s7!QVq6n(=$7?DJ>oUL@&~Tl&y=OlmEz1Sv$;MW$jLW z%Gyo-x2zrOf63Z0|3lX9VQ2+`>W@>=_pvNxR1Idm*+xpbTj<311MejWMahjwC4cC>+RncY@>n{m`R&D=V-g5R|`de1_iW-s*7ugzk67B12l>c)+>NZ#o*`C4+d6SsdTgw z>?%Hc2UkK1kL-yOTz&b0bq1+5f;~#eYqRD6Z*y2}=wtACRVHqQ|Ld4(1~2AVeyQ}u z9vD@VKN#wNxNLI=Z1Z;tHE$1Hr{P`YgW%|6(koZoW*7gDj?#+XXAbkY9hGGe1HYFl zvFsLUYtZU;(by3ju6RnL8=nwa|toy{==gZ-Dy*&WXd43O0@)I!dv*!%<_d~`4OKr&0C$KZOynX0l zvS#~-{qVvi7A5^yW$9`w7cquk@Iu$VidKbJwv_vnc8*Uq zBNAy1P-ubCQ*U`+w`}axlDiE^gQjM*N$sUgZyrUIbI-hRLd=I$c~ZIGW=>%tSKEQi z9o8VPIxopawXlb#G@x|5a1U2M?*S+V0t}vh0&8=#=i%cz>r&?owU_Dr_c#}gSFX1M zQis+h?gE!TV0K5Ab=XhSF@C@pUjKmcuD!hfmGAzyR>Ty(0@xJFolYLgjsWgm9S#_r zrr)0b{W09vyB9xTzT7o>YeS;sPHKYSE`NW)U#`6m4>8I8daZZ&X&cFp-%HHHmQEqs z;CevK6m`&>WAL%u#ApzfDTHX}SLkknn)Y zF&-Y-PAB7sU}b=YpvrMrj&= z)DylfBw)DUO5c=S_%Ce`@_{*JSWe+%wuX+tGNahBbWC4mYgDw3Q2n5r! zYqC@#EB>xo7oQ=Ma(||#hhCv{vOwcM9^kk3_)iVD`Sm&+_l>qrD~W_5%I?cs+wTVU z!guo93lEHm+bMnVE4s5hUejl&_QzmmA3#@=8@fHp@#wcb|Le!F%5$|Y4#nH?UL~X0 zYTTQy%7J}Nx8BptQe&@FRn!lq3>y}S-JOZcpB1y0%k|X-g`4|jrAYBu4b`bRHWWnJ!~ zwE_O>Vvc565=n9lw>jQ%YvaL zFyDV_-ha}A*|C4^eCM;jB|*m#vPeL|6f|>r=ZKTKCE+CdLSb@NZk06EkuG|)Zlqv% zKV^Mll=12hegBKq#~MYBTLrZjOOUm%xzO<=dhYyc=w3~|Gg?2MUe#edBO{8IB#}kb zpnxk&a-UMg=A#Nxw!o;3LSk+((~AfI--&Pm-JZDiWZebk#k>Pp6@F|BPc_Sv$U!Hf zMe7=MM}m-d{T$QQO?wlHC56d7x9R#-#cT7bWnF?pq6}tIE<@EF!_Nnb?SlN;zuG0V zo2)J$fQvmnrzMIqai!|A{&1KlKub^{_;$opxWtW`^N;S|qOrDn|6b zpd>b;(bj-=u6A5eVt&Xa%)DKRrUWB;6s73n0^7$Hz<7VB0N|HCx7-W>{8GJ^q9IdP zOOy$jQ=Xe$=%gqJRH)8wYMOFXbrBNDyiXYjr;O{21z#9UQKEdwm0suW`vv@B%%AY9 z3}PDzpd8?&h117LwtLh*9cymM5Lurvx7@DGC>lUtNY4`sodPXIsg}L4fBup;ktb=8 z;=5*1_@@lMrGQHuSSCS$I;uAgZ(C`$>XNH^!t zJDkn=8)#FMcWBjnoVq{1_FtWW`|R@qi>dUQd)F$0Fg#UO3=loyY@z2k}l~h}W@2Uzg+-tNRlt=g5l^jR9LO&N9 zsx-H?mNv}A@;WvMLB|EOGK?MdK1Et%y@I?JuM0F=T($o~*-%d@`MwI62%fG(aC^L? z1u6w@1jW+iH=9|z-5zP%6Z5O2PdPwK%-?_$VX1VJs#?N@2F+3%q++`+0uMrl{8h@- z2n#h!%TE&(+fjCajoHuKTDWJ<;bx-Wr<*4r*IIyATraDZCqUSq*QM_k#ceLax;C}; zP`PF^l{+xu^;T@>4;cO|mPoSt-lJ7TfTsrLr`Qn;?NUzGjZ#1&Vfz?MWM%h8GJ5#^ z$3j}WJ?#}u<=CZ&_71$ zJkkNZMB9+tdUT4VTQD^wW!c#-J4+RF;)_ChnM6lDK*-*l&_Bg%0(If`oNZ^4-<}{k z*d{5X{$t<&S{?C`;GqOBD2m8$Wy zl3?j&gzEb+$qRk@SQgD(IXO+=2Vm;`Eklw0s&O>*63wM=vqg28Zbcso&igt`cX+VR z`c>`KnD$6LlyB|zQ_5uWAjTjUd{zZw3I@KQjjjjzn@>N#Vt}5A9i7`sy%`%lmy}k4 zUA<;?dj5s4U&qj1aewJnuFSohn)o|jQ_jb(_>Z_pmW;Xv=#r}iZ2F4A7k|Zd3gR7` zNy=Vx%xVjugJw7azMTxC6hzK$met8i%G{5b%b7cx7AjRfdsm`X_6{Fy;1WH|5iM6? 
zfTD+Q7U}K}CWHIxO{4>5SQ3~fT>JsEztskJ`RNar{HFSW_>0bWop|g0xl@S;jxj;) zmEWBFHx7}}PJ)?cz3G0yM6kgkwh+Cu)+R~NQowq{!D|ZEQi>KL^>`gC-CzcZ8%SVA z*2L_B0;2V0?^~b!8*TjN(&t-^goCE%;NF-bnw(0Am18PR>9i!aa(3w@L#B|d^xPH1 z1(5r3Fco2|U%aO={`x#B_)e2lA0MITmjv?lpR#`c*a~i*P|Z4nUM;<;?=5rEQqPfU zvNCDB-JR>ktb{KVEs&q*T=eRTginu7h;Jw8nHz%XdWH}z^e4%7NB$ilu*s1OoD$y*OC#b9L6MNWjL&=;? zD)Q=1eWkMmYbXdj8rhJAGktsk-Dx?ksRdNBLA|ACB^p;f^5zl2Lu$=iv-a)PNEh=4 zo<>f4o=#4bZ>kgh9zwYuMLxQZ{ege~ZGjaL2xq*-u_oR}18BQ1&U9hj+kSig_s7)X z?TlB9gS8ijuH=Bmfyp<JBsukCW?r;gG1ZZBSeLysNH9Rx`uDtf|g9 zBaqZf2EH;Jdh$FedG5x7lbe%de?5Kn@%);=n+Ik6{nhSe&|L;r2GA7 zc!X^C6Z#^H8q-;%T_g(BbI9}Y0@+A51vz3&e>rXdlmDyd1aYeOmMR{438*=S_8^Z! zw%lJ!AvXYY12P;fS-2^lE$z?sZ&2TS$Fdiu3Xbsz!(mY6yehTHZ1KNJJNqSWr~07i zc5H52nzgjelziqoo*Qll7mCx| z_DD#3wPF7>GO!)g{QY(i$NcF3Z3qAS-Gf!a%gf+F?%X0Pam{Qu@Y+tg7$G`q8H6|* z(p<#ZjwMs>+tj^V*u-B4=_(rgV4u@J6sq~6T3z(A0A40wOr%~pOghyw9jZ)e=z z%kOYVE< zEfz40F26hfy9)gCD~ca5@YhZc%^>phbxTb6zv-KS`6lpEfflq0IYk3)Vh)68@2yJ2 zEa>)J6&CRh*@eW3Y>pg#WJA&d!iioBo5F4YPbQ4c%J2H}mn(?1$^c&6dcPUe{aofS zXF=WmgZbf6kE6HJcG72c086Sh6G_a2ukBAtHo#CIzk6Nlo?D6tRjnDDsWM#iT zIT>d#+-7quI!Y)UoXoCJj0j4#y_@ihesE>+8uuFg2{8UY3xNL*jemx~%bwtoIP|{9 zfgjVSgInrSMoxt&0?~8Q`eq`^^!*j6A9?r~P9`fH^2uG<{ zx2!th&T8#UZg6*h?6@JA|%iDTlIVZHLA`MfA(4n5`2jp@$I&RAkrjwwW z60h^9?oZJPU)vY@C#TgIzieuHz#|=<3)I#9q|AmZ^r-8(AV^Exs;cMH8jo#nbw7PQ z9G*%@V;Pvv5olR%<5?kPHhEdBT|ey)@(y_Bpx;{zfpU?2{S_T9(&xoF&6HZ}c>FH9 zwwvSrIAt%Tmr=u(N{7G3QYdek!im73$%4nxp#*Urk`%8*vGAIyl8l{3@?Ew9?)!GS zw%=I?2Id3$`rt$NeD6(XW9z-g42<^oKV0yeDl*x3fcF0M@ZMcpLZD}k8U>is{!W1v zY++gr35aLxx@)@9lhHV>u);>B0JX_Y+8EPCrRagx$r9V@>q1Q2<9t4XiSwRiXL zN?X0lKg_2JXVj-Q8F>0NTZ(hegXf_|8g}56t|6>5$yI=4>O-P<aZ?&Nv#$9?kb3N#7+$>Bak<&^KG3$P`H6SpH`nF&_uXe$ED zUydNiQ86pDOQm4eb64oj02}GQQy#;9bIMMI^rH)XH*2`}^^gtUM&)e5JBT@8AP}EP zJ(MC0gYUNNHW)*!3A27GPDd?9;Z`0ukjS7wl~*d{4K(@NNMPaj&_Va)pgS$QbmntH zS3eb-y&1~{@HQ;2u3uGonD$^Jhtl|~uW~BdeVyVT-R>hZa|cj}krJgNmI7$XKUM4g z3@+q5c(YpvrL}+@pBg;Pkxg&7a2kd5%`n=}-B)@-gwt+nO5Fw|Ta63rlj6+zZim<6 z)H)>|0c`C>hJ&}C$n$|d0jc+g5)#Wm>V5C(#Vx|5 zx_oju`GwXCM$4MOs*Z+*{@pE>io*SB%ZtZ`)2V^EL?2ZWLR}ik-`mt+(qQsSrRe@N zIT~xaw6VYfupc~tDb;!58Si$Ax;~iV{={&qP_eORE4Z~KwfKV+lX|v#EIlxc2Z7Jj zuDJs}o)1~dU`<_tl}38sKP;@IjjzyI0BAh3>$mHk$#Xa9>G??&PJytDh3*>hZbDt_ zx{|7-`r2dXsmDjbnu@ej1I^`w!J59_M`GU^=o1TT57frt?^m!D0Xx6}g6G^a47D}t zYXoMR(Bu*v4d41|V^cp3r}#9Dh3qt?xTIVv23M%t1|)~@WA>L2@hO!n9#Pd^28mBm z-~M39|ImWq8^9%BPXFO_@EQ&rO?!6`23@0$f7w46!!(T(k9Lx#AUErSrCGw?+RDvg zW1$PAjPIswFt+`n^MA1l?`jTatQ@~fJhyggXlQFLI9f6uZ=j_0_x1SVM9pofVOS~K zCnqieD5%zj#Y#E^W+u5@WwW^Uw;I{NbqWHl0M5WBd;Y#rywm+&E5{hHGH_?>N+f=+Lt~mY~wOlSoi{4a9GtPMYl6^+uUFF zgOn_ugt&O4i@}lJ7@=V;s1{9;Mv*(|62hjfrzTiz~txFoKGJ2 zR;s+F)JqfMOrnz|kM8uKiH%~B*-SE~Vn$Y2xoWqwh?t?a_?y21a3vSk;?_k}@m3|9 z_1gePQ7U^nnf=ii1M`MPraN75i;qU!4Skbm;AVok(lbuMCGpZt>#Q1#&>Dcl3tNZGBY~oF!89R>Ds2@#?M(m(rt%8u#mY z08OJ7_k4>mRgTu|K4G<-S8PL_7Ywo*EwXF<&N!&yTIph&#NVW`*wraDpUnV^yO+6r zuHHBrm#3O7Q2*)@SCZ|dp1O@%n_GDE#Wd`qoEe^XhB;^*TVZ2jQ-nUoWgJ$gOUW$T z)EqretxeJFa`koFtIJRs8j^Fp>34`yc4io?fQJZ9&2a8n3LK4l;)sg4Ik^=VHp7AK zR`YQ%F%(3|u-cW1i~Ty~DEZsQ7QbDg0l%~+eAbonJdur)=TBwv;R!jF3qC!xRzOBj z#LEwMa^;5A?4EN-+~(XB>xCHT)DcG-45!>rP&W>#s|7cBeG%=_xdwcDJZDfTTZ4c$ ziEoHT1+J-5T2Dd{#+E2SUT-<5NPAS@B)!+as8ztx)ymp>&1}p5)#-6=jZL?t{UrXx zHr;y^k9n+-XzJ+7WGV(p56n$w8=|ZM`oV@BO`VG37S4Q|TjCNtY4Z(}*udP_yhCMw zXAclZ6R%8YL$@lZ-Uj?Ft5YAP9xSP2+b2vdzmZWWccOEh1;Cx>0MsG*S^NX@r$wAN z!&>P}=Chb%@y^=PbLr`TNtXGemNCTXVckpVuolppx%Y?3!cCE1t(I6$f%MpSr8pKs z(Jv*0qv1J(NWy#%LiZ)hTE3Y>o^S{(JVj8sf%|LQ0`oIANQ>bH>ke*c}q7HLfnmX{BHeb&g2)QB19Cr&9Lo(iZsC#EBgsFA}66BoEh 
zC~VmXISF35qxK)`^GppIgTcL}Lu98BJEAKEk(rj?g>}38L-dnL0`80)h&T3K4i~9Z zPVtAQkkN?-yt2JO8hU#-%G`Hy{bsxep^sMH6j&6&e@9T z;Q+(tLW~?nHhmsVEh549dlZS6+p{su)Qjv4K5Fofu-mg&7n7b(t1Xw3K94#zOeO{6HG*s)iAfB!)_NxP5Kx8wAhFGOrw?1{{1tyW ziZF0sf&-n#;pM&c_~ruk&ehG3rVI<=7q{kWw}c$EJ=C3vIBLfhtBx6V=UT4eo-Q=+zfc)=EYVl>8M{GlM8+1sH| zw~m`d7qZnnC6H*Jc*Ozb7tdw_F*3glIopcya^TH zH?!@n{|}hTh@}{WYJLM}>kZtJ2D@PRH6LMRj&NSy(ThOy)Gua@1)Hg?i?0s|zk$2a zl9Ly2N*LZ9?hL&SUJ0z_wZLt!P-5}OmwycC36FZmq^?)8Y3Vf7H)Oosn3a0Gz0X&} z0Ls))?`o^aoh8@vdeny+tD7XYeJJyigL9xtP5-$?Ov?hGzBgZokgSzBArSWzUB!VM zhvKw^w0mf@QEw9?iwvcF)kuebUP267Ul{raUo zm3gDwrh{t+=aQ3N4?0DH_L|*^L8}JrPeZV*)w}NHH!pEDMpzt9KRedYTKgfEW0G10 zeU`cHk)!OF`{Lg=jg_k*D9csKX7$kF`zC3WPAehX3~ERzl}4Wxoj6Du*%;qVf}T8c(cSwA#Osf{Ju@{oRmm`amc7GHRrF*PzPlZy7Q^5nD9n5zp3# zTKY~4O4zU~Voq2K_iq&(#H|n_Z;5!EI3nrKOLU-G^&R2rmCf^ewxU-xRr=99fw<1_ z_v7pEBqPxXbpqYqTjBTUCd7BqV4bU;NA9C|lh=`ptr12wy90DML8z#zb8ilgg{LV> zosbHT^5Psu96)7GKD?MbR@TmYRrUC_SxlvjUnmz2%Zw5F75h&!#Gj}4#=m`#cMcmG zbM)HV+Vgu7ZAgcaUJRXSXbBAS#(*10Hi!(oLVv}f^u?4oy_!vTW|?-(FhsOM{?Uq7Jcid^C$y7{bP-?&V&3l0aNpV zxidqu)u4{<{hc@vdnOk0`p^*doR4AEH)k|@#$}S|dK=-5lEE)jA-X+WGG03=d&ncC z*7HxWA!LX5a_FE9a<){X541N2lvXkeY#*?tYO4m{wsBvDFttb~2bjbPufsFgdBoJC z%GZNVEe_ChmZ!8W&uV)ZxV5V9do-tRUVubr9KM=R8_ttEkms3Tpd3&(a+La3o+t6A z6iQ<-B~rE75^S~;5HeMVW5r$Fa>fx0`1-iwFl*~reCKOH>wI>PM15U%>rF4dn$>qX zjHwnpQ$5q3+2}aN-B*rtiNnoU%Z=*ubDQ=z3IG{a-NQFlvUuS5>iWhLl3%TbVW+7I zLTJF6G}XXUR>rGp^87Tg*?5hm-AV749FxEz+QXW%J^71suf5r`{`^5l`oaT-!rl3J zhs+}t?N|pMGyP;&MATV$^HKE(87=4ATxoQbQ&N0Rm9vzXxXSu^cFG{wWM5_RgHn8f zgfS8c7SH;19tzdnslk2r{Wr=_z{4X>;Y{=R>z5p+$o6lX)TT0uPT$bleu*&8Q@zuBCbtmNXte*;+KQA^_)&bvNg>K5N+>(QcC*eLLNkbERZ+!f$A zcfiriyJ=rwZjYm#H|xH1E0V7UwJ3hxaVpsMaTYBz^e-o=c={xdc`PjwYzF#HwKE={ez*|g z(W4&TnglJ4hrX@Kev9*(Cgj^pX|!T~-WUIDoxl;)b_dznJrMQ1RN;fS)wJ_tPyF=t z?LzBU1w1vbX0yeYYFh3ZaSjcr{FBAA`(=(X$O~Vm^ziSn&4_8EHy?Sn{noC(JAVGC zlIwNU-RJ#1XxV-$r_THZUTO!?nYxh7WD{Ex=7Cf;mIgDn*@7=$7z@8=6t^Mu>=TKs zSN+1|;CvuK8+HyYIQ!V(26E6$iCuUj6St$VKaO>F>^O1G6zkN zDQU+G=K56fDVF%h43eN)==BZ!nHn>}+^K(Qs}^|@*&47{N8w9=bLxF4zU0B+PxVH zAA<(Z1wu^LFS^?#b{6ZCGeAyj zVf=df9F{jJU(r@=m1#`4;cd5q9@Y`NaXsnKVb=(>9wIuxFCi!D3x7_3#ZXlIAj*XRj=U%2@PE}12 zcQy~FLYCAoZCSTTa`kQ^CQ|JroX^NX&!D@voGvEe-mmSK+yESBbQ5JRFP{SI=|atm z&9u2)OTuC1v5RZOt8HeC#fw@(N z`IeZCWq|=GVZFeb_@*QDy;^r)(Z0p;%;K_fVkH&++nXEzk5=6jgBBo~^RBW=9WwzY zYwOBi^e2C3sDH=l`&jpI5n{I0!nofOoY@Q6ES-CjzL?>8+rqSTd9NKJ zo9t5yDWv>V2r-E|8NvGK)NH|>GDNopUMZpp*c9m^b)gjjqelD9|E}}DtN0U%E%2DG zcQz9;3vX%HtfOVw47YZEuugCY3vbF_ME?>PRMCp1x6(3hjKB%XW!-~I6Tl2>nB~)7 zi$nGA_RZ?Z8fg5uOu9cIpJkjClQ=!sD7mvw;bpa*I8Pr^!5I(^Iz~JhBoJfzKpxp) zG0rS&_{dQAcU|~x<>e0;Ao=L#^{n>;=tyU6BLWrA`^hv$S@E0Hx3Xg5T?`#M;mU6; zvfWOO@fzIm#|vMdL^yn5_-dG#yKqHwvU>m!p6Wlj=(GMAh5Q=@4rY{BUUq*YAj5@S z%d3N*O6<%(c^TD7(g_ER+(H(b6V?m2R`6WVR zo36Re)q4Q8O_%-wbAIF1h6NxXlP^ce00d;I({2y3)RNm0u|ai9*=;Gu$?1^=e!Bkr zN0duxk&r5f+Lm(uzNL*68sFw+6n=k7sb4XWMfq$2?8iZAtsuw87MTBE=xbM%G5VL# zm(XuRU$Osx3w`n53VY@;4huU1bi`JmMLjq=#y?*DWUEXHB0+`u_ znR2?GHn%+b2EJ|}_eEwQW11Q>G~g%^rFsk8g%E#L++h2-dx z!3Jdzul%&dnh}P(Qc_wFuX9+&Lf)6v-Y%L*j;ESlBkyMtfVk0fSERGZ3@-gz7f{sF za@FH3!fx-THpiy>U<+YU@Ts9PN4j(hBsxBS8U*%ZcOSoWSmL;W;$sbIP@sKg_50fJ z@wF4DVUJ?~!?{R6pe$%zrgofX2@dvLWN3J?WqO(|fH7x~v~`b7xim7aLIiJ68U!Xo zsGraOob!wZ9PO)7Ck}7q;&=DSwewruh}7KeStU2G%0N{x|->-cVLCG zL{M4Z@LI%PdmrMs9jxQ_zleLQsJNPSVV5XL2oT&MxI=J<1b26LcXtRH+_f7B4&Ask zG#VgyXuNR<9;6|-lhfbY7w21Rk2A(v`~2hIbv5f^&aPQKN6lGP@B2LF0q~nGT*5k zMf-TyowP+mC%F-|GL!f#&|%fy`_TCqz2Wq?FRz05#Wn&0bJ zwxhW1s8bw=z#7R}Bbpy6xT3GqOcw@FEaHySx=9zC6EGCauVm<_2@%AG!9#OwEF=f| zgV>1L(tNSR*mihba}sJMNN%WK$Z_=(DG}(1m@Ngv;%lWW4tcRNXl*|Oyn;Xa%%f!a 
zCl#v}T9Ce8#Dn#;ZkH9)u3Bs}gcZ!$a<~3PW z3eV}50aPSL7Ii7IuIUT*_QAybe6mumZtm)-G=KWL4rMIpl*gHd%(j=Z$fN#XFxHE1+w;B0G}Jk5=$lzISv#c~BpiGz@u!lgw`{sbd1QH$?^*T46S!t+!Lg;W5%U891D8fMaE2v9_6 zt`7@fd#Rskk|vEAY~h_CZ-uA*p=Me!k|W;H6bgGL=8~U0K?;@Du=e!cf#(W_0=B3| z9T)i+m@K&}ZKPBY=F}_7BG!5W(Q-u|G?QI;u7$FW={Y?fz?ZnYwKSw?aC|yXn9HHE z*K$_QOU^0Biw9~4P0LH-lVmxFwy|OpMP;cQi>JTxkQKow3vJk4g%okXuXwleWZ`d4Woy!1S__0|MMa7 zpH(J-Plu|o4k7~weK2}XthU5l^?2e{9K8mg9)YujCY*tr@+tKPtBpW$^JWw9A?U)z zus(@448NlqaBvn;!<{78FGY7WA=i+Io*H5BD=++p#P@7#U{$k?z5{xDM_|a?3~_dT z0%wK|Lo;g_^kH~c*;2acR1 zKv&PXJtKsr;qddIG&~#@MS5le68|cxMZ!V3IO)@ek58l(R-Aa6uG=y~PM-ro$fkEKS3c(j~W8yc(WKZMu%;F~s3a;5!<5*Yn$=yt#+1OD{E|s-}r^~Am9o3ETrVT0< zU|(f9<$9j@VS5CZ&W&~%T!L5nl%bYalDHOsRs)Bqceo~7Vb6G=;!-qzbz+sVZ8`yR ztULoc8PQymJH!v*Gnk?;r{R?Kxf7??>CSDI7FVc`PnH9O@#q%tXQKZ$#G+OEpeYuu zGNkbF_))NAML6zKh2CB{zv-|u9L3*Z$;d!AuvDE~s@T3H;`_6kLCGq<)K5}x0Z=>u z%>m&Rj=(qc^cn!4p+oH3)!BKzeJcuG?XL+ob%zFC`S|TJ;ngYF{M`4|AiV=w-gkdc zo`@G&(`5Etk9)U?JYYnYu|Fgp*{ zFXV3=U}8ta`C5A3YTo8nc`y|;YfmQ{WKnRY6u+7JNbm`O3liZ0cF+(+Rcd!xyC+n;MD!Y1?{A(rwJshoFAqvO87~E;J^y$q)D{~DAjk457^TtNCWL+2^^wa-8Re1 z+X3z816HisUcLfr$&bA~Cc3ecC3#C-Uaww-KkO0D6jn}O!-X7mZBT@Sv_SPzRk}xW z0i<|>IVxe9O$JVo9N*gq>stL#(~c@cRG$#bdWj(mDYu1|VY~OX=7e*YwVDKOB+T{Q zPB^mN($f|}X>&&7!*l!(dddOyyb`VGH&tteGG7O}%m}Exn;0_E_@gQZhew(mJHj3Z zzO;IW$!MS&Q2|Aw(vO{~_nx!CJShx3%GM?Hd(Fq{(&qqC(YF2MO^*EI%ibMQ%KY3f zLMWJOI4|JX_J(DSme(-OXFYan3|#{Ln3mM#=LIevC}R zmJhbd5AzRk(#k;x_($6Xz?@!m^Q#|s^6kjv-m zR^w2`8upiIu0hxDkj`_sYJzEmYm1?b}fuE zqUObQl?$OMq#9c!LoLbUsIHwlWBQF6fNn=7g@f2rzD=;kG{g=RJnbUcjA+GAS}{wD zm@20_d8D58j`w3>S%$FpOJQ^AGUF88D*V`uuKqmQ9$wY~_$BuK-CcVTLz{%di6wr? zT>9vxM!VVm>9oI_V1|3Uhu>ul$u8#W6A>ZaBl>OZP3ORFm!+x^BGV8+j z81gr=$lREO)$CNK-i-ueU{)_u?04&QgSYma<;9(0;g2z?& z8ZI_A^;WkljI$o8t1yi!Y2aiD--)<8OeNM$eUhfJ2j7!XRas=we3MiCmu~#eSOCxB zL+Hfz2mhd`qP|B#;qJMQ;pTgHutrBtd`t4R`DJd+esyP;k1(nYf<%jnm1(CptQPJ` zN(Ytj8>8uCoY&LYQ1?xvwLC2!k!dlR=>LRsC*FIm6NE(i?F2{9+-o)Z-l^+ZawD?L z5HZ!XwMN>N>KFdZR_o>&4M+Tf_PDPv%q@=wJiAk7i%mTB(chi37TWAwHT%{pzhk+i zqO3V3Zpc6OEL*SLeaG)xuEi7|FDkE#gxNJI3Z*l363(WHYlcUfbB4m34K0-O}P^ zaaqcYG$q4QPIV^9EICVA{E&${V>-hdfvudbC=D%~;YOtpy0r|P0$_Vn2dm`G0d#8A zS2pLl*I|rCrgr~1_4wD~2I0};(p@w&myZMFp7+`g<8&Pw&4pAK2xFNlo>_eD;V}}z zlecJIW;V5g;xkT`Qv;r{7sLZo-}RG^NEpiWsn$=O9}2LZX1^>4a-oC%puEo?IVk{_ z3UHa&Bgi%}Q_Zswo)$bA&IM$3my&qB@=VnwDNb*PIr(NT0TJ>ZW6UEtR9GGTmDHst zawp0ZMM1m(wK*^1&ZnPsYo~0Ra_>cy0ZNs9b+xnm8UE)1zS+bG@0X12xuM_UknCcCx)eN>T9D zP+2INbe7`pH0grg8j_Gl4RD3tFZrLogsunJ+&G@tyclj~{g##5c3G2mvThwv!|a`- zt$HV8#TTZc-<6SRe`;bsRT`DK(E7nC$?8E!yFBQQxF^sUrUnV9NAZpQogI^wV@pGw4x~4v}Pt6ZN*t8(mr45vsDnaU_CuP z-3X@D)1=^%6K_>nEW;onzhMUM+>G-F3tM+|%fo5;wdRt(9|{_)&!lfYs?~WJRwvwE z4MLj7i5Kx^4B!F@MhnI?0BT-Vmwm<*5R^__^_TwOX6KU~m2^n=TNhS6v}s5}NQ>7t zpSas0GtsJll9wWN>W+ zPk(J(BxAnqMRAS6GM_II19OO^;Eux>iL;izvRbuG_23E}gS}&fd3s;wUaK|CAJ2!8 zV4cWtccMoqu8~a+|^!xAWNIm zP$Q#XnjA!6;xEUyM)e~yUu^;IKx+L9@@iJemLLhO^qZ ziaf;2&fFEsr^T1wJJ0pq#!1G6eD@$2pY^w))>bK6mc-oANjm&lm=jL{JX#H!Ob{mO zYd?9(2WJd>0KKu0*LxcKhWZ2(or0dc{0|eqkEsZxq!$zjRT#3n)>h7+?4h8TCYCMY z+)8BMa(QYI^$qEJVSHIy5^w75T{&n)ojykoGHdSR0y>aT;!y37{fL*hbO-n-WeU>q zz5-dpFhl!Sh1-bt;6Se72cG(3rr#}Nh$0^NM2qB)VBGRU3{i9FDE8yMRZ zk?|}j9AER6ij$qdkE-Y{-p?L{5aCrD{ZpLo|{9g#upd-NK~q~4ajlN@CX1M*0m z1QiF$DFq7$OSVW~)SNXPn`_w#Eq=H}MD2(gI5-%LMa9`BZ87X)bB^!_$)>d%|D3(m zHpl*W<~(7TsMscJ#U0Lg%z9CM6%utkI3?ua2yL%TLg-B5Ik9fFE*?p{E3LfP>9o;i ztC}<5Y5rJN*FVs|KR6|C)h30cC#7*|6_5CI((Loel^axRyHO|3++)uHHAJFcqk~f4 z2kkMCxBYfZZk#X~5HZI7Oc$y=OHH`&&an_DIgU-fP|=zWm)ib)tma1#X)1!s$ zGWmm37B>wXRz#%02k2^Ypo0Vtxft;`h0-NG(s29c3h&j!VluQB?^a4YJ;X_Oc;!>A 
zdJ-*2M9YvI5roq$am_c7UPTsY8E+)bGc+4yu+RG?lWP{NWZp-DN zj6t)3p#3@zmTOoP<%l_ee<;{%WfTwm2Zh3U;M4&`B_L#SxI(>kms*;+5hw=N0(-Ff z)U#;+x^{3*(jxMf4;(0~THL|Q^D&Y=p!5DctNA6VUxO)Gq$Sk^H_L@sECJFIHM@9G z75vz7Sj}g5s!;Fn?n~|!eNC0)v3G=5M$JS&wJiTEG1x_JhAMvgjak@uHqG_-0WIBy zH=maUVNV~%7sHfME4#^Syx5>Al}o>b*mFwT^rPXE*j{DZ#vBS!4!5b3&~n;o*Fpk4 zz3V}jFAq;ya!ZU@D4x{UrU{X!H~T65p0hq(u&G@V4peXrSJ1TToMdq)ZT31GhBUI* z;CK%*^bKImGP&h|RM5_pNLqw61|Otwi_`t{pDL-n7bCeH%rbtz7Um&ng1QavX>Ii@@fxniN)x z!N;GEDeD|jfxnrz3XINtq7vLfj+Oe*s<>%Ms-2c?h68Hmr1Gol78h$oIeBHY6@~}k zU$@3EVmXzZu)rf?O6}T8%;myq-La`hns3t2h-y0|&PuOL{0L6i;lypbTgbdCQ@D4h zhT5LdEb5%r!)VZgPRlf8o9^P?^XQwKoM+QkQ6dRlT*amRwuKCz!^tw2T3v8tGK|Kk zuU{}biAAwJ8_q^c^<|sBBtAtCBrlr8{GHR1ECY~6B+R6^6>Hjc82lsjFcifvpkoO% z%eH1=4)8M3**`mTobr~!wFg&JWVtMrZG8cSA@d882Qb4(qmY&s2@51fxX>ha&hl{W z{oTRnmBo?$!<+ghKA!d%+@{`xSEHmzJ?eZL*a7LQ{3NxA(9wgJ=;MRJV4}ilz`R9J zHk8VbN-k>i>%Yth{>>_LlA&APe$F;#X+Y^iS;>L1bC=2FE$7p(6VpV{eqFDiqDcXE z_77Uc*x}!~!~MPwE&XiQ3Is-7l_~n$P+A~gDMaVz@Fd zfe(L8Z157cWomg_i0#>YJM8IZ>A4JKTK#nLSxQ>Ji{x15p?a8fEw)q>h7RRzUF8 z5_?ao&6hd_TG&X+Ewjom6V!x)W=~y8)v9}^z1&Y!t53tw0WC?E8*;r7iZNLfSYDU~ z8l($Dj)6$=5zbKpdP$pL;+5XFou zY+LdD9#u*&k*IqPQezp~o4Ynx{&N3~RG)zZIp_AG5~g|IMt8k^oglM5cl3Onlpg(y z=y}%QB>@+#R--nXYfh+l0hANz#%w*uJjUBR2s3RSD^Tt^A)xHD7wTA+%G?E0L2DHU z9ROa5DADpSgU4m<6r>x)_+I5kG0QkFg4&TIv10##`?JN~Q+j%M(~mU)?VGs#gN_?3 z?N0&tv7e`jcSw|BwJb$w^8DFww`R!9Fg-RikL*F7wTxi7Y!s4(mBs&R4|RjO-^nX{ ze(0Fg@FVy&>fUg>3)kqa{eTvEk&dKurILC12t$m1*^POTxMgLLma;l-E3aE}a=T9w zR3i8wx2tfaW1*P0rly$Bd@Al^=l+ORX1SCT5qX59y{4}ozs*~_{-iIC07=nYX{cP~ zhnOIo!KRm*@Wnv`JCIa}-ac7~{A}({*~Fp!IjfI$MCr6cRfcP8wJXvou z17AQ@GFuQtc=HeJZ4|_VUdV(`wCmSsP4aAJ=SAg4LC=9wpJ|CssW%TXxdJDV51B5c z!?fv#g5i$Qjs}x_)~oFUVEN%_Wam0C)C^%XE{G~4zgF&9lQ6v)=Hj68FxO=5<=t31 zLF=tlykofZxwJyn zf}i^Y))&k}ZXek2&Ij4qv!mR-<)kNBjO24M4g^@z?JW*_^SxEHSf<}i0m7j>0Q(dp@_>?6kCH_KSHQ_oa zN3Tuh;)+i3C#U5hE^QIMc8gV1Y`u<6alWMTN7!IfR%A|0@<)WpfPDWD)p~rq$IeI& zPl_H_8OVBA(Qs4X+bbNc6UL4G6kFP<%FXfkb;n<#@=EbwW4Bo%b@EToaDNGhI(Hgr zd+oj=&E~W=2LvXsHFf1YVdM)Du*JwO-84LxPM-zzM#1~zO8xJU_=j02moME!WF&PZkb(Vmm}jSB&hU%UDj%KP zQ#!UD$4pmGGm z6E5eQhp%LzZU?5`&hBudS=Fm%OgFHl+2)`@=+L`G1asDC36^uD3nU~bgq*6MQ2as3 zVt*g<^;QMx8~CP&u3u|Z3};Sn3uJM!FID~g1w4C@Jf?1e^~9)luq7rUGS>XFY&idX z#X!vaFv)>expL{PgJmc_-%uzAQ)Hnz;}k)&XojEqz;6dhJr50WtYb4!mFXPAvFxc@AJaUCRWKl$_rCE9p(k;JCo>H<9{xY>Ae*2u$aMsR^yvTxX3 z?IOtFlZRu5d$=BRGwydKZLEjSr(-##2q{`JCi6`q(T`5f$N`WLfnY56f0;yE)lvhV z1Ed*^=FwVK+==Fybj%v_e*_^RTc%a@r)q1$<7gxd>=C~k=y(++2#~K~)Y*Jtes^;HhA*Z_J62u?;l^#b(l%3=3AO2=GqoeP1~X>rC)W61IJt zoGuf#rCn5vY>lLBB90Oji#u{Btua1If8Ov(&z+l~yMal}4TTMD&A!?US#W;u&Vzub za-8HBL6GEB<@W{+nmD~msfu+8z67fXqQW!)jUp5lBr#|ED7MvQ86-kK*3k~@xS_SP zos7(vklACkt6bBauhWxxt2ZuG2sb5FXtG?_Afs6(5~uU9>_^7dZlSt02&Cn;;P=gv zxH$ZJ(5r_8$7bn=D6Ji(`(Gq3Rh0qR#um2rJR0Ijhdv53U+=)ac<<7wdoSP(BlKus zs<4kAF3|G1UxGUvo${)F+XvF{%nCh{dd99b*H8#rcV;3W_1 z1aI>W7i>|oyEkHrMClM+D-pN42U<*WCkFn6qTMc%o_*y@FY7K?5#&=r+>B|6Tp3sA zEb5Qs1ZTk^fvs@?SF|r4AlO<6&XApTBIYc&UB}&x#*eFiP~0VlC79%;29%|d9}Y6e zG}27VeNJsUwPL?l3;er7WZ?Ommu%BoV7LkiljW?OI1$e)$Mks^7Z9a({}0NSHT(>N zPtID$WE}en;TxcLcAiiAaqAytH#WiTckqmwnu!^gqO-IEl_;DX7Z5dnMO5a^LW_* zB--z6aPX~r#2vtf*PTyzk2zo!_8GoACM~0 z-#<$<`~2-|=ijv(&KLe(4Xos%e)i&v52vZ_$`iA$SZeAG3Q(g>=xhHD!M98e<48Qd z7b$2v6(k_gIkI(g{)E$cxo2#NUWs?t;4*yL2mN_38yh~C!aPs2sDh3}|F_bJ_e4FD z!0+TOFZ293v-WOmS8zUX$Xnki_OrQS0mQGfPs)Gr?Fgwzy0MEmjHSJ+2Yy?%Xudbh zC~G+E7kGPF&LgU4Gs`z1B|)FDsXS{~;z56ON!UO17m6E`qHd!Arpo>o9WHln-I)#- z;>@tvxQ=5L*H(U`^+U2MlHT7q*|E57t7%3a)LOA>wrV-d=6=r?!R8V9%KaVRUs=Qd zgYp+fsnUn1{D+MGDd}uvcmf{C_bTraLD!HMjMowGbYQy7hJ~NQhc8Kl?Jv~`=Xiyi 
zkYyx^HO!ShVh9iz`pX#eccyM=Fq?0U%AgObEe3DS{99gNenvva2d|*`T7D2mo%LE1 z+Z{TO!8gIoJU6l^hNak|(bGt7(yn<4YKy(P_Y*AT*6(@xgp%2VZaI>Zm0{4KL#H?~Y6NlJPX?|9tj)-_Tcnpp0pCS`MS)nNdtdD2EVqj)LoMzx;YGV^=! zG*p;Uax*e>_n*ptw5Rfv-&-^}W3NVXKi@eFm{g^7PQiCfMu~2f>AbzA<*(y#Kq7@9 z>aJ8QyuMU%o#nCR+F)n1LB@R+_IT;~^x3t@3BdD2qQ4dz_Rz$1=iEmPJr>eZ5D~}F ze@WD?V96>-R3$*7#BX#Kjjo_I0%8Wz8E%JIquu3izUT}6r)K}XGH{ogcwoIx2CmNT za>2hda|jqC1Ew|2=v8@ut<2S5wC^%f|7xgSWch>ghP32i6BjxlX8R`Rx5&oGACwLD z6ka|)N2f}arD-4MXm;3!PcOJ(|<@<%zW}}`NwqSA%_k51K z{tsw1e{aG+A3u@o60}zA&Ndms?#$d~q`K5ej7e>~&Kb_+z9RYh9dVk~NJwd5-l}1` zdU)5x=C+sQfioALgr&_)VP~xfGQXGQMo{D!FI|ZPQA8abN7ua9{ZjJ;ajF zkj|#VkgN~i70o$+Li-VU`rpfc!UlO8qPVy?XfAK~)7AhZ%_ARvHmc=Kif2BKF2Ez% z@U?wlFs$|aps*XIUM6S#!QDImTqXw1ALge0gB5Q>|Nf)@bo`Xm4{OQZ))*Bb@&i7# zk-=OoU=d_+km=2-WNx)lJ#jnmV*}`awp)|)UO@sNr&o+y&`&v+qK2DxAut*r;qx0h z&2pPGP)@$O-Gb)S2kHLEGwdVTcjuv?bV72X+ZmbX6EZ;FmTkqwDqKJ)QpcPfhuI#Vj~^k<~Lq=(u|3=FE3MKU@L9T68M67*hflseYOy!V$~S4!hIDyx7qQ{;J_0WI!NjtJVBqu5x^7HZY` zMD*XDmfOZzY!lwzGixWDrH>TUce$Eqm??&?%>2C-|NilFk@Ax~?3|@`K%1_2g_=Sy)C7r@dGB@E6T!dqkmd6)JkNQDm+(17j!^Enbwys0?1g)AALPb8YcUgZ-!Y>1UpnqTLLZ3w zI1zW-Z^!M7t=9XmSSJy+OYyN9HM{4exGs!(#d_*Ex6uz{NNmz_cmQpP20jKG3W^1I zUxhu+9w!d(uo5x11bP+>$!MLxmx|a$p&K;7v(;hfu=OyQ`shITRy93ai-zjhM0VoLLqzW9+R&zQ`AU9tU&I%&qKX^N9%^qkJ`Z(T5h8I*Ht6`tt~K2& z$eL|d)W9pM(7*kI+E!p^V%-8^poGKVwVHXWzz9U04F`r{@s5%H&N83+56T_z02va_ zBZm|y|0FREEu)~#_u)_L7o9G&d(2B%#l}Y@Ao&C8U%9}xuDq=!;GFs&v%@j;jRf}f zOGD(!F9wSwl?(%gaTj<+Rzs=9=pNOKx!akGdDz+aeG{|H>aRq;X1oaimsM3s%noxR z84?pT-+{(Md4a=i+vb)dg~ZJ94CZ&8C0B%n^>ygp0XU~#Cw+aL%*}opULVif?!=UT zNQ)cbmmUzOBaC#^FIo*WXeqbz9fy2WNb>3Yx|A|VsLU&m*c6aa6GVLja|O;sr-dEW zfj2+ug;pX@^D1|JXd0HpKO2F*5f>Y`Oxm$e+fZT!G(z<)xP;K7G$hA{B&Z{mGB{Fk z%h4cDK(o*rAEA*B%gI#WXW#r&-M8-mR4tNppaTRUx_HgibC4 znxrvTa|x< z`+Q_V2=%+k^UwP49m`Udi^F%x^ABw%UP4TkX#SeTFYh7~qLAQ(M|PBXKw?|Fjt>Ek zeGPbOSpOZIB$iy=xh!FdvI(z{E(Ko>1fJQU(vwr^ut58CWP7&5Z<&?{=FZ-c~GtTzs? 
z2d34K;+pU3LHJot?7uZqm=ZRDZyluC_tR;M}vsxv!UCYRRQN(!x@4Ow({Zd~FFAln`+;@I}?-pr)FRzPREUW+AdHZ>@ z@f7PP5)>2gFwT4C$o2=trY`5;j(o&*)5>9g%orlSVl4iWQ-LrwejcCCCRwPR#E~YG z#OCFI5-_`!9f1aW0@hkPy*N>;t;xK_ORR_Q>DuAP+(zs>@!*k7YwkFx*Q}SmTA1YB zli!9?aEiVvzVQ*k}~HJQExp<`lYeAGH1%aYvz%yETo_P zlA;{Ov~Mc8=~AnP_AUraaa#9;`0(>pV6yLY4_d=d!~1w)Ix_(W=e+**>+%p0WveLi zgU;;U0`7<qb`m_=ofDjYT3$9R(sr300@!LGn)_c)=AWRo9>v z$AiZ=-43k`Mn@6V{!&ZcV-8p>k6rmEZpzWgyrt@*TYEE|%8Rif%Jyl81A<)fMAJ4s z?Md-R=hc2}n+U&o?iC~7nGB}{j=-3xDig9#L$U(Ok=}X5UD@$zpS|1l2z}HiFtB@I z04v$O7gHmCOqP18x56~#9oG3U;<2RRrfHR+C9G|7z1OYd?t zq|FXQCNaxz0zox~o3t>rc-#fP!#5m{3-n)*go|AyLFNHnJ>7RHGSD{dnNcN}UqFVE(QdP}zQ@x{PebA)fXYHS%*CH*SOxHc zrSfyc=A6Y7P&P8Wd3`K4{A{o#F+Zclz3lPJFP|mR3`{iuz`)qSem1r?DocOa3q-`5 zI@SKBZLcq^U!FRn{UT(5=i@HG4?+;qIIno|kYvU0*3f)m zU28z=Tub?kh&EnM+7XSUafoas0fMc7j^sPj!4wS%E+cnQW1Tw{PQuR8+wh`3C<+kqrA&yUv( zQj1}^d>-fI-;~Yl$zx{j2;7ZQYB|NFgY*;YDin>XVtNLle&h@HJ?mfLeU_T8pYaT6 z8Pn;w(oA`B|Jy(R;?C8^-L8P@epYS2^XbU%L0uzj)`{dVhy47Dh+?)%Z8_qqk;EeN zaz`bD7Vakfc^NTI3o>ET1;!w^P%^P(=f}xa?wgfe<#TTDZz=Hlt2h0$8~*T@bi)?n z+RMe?sSK^z%=>knOM*$chVcYQiNr0pVK~D<0}N`Im&3v`e^9Oj=Ki4kvQPVi68DWa zXy%Ry3T?J-TE;z;H7{i&EU7oOvP4OLd-( zljt)Fg2k5hg+ulZ=5UQi17nse-f4JlaGv4*y#b`_TiJnrja4%(YrHl~78uvb7?aaQ z4nmp-44K)+z8+cU;mi##>rvX+|N)po;twR3k*vQwUKYA6+pvR$a@} zY~Qcbr%>%IIQteLXJ-^?xN*lDrY)Udf4%psVXdCSZz&l)RVE08@{zJ#85Dz@)te~;&$>!&6r zVOe@{{!JGdZbX6Xx$VY_zH8ltWvo9aeMl=;`C16*d|p0j;6dvSiTh6a`arzqa2wtC zz?*VY8-t{Rp)Du1tR3^!iF9XnVYT?ZUk~aWNmp98oq&u2yg8G-b#)vHb()60S@}5e zz!&X%8qPTV^Ov{2E347bQKxGvp^BkzIL#z^jSF#TD2zhU{s_T ziCNCIC_XsFI)Rzf^CgrcY{6W!mhzTav0)}RbFb2FwqM`9zWb>3*WKf}sKZyp=YAyq zTv8u4;TTk36}yNe{@NIqq61%8t8ecb=dxDj4ZpZ}+q)90khx&2u@_@z-8Bg-rg?#ZGp7E zc|qzKoPN!93g(w%!{>^(sXe8Jq7GdO@u5rL{X3p-xcaxutA6 z!DnZG2LhH3%GFpzCCmmiCHXdB5kLJu@c}`5XRFdB~9VlIU{ma zKVumr&k$^`CiBl|j4E+EGt}x29uic+YrlL5eP$25=nzkE>?;;cX(qVr7=*~2y^62x zhWuU*aymdfV=F+4TjgwS1s~rPKWcQbbe~TD#Ju9$HMtjf!g0~n0AF-Yc~W(O(OJA- z?WXCxUTR`;;15J6PBE}YoK^BUePzms!gBq9SRNKN>Wy$-;mq?&EOSc(E)4kv&5i4$ zl`_?z8^=xFhJ>h_JEA+Kcp~Pi83WY8B=#0$%oqcs6vmz6uQQ_(eZ1+P0us zLR7JL$tffbv;>+yiOcxmxa~gtLeh*2m4kdMQVo?=Th1~mo-k^_ZU3o5=ob=&!Ir1T zB+ExrD}f?IB;nSJqB&P={fmavrI#;``p;#3Zx8wAoZusxona<>c-K+!`uBa2qDBDC z6#8cFAwyKRvTKtVfvM%PZ@VzT?F5`aeKLff0|c_-G~ro56ghUM?$ z<-ry;>jeV_kS7jD955OBP2TmLxIuqGyx)zUz``2oG9?cm&-+t-b8~~_n_FpZeQl|c zBpRBnEpmt1yzb;+gaxQ!1h~&sc#70%!rY$;+aBrdh?+Z?TZ{#67vAh8^n7<{cMu_~ z@VYE>u=RQi_-;-*510T?yc%nbm6qxuA%o#@zA#e6q3s#lCt5~9RrybS-h+2(;=fko zD#~kbF%|8D5E9a68Ii=wTXS0I1#0ra0;mbayExS=g}an7A9|X?BPKl}TpJ{f#M9?{ z#^NK8@6&%BhLo?fvnyUz3^6baUF*u~UaEbVm}CLSNJ;j~@G1cVEiD7MCvAI0!X;%> z8A`2Ox0I&LRj_>|k`xT?NZJXrMfo|`GDQ_p7Xsp0Uc?YM9Pc>}lVy*rA<`6W z*F~_Q?AgLtCvcVswUAuHj}9=_N0i+ht2>~%*I_d^dQiSBO-)k^&k+{7;y}#kL9|Dh z@}B=N-=$%5#Rfiqmw95dNgk}N?Cmqi_ex1=OjbBEJ3DhrG&m(AGb6>lc=)w|0BY`q zjms8AyY7BfL%CpVMIcha(+CV5Cl<=|OUllj&ptB|unqRNaCIX-S{#?LCkv{~T_lsB zJ0Xnbr8{Rjf6C=dm6GBrJ?Igs_*qBe`LL^pf0PbHbH8P=Iy26{TIVPoP#SDq?pAI< zSzP61V5L1sx;N`qFP4h3k-vzGk)xOAcl89MkJc2Xiz<59 z2w)bt*rBsec+NJ%?q&qLeGcY0-_iorcrTt-HdNNon!~&-Xy8?b8`gz*$%$GLL@(l_ zG$j{cazLB+RMGe4fqEg^<=-fdTwvYT?oU1Dz!P|cuLMigG05w+t0V?=z*?YlBzEV` z`EzC(YO}9D*Nau1$Z#-mNEdjKW114TsuX8(B%?yS>4$;yts~ui%;xTxV7uMNFLx9N zAy&a{cAZ|3$MHMB9);UO_0@xW&n0rLhQg)p<~aRYjPI27k-hzAIpQt5@k!98bg>w! 
z1IYJyJ+{BBo`cA&TF+rQ?Q5aU!SwH1)zm0pcuv9`qQu;q%F=*ty)kp2t^tisBe|c$ z4fuTb2{!K_%I~P#_F>1d-G0&(=%sL~#jGX$c!g=q(TCOV>G3!{qrzoOnM8Sx36b;|&Aj3rxX%bO(1AJfpIQO!OcefVmSDOP%_#xFdO+Vtq2) zNT!d_!^ycyXNdUuYTHbKZ&2e!NQ>y+-BYN^T{z+oN-M>E(%8LA`pqr{+R{n-)yt44 zwc~L&uU`m%dla9vXK?IqTqebNKgLp&BQN)w&9@;-KTfFFeddUD=G3r z*>4XcVD~5yJn2A2U|?ZWHQ9mzSvvPlI>NWtcf0d37<+IqRHC}Awx|}$@FC#4-*AKB z8Kq4JZy@08N1@5zkiGCeG6W*pzLjR-d-cF|V^Sx8pD-&1?)JM60SE0h+3>W?7nnX& zBn<`e{puFFP^o4;>7wts5f2Dv4D$*7mtOqeaMC>u?=Trd{*DS^raMN zLo@JhRIkQR?f)Y`G5GzzwEW+!Qjd8j{rKT;V{!Oq)XCPkCJ*ijdYtqj5+UBS2ZPKl zx0Y2?8|O6AwADpHa#}0f;gR~te877}yno~;N+5sb@A--6CrS&KA(Vkn#zo@XOTQhA zvi*wh5=$?GThh?JUsq8>#=rWn@M3Pu>m0o~$&=$lz0SU10u#1AEF&G~$ZJ)(FbPLD zx3jBx^i!y{R`TY|uJny`Vp{R^njJFJG6-5fT(Nc%_RoJ;WZ3^!;&W)dyd_7Q{t%@l z96W*l4-~oZz~oI%x4PaXjfRt5Vtew`>}_fDlHRXb-i31sX2*y;yVv#$mg7u|u?9sj z_l5O^lo#Z8!;y-PmaTu85&x_YK0da85kaa!H0md}L! z*8|ZLRa?9G<&7%Au1Q;9R#rCCWXz`x;!hnLmQKHoKwZib0n>CMT*>v{kY>tTf19}* zvAv5&6_wkdzvPrD+t8pVwKTrk0J`Qb}LBRs&HVofej zWd-&Rli0XZVjnDS!D+jcl%fA6r5jvWRzphZu0TI4JWlTnkWxCpp%Ub+RhT+Y28Flr z<6N_RZ>dp}7HLcKVBN#@#hl(Kcjg0_#}J848$n(JMTg8H{#Z3w`?4UgwP_|3eg&hN z3q+y-C8sF@8e244UPPOB)rl}}zJ3n*XLzLTgE}l>2QWmUn67CY6e;i3ah|Fx^HR>#S*pB z1qr2~F<~W+HvF)lkN>Kw{{J6VBa@%Jw7gvgI@g0`XH`mQISEA@IJ9A3y%_~7&>Q6L z@|}-aQ4mg+(ClqiVbY6v0XFGkop2w{STe4S_v$qBGOHK^zCap8Y?xy>(QZ-MTMIy@f(s z+$j{NI0P$D++BiGTtWy0_qI?dUfiL$I|&ZOH8=zaQrz9OC+pj5jkEUJ-yP?UaqgXe zt?t33@_ZePox~87E#pk~LP+aWkK!G?J z9rd7w6){=?XZq3H=YNiq_Pa-;dO~r$QrYgOMk57VcWTOFbR|UkIZ5xb$%@c;{dV4H zDdyqz{Msbm zyH%BA*ciX_qv&S?R*o5M6$`B$aS$6+gNT2Kd-KuFzmM#Ho&QuJ)}S{Y$XF#a8i2+| zd6j>&l;2slqSqOp>!SaEK==GzM-Pr=WjTUP_m$O7_@}3^Cta__#roe*SJ{5y${I3k zbZ{QApT{4FA>IIB8kivk9{5kY0NMLRK?EdYq`1;*JF!Vz;U;ywJ3adgYa8%(&%MC3j8YeW`e=Q2lTp~J(^=R|=nVq?FsBdAW5^T4rcmjX z`MG3@e>o&8VLsjMcq3|!(-2x;b3f{EmRCJ{jWlftmy9siSA%GIIekCh5KyN%P%H13 zWt_w-up@e=@q|q&)eJfDRQ{jAdGPz(cN9sgVLW>dpKqFkmBF$l*@YdN<}IgzAcEe& zD{6c%7IykZSFi?G(Howl7_Uj4RmDWkA@x<>ThNu^rJh@r2C{Q8oDLb_yKP>C)grPqJ!y8}>#9TM(t+|1)&+B^SnI}UH{A^s% z1d?tjF7f}wRiT~O-TdTF{lX&j=MBFw2y>3seh-gcvFOt+_@OuCI8Xo{uH1>+ZOxtn z;ZO79F{lPbQey^y4A`RND=VA|EN}=T(9S7nBW8lG3%2I*i5G^S|6mxtE4U7laPIKn zzdrGziHWSSsNK(l5hT~g2eA;wbN|8M$Zk*5fcG4j(24GTPWg>~!gqrZ^{piPoa&=$ zGORPVJy8YV^+V4x+uJe6QZx$+MJ*eYtUQ!WgPdz}G!4_m<5p@*qZeP2z0lBQP)DG$$YJRvs)TW7jIHIYi~DRN{RyZ zy~LQ-JSh#2zDPw64=JFy{po3b6I0KE5I<4mt&2zPK5Nfd^;BcIq|%J@F!Lt!R9)v; z;LH#zd3B7k|6F|wR3hl{@tYSKG?>gu!3j{J9WBJ?L!f!)<-_Ul6*SyR=auMex#Ov@ zZ{M^KP5#fb9!_u_sff}O*1XZG9R*M}lP+AZPrWq0I&VRVpb=R9LtWm4KvR&pMexP9 zwM(pZBgkRl^4G34z9Kj;yqJ3Yx>$R6{wFXzbpUD8uy$AFh~rl}%1p2sGdzR>(7N`8 zLbrfusJx(WG~-gXvq4#%H*gr?)KkL~QgXJz?J~KRg%j~5*UF5Sf+VE|PlI+~dg4Vj zE0p6_L7MM){%a9Ji5ct;X}LsjJIdI<+98N=3M3)-0#m>&Ay! 
zcI!Z5ZiBMv!(l<3!B9ULU7*FLJ(~TCgT}$vZCzDt*k_F6a=_$cN7nkYjXUc396?E- ztmW$9hnAafwsrOiF~V6g28MdkPRWJIiR+f#rjIwQoqvrgZ16XIq#g+Iw?kv)dVee0 zZfv9Ps!Vq1s8(yw8R9~3%$k>kQxmEf);DTxvtE>uHz6CU zw;{|IiD&Br`D;bAhNoKkCRum;GabuPmPXhc+beE|Go6v|alV1G^E(eq0G}&}#UunL zYcQy;E`57@V6lZM^Vv#VlIX^wLJm5eQNSOS#z-AsDHf;eDWJ)&@`hZBO9utmCU)T& z&Ngk%iVT8f@8m}%D`%$&J0pW-GE^6MaY$-l@hJZhxTHn=+ELsIhRRKW0G6c}on`f@T)^QHC*e!$ zGSV5n*)eLV5ywFrYl>sx^7f3M1`*z;Lg0g^y{{FwmqH~PSgfA!#%XOV(d%e{9xl~1 zZu}ZGoXNZ>?0Zo0WKTP`s%q!x3)6}=6~EEO_{X(hP1@|N`B==@Su9JdaE;hzOV;C7+ce0gc5>d3k2cOZy>$*0|7gH&jVl5>+I(C?v~6d8cK74`DgT;aV0CtrWaJ&vTEOwBqeVG zq~d*+C?8ZLd><*cT~&F>B%fi%%4=-1(OukFDV`;d!H|(<4lQBK%jtuY$7$4!_MW~{ zUxQcZF=nTJfY62T=T>~^^J?(&+_0s#@u_;8RPcoFIXpB(8Mi1In9&3-Jw=dVm1Zj+gSd*7> zYBsznoYt?!3elR?^H#|elmujS^>q3%$ zfwl9eySV)+pfC$OW`2~rxk=lQoe!SH(qj|pMEg# z7_fo7Cm2h1)_2N`tuqLHz__{wOLOWcV9vMz&k?oFr4V88g_Ap>hzAa_)i7kr51#Ur z%C@C!Kd^*<%c3pGNM0bub?~6qWFF5kGDIa|nB;CAnbn|K1gc4P;au3|H|7KQ*J#rk z4DU#0f;+<@YIYs|`*IhnUhTBS%hw>L4WWAAo6;HZPElQP0Le zLz~xx-&z&K*BYH0IV;8mMCYCE`P7v;L2Z0p;Oo@@8(7l}m_4I}T7H`; z2c`7mx+pVa|0hUMJ$LW@1BA=Ip2S!^7{pa^3Ju(@{>=H5>`k|19kFv>Fh5J{O!uT( zZ6)XhQZ{I~{lhD>=&L85$eZ!`JB|bE@FobLzuTsA59YHG8_OdCJns9F@Wd_l{a1uw zM*21+HnGr5qdb{U6i4Ka6r=RYv}8R!ak>R0$HFWtzHrlBkQ;>bJpODRjQKAZXOZMK~Arv zh(l@Fq2!lRl%5C4_Q3qG_Q&DRgzHo z*27<{et%l^{q+rr1tgS94Ru`Gb4v~CswM=AW*oMuTD*;z@@u`xT#LTy;mZnSH^V*} zuTjG!JnoJQyDk-aJ$wC*qS}Zd%V#vJNXDq?9jV3Zgm>h_+LN+O&@aiw{k+CjQ&h*d YuRs0u^#0xB-yHax1AhSrLjOSgFE$m>^Z)<= literal 0 HcmV?d00001 From 198f36495e5e27cb885ca81c19c6cf9d8c5d0b9d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:28:36 +0000 Subject: [PATCH 128/154] multilora inference --- examples/multilora_inference.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index ec9b4b507bb2b..6aa25b4689ec8 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -1,5 +1,3 @@ -# flake8: noqa -# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation """ This example shows how to use the multi-LoRA functionality for offline inference. 
From ec0e89aca8052ba338559df909d3d6e100040499 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:28:57 +0000 Subject: [PATCH 129/154] offline inference with prefix --- examples/offline_inference_with_prefix.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 229ac7f0dc047..7ed0563f14e0e 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,6 +1,3 @@ -# flake8: noqa -# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation - from vllm import LLM, SamplingParams prefix = ( From e6f1cbd10551ce0860741105806a3abed4c4bca2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:29:17 +0000 Subject: [PATCH 130/154] backend request func --- benchmarks/backend_request_func.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0141cbfb472a5..58dcc6167efa6 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,7 +1,3 @@ -# flake8: noqa -# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation -# This file has been modified by Neural Magic - import json import os import sys From ca8d74ad45f596b3ab2badeb958c6397e317b39e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:30:06 +0000 Subject: [PATCH 131/154] benchmark serving --- benchmarks/benchmark_serving.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index dc7288b8b7009..f3d71de775f82 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,5 +1,3 @@ -# flake8: noqa -# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation """Benchmark online serving throughput. 
On the server side, run one of the following commands: From 5335ad9467dccd1d77efc5e8e629a279d3ff3a5b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:30:33 +0000 Subject: [PATCH 132/154] prod monitoring readme --- examples/production_monitoring/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md index ead44709d3b73..c47a06e23fffe 100644 --- a/examples/production_monitoring/README.md +++ b/examples/production_monitoring/README.md @@ -21,7 +21,7 @@ Launch Prometheus and Grafana servers with `docker compose`: docker compose up ``` -Submit some sample requests to the server (after `pip install aiohttp`): +Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json From 611cfed781438fa475070655171eb052ae24567a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:35:30 +0000 Subject: [PATCH 133/154] format --- setup.py | 3 ++- tests/kernels/test_cutlass.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index dfc88e784b4ea..ab42020b8c450 100644 --- a/setup.py +++ b/setup.py @@ -412,7 +412,8 @@ def _read_requirements(filename: str) -> List[str]: _sparsity_deps = ["nm-magic-wand-nightly"] nm_release_type = os.getenv(NM_RELEASE_TYPE) if nm_release_type == 'RELEASE': - # gate magic-wand version in nm-vllm for release; for nightly, we always install the latest + # Gate magic-wand version in nm-vllm for release; + # For nightly, we always install the latest magic_wand_version_dep = "0.2.2" _sparsity_deps = [f"nm-magic-wand~={magic_wand_version_dep}"] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 04897029b93f8..d5e9c258925c8 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -92,8 +92,8 @@ def cutlass_int8_gemm_helper(m: int, # automation system yet. @pytest.mark.skipif(capability < 90, reason="FP8 cutlass is not supported on this GPU " - "type because we need CUDA 12.4 + we do " - "not have this in automation yet.") + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, per_out_ch: bool): cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch) @@ -126,8 +126,8 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, # automation system yet. @pytest.mark.skipif(capability < 90, reason="FP8 cutlass is not supported on this GPU " - "type because we need CUDA 12.4 + we do " - "not have this in automation yet.") + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, out_dtype: Type[torch.dtype]): cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, @@ -142,8 +142,8 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, # automation system yet. 
@pytest.mark.skipif(capability < 90, reason="FP8 cutlass is not supported on this GPU " - "type because we need CUDA 12.4 + we do " - "not have this in automation yet.") + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool, device: str): cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, @@ -171,8 +171,8 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool, # automation system yet. @pytest.mark.skipif(capability < 90, reason="FP8 cutlass is not supported on this GPU " - "type because we need CUDA 12.4 + we do " - "not have this in automation yet.") + "type because we need CUDA 12.4 + we do " + "not have this in automation yet.") def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool): for nk in range(32, 128, 32): for m in range(1, 128): From 73132a51c7ff17bd93088ee95ecfc1557200303a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:51:57 +0000 Subject: [PATCH 134/154] fix benchmark issue - internal method changed --- neuralmagic/benchmarks/scripts/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index be9296de6980f..54ac420df4dc3 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -5,12 +5,13 @@ import json import random from pathlib import Path -from typing import List, Tuple +from typing import List, Tuple, cast from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams from vllm import __version__ as __vllm_version__ +from vllm.inputs import PromptStrictInputs from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer @@ -139,8 +140,7 @@ def warmup_vllm_engine(engine: LLM, max_tokens=output_len, ) engine._add_request( - prompt=prompt, - prompt_token_ids=None, + inputs=cast(PromptStrictInputs, prompt), params=sampling_params, ) From 7f5c71577bb7c2454b4544ae8448408aaaf68a3f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 9 Jun 2024 22:53:31 +0000 Subject: [PATCH 135/154] removed skip for remote push edits --- neuralmagic/tests/skip-for-remote-push-tmp.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/neuralmagic/tests/skip-for-remote-push-tmp.txt b/neuralmagic/tests/skip-for-remote-push-tmp.txt index 860b4d025d196..dd39bb8e64d19 100644 --- a/neuralmagic/tests/skip-for-remote-push-tmp.txt +++ b/neuralmagic/tests/skip-for-remote-push-tmp.txt @@ -1,6 +1,16 @@ tests/test_sharded_state_loader.py tests/test_sequence.py tests/metrics/test_metrics.py +tests/kernels/test_prefix_prefill.py +tests/kernels/test_pos_encoding.py +tests/kernels/test_activation.py +tests/kernels/test_moe.py +tests/kernels/test_layernorm.py +tests/kernels/test_attention.py +tests/kernels/test_rand.py +tests/kernels/test_cache.py +tests/kernels/test_sampler.py +tests/kernels/test_cutlass.py tests/core/test_block_manager.py tests/core/test_chunked_prefill_scheduler.py tests/core/test_scheduler.py From 437912ebf69584f7a2c629d8b5e8cdbb4977f8e0 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 10 Jun 2024 00:17:24 +0000 Subject: [PATCH 136/154] update internal method in benchmark throughput too --- neuralmagic/benchmarks/scripts/benchmark_throughput.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py 
b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index 0607067e3817d..f49de1ec27a3f 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -10,10 +10,12 @@ import time from datetime import datetime from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, cast from transformers import AutoTokenizer +from vllm.inputs import PromptStrictInputs + from .common import (generate_synthetic_requests, num_available_gpus, print_request_outputs, warmup_vllm_engine) from .datasets_registry import DatasetArgs, get_dataset @@ -77,8 +79,7 @@ def run_vllm(requests: List[Tuple[str, int, int]], ) # FIXME(woosuk): Do not use internal method. llm._add_request( - prompt=prompt, - prompt_token_ids=None, + inputs=cast(PromptStrictInputs, prompt), params=sampling_params, ) From c754d5a3b6c26c830cd8670d74d5a23162509db7 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 10 Jun 2024 12:11:58 +0000 Subject: [PATCH 137/154] skip sharded state loader - hanging in automation --- tests/test_sharded_state_loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 426110b9d7e38..f8de721f8574f 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -43,14 +43,13 @@ def test_filter_subtensors(): assert tensor.equal(state_dict[key]) -# @pytest.mark.skip("OOM in NM Automation") +@pytest.mark.skip("Timeout error in NM automation. Work to re-enable.") @pytest.mark.parametrize("enable_lora", [False, True]) def test_sharded_state_loader(enable_lora): weights_patterns = ("*.bin", "*.pt", "*.safetensors") with TemporaryDirectory() as cache_dir, TemporaryDirectory() as output_dir: - # input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", - input_dir = snapshot_download("TinyLlama/TinyLlama-1.1B-Chat-v1.0", + input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", cache_dir=cache_dir) llm = LLM( From 2bf55cd8e08fc99b6ad488d32ec10c081a59e677 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 10 Jun 2024 12:13:54 +0000 Subject: [PATCH 138/154] skip entrypoints tests in remote-push - too long --- neuralmagic/tests/test_skip_env_vars/remote-push.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt index 121e15653211e..a5daa4a182593 100644 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -4,7 +4,7 @@ TEST_BASIC_CORRECTNESS=1 TEST_CORE=1 TEST_DISTRIBUTED=0 TEST_ENGINE=1 -TEST_ENTRYPOINTS=1 +TEST_ENTRYPOINTS=0 TEST_KERNELS=0 TEST_LORA=0 TEST_METRICS=1 From 265789116e84b3c2a573a786868448d2c953df1f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 10 Jun 2024 12:15:29 +0000 Subject: [PATCH 139/154] cleanup TEST_ALL_MODELS comment --- tests/models/test_aqlm.py | 2 +- tests/models/test_big_models.py | 2 +- tests/models/test_embedding.py | 2 +- tests/models/test_fp8.py | 2 +- tests/models/test_gptq_marlin.py | 2 +- tests/models/test_gptq_marlin_24.py | 2 +- tests/models/test_llava.py | 2 +- tests/models/test_marlin.py | 2 +- tests/models/test_mistral.py | 2 +- tests/models/test_models.py | 2 +- tests/models/test_models_logprobs.py | 2 +- tests/models/test_oot_registration.py | 2 +- tests/models/test_registry.py | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) 
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index b322fbf15b561..200f00e0b9929 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 31782f4ff432d..adbb09e9b4d5f 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 8db5881ea6887..cee279f5f5622 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -9,7 +9,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index f83d9cfb7adc1..17e417ff9fb27 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 870be32f5442b..374af8adda6ba 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index b9c6650492861..9c0e9905fd2ab 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 1669bed43dd24..b3e378d55c72e 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -11,7 +11,7 @@ from vllm.config import VisionLanguageConfig if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) model_and_vl_config = [ diff --git 
a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 6475cdd97cea0..0c46995e532bd 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 57cd9720519ce..d56d523688470 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -9,7 +9,7 @@ from .utils import check_logprobs_close if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 330ab094b8406..ac170dfda2beb 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 0e3e5eb6dbdee..df449f4400006 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -8,7 +8,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 0c81b244e334c..c6713831e87e0 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -7,7 +7,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index f563e264f9280..0d0dde3b82339 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -4,7 +4,7 @@ from vllm.model_executor.models import _MODELS, ModelRegistry if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_ALL_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=0, skipping model test group", allow_module_level=True) From 389bdcd9f6c21cd85d2ac052bc84ebeab12e4605 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 10 Jun 2024 12:17:24 +0000 Subject: [PATCH 140/154] skip samplers during remote push --- neuralmagic/tests/test_skip_env_vars/remote-push.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt index a5daa4a182593..79c1bec433a6d 100644 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ b/neuralmagic/tests/test_skip_env_vars/remote-push.txt @@ -12,7 +12,7 @@ TEST_MODELS=0 TEST_MODELS_CORE=1 TEST_PREFIX_CACHING=1 
TEST_QUANTIZATION=1 -TEST_SAMPLERS=1 +TEST_SAMPLERS=0 TEST_SPEC_DECODE=0 TEST_TENSORIZER_LOADER=0 TEST_TOKENIZATION=1 From 5dd3f5d3c78995aa3ae1113964e5d6698291f5dd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:27:49 +0000 Subject: [PATCH 141/154] cleanup newline nit --- .github/actions/nm-set-env-test-skip/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/nm-set-env-test-skip/action.yml b/.github/actions/nm-set-env-test-skip/action.yml index c36f857c097f4..fb84bc3a6ef9e 100644 --- a/.github/actions/nm-set-env-test-skip/action.yml +++ b/.github/actions/nm-set-env-test-skip/action.yml @@ -13,4 +13,3 @@ runs: env: ENV_VAR_FILE: ${{ inputs.test_skip_env_vars }} shell: bash - \ No newline at end of file From a475844315bbc7789cda3fcee640cb154b5685e5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:32:16 +0000 Subject: [PATCH 142/154] switch to enable / disable --- .../tests/test_skip_env_vars/nightly.txt | 19 ----- .../tests/test_skip_env_vars/release.txt | 19 ----- .../tests/test_skip_env_vars/remote-push.txt | 19 ----- .../tests/test_skip_env_vars/weekly.txt | 19 ----- tests/nm_utils/utils_skip.py | 78 +++++++++---------- 5 files changed, 39 insertions(+), 115 deletions(-) delete mode 100644 neuralmagic/tests/test_skip_env_vars/nightly.txt delete mode 100644 neuralmagic/tests/test_skip_env_vars/release.txt delete mode 100644 neuralmagic/tests/test_skip_env_vars/remote-push.txt delete mode 100644 neuralmagic/tests/test_skip_env_vars/weekly.txt diff --git a/neuralmagic/tests/test_skip_env_vars/nightly.txt b/neuralmagic/tests/test_skip_env_vars/nightly.txt deleted file mode 100644 index 121e15653211e..0000000000000 --- a/neuralmagic/tests/test_skip_env_vars/nightly.txt +++ /dev/null @@ -1,19 +0,0 @@ -TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=1 -TEST_BASIC_CORRECTNESS=1 -TEST_CORE=1 -TEST_DISTRIBUTED=0 -TEST_ENGINE=1 -TEST_ENTRYPOINTS=1 -TEST_KERNELS=0 -TEST_LORA=0 -TEST_METRICS=1 -TEST_MODELS=0 -TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=1 -TEST_QUANTIZATION=1 -TEST_SAMPLERS=1 -TEST_SPEC_DECODE=0 -TEST_TENSORIZER_LOADER=0 -TEST_TOKENIZATION=1 -TEST_WORKER=1 diff --git a/neuralmagic/tests/test_skip_env_vars/release.txt b/neuralmagic/tests/test_skip_env_vars/release.txt deleted file mode 100644 index 89d5e9eb5b2e4..0000000000000 --- a/neuralmagic/tests/test_skip_env_vars/release.txt +++ /dev/null @@ -1,19 +0,0 @@ -TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=1 -TEST_BASIC_CORRECTNESS=1 -TEST_CORE=1 -TEST_DISTRIBUTED=0 -TEST_ENGINE=1 -TEST_ENTRYPOINTS=1 -TEST_KERNELS=0 -TEST_LORA=1 -TEST_METRICS=1 -TEST_MODELS=1 -TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=1 -TEST_QUANTIZATION=1 -TEST_SAMPLERS=1 -TEST_SPEC_DECODE=0 -TEST_TENSORIZER_LOADER=1 -TEST_TOKENIZATION=1 -TEST_WORKER=1 diff --git a/neuralmagic/tests/test_skip_env_vars/remote-push.txt b/neuralmagic/tests/test_skip_env_vars/remote-push.txt deleted file mode 100644 index 79c1bec433a6d..0000000000000 --- a/neuralmagic/tests/test_skip_env_vars/remote-push.txt +++ /dev/null @@ -1,19 +0,0 @@ -TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=1 -TEST_BASIC_CORRECTNESS=1 -TEST_CORE=1 -TEST_DISTRIBUTED=0 -TEST_ENGINE=1 -TEST_ENTRYPOINTS=0 -TEST_KERNELS=0 -TEST_LORA=0 -TEST_METRICS=1 -TEST_MODELS=0 -TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=1 -TEST_QUANTIZATION=1 -TEST_SAMPLERS=0 -TEST_SPEC_DECODE=0 -TEST_TENSORIZER_LOADER=0 -TEST_TOKENIZATION=1 -TEST_WORKER=1 diff --git a/neuralmagic/tests/test_skip_env_vars/weekly.txt b/neuralmagic/tests/test_skip_env_vars/weekly.txt deleted file mode 100644 index 
89d5e9eb5b2e4..0000000000000 --- a/neuralmagic/tests/test_skip_env_vars/weekly.txt +++ /dev/null @@ -1,19 +0,0 @@ -TEST_ACCURACY=0 -TEST_ASYNC_ENGINE=1 -TEST_BASIC_CORRECTNESS=1 -TEST_CORE=1 -TEST_DISTRIBUTED=0 -TEST_ENGINE=1 -TEST_ENTRYPOINTS=1 -TEST_KERNELS=0 -TEST_LORA=1 -TEST_METRICS=1 -TEST_MODELS=1 -TEST_MODELS_CORE=1 -TEST_PREFIX_CACHING=1 -TEST_QUANTIZATION=1 -TEST_SAMPLERS=1 -TEST_SPEC_DECODE=0 -TEST_TENSORIZER_LOADER=1 -TEST_TOKENIZATION=1 -TEST_WORKER=1 diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index d6a5bfb618830..f1a9bd7d9e4f1 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -8,103 +8,103 @@ def should_skip_accuracy_test_group(): - TEST_ACCURACY = os.getenv("TEST_ACCURACY", "1") - return TEST_ACCURACY == "0" + TEST_ACCURACY = os.getenv("TEST_ACCURACY", "ENABLE") + return TEST_ACCURACY == "DISABLE" def should_skip_async_engine_test_group(): - TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "1") - return TEST_ASYNC_ENGINE == "0" + TEST_ASYNC_ENGINE = os.getenv("TEST_ASYNC_ENGINE", "ENABLE") + return TEST_ASYNC_ENGINE == "DISABLE" def should_skip_basic_correctness_test_group(): - TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "1") - return TEST_BASIC_CORRECTNESS == "0" + TEST_BASIC_CORRECTNESS = os.getenv("TEST_BASIC_CORRECTNESS", "ENABLE") + return TEST_BASIC_CORRECTNESS == "DISABLE" def should_skip_core_test_group(): - TEST_CORE = os.getenv("TEST_CORE", "1") - return TEST_CORE == "0" + TEST_CORE = os.getenv("TEST_CORE", "ENABLE") + return TEST_CORE == "DISABLE" def should_skip_distributed_test_group(): - TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "1") - return TEST_DISTRIBUTED == "0" + TEST_DISTRIBUTED = os.getenv("TEST_DISTRIBUTED", "ENABLE") + return TEST_DISTRIBUTED == "DISABLE" def should_skip_engine_test_group(): - TEST_ENGINE = os.getenv("TEST_ENGINE", "1") - return TEST_ENGINE == "0" + TEST_ENGINE = os.getenv("TEST_ENGINE", "ENABLE") + return TEST_ENGINE == "DISABLE" def should_skip_entrypoints_test_group(): - TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "1") - return TEST_ENTRYPOINTS == "0" + TEST_ENTRYPOINTS = os.getenv("TEST_ENTRYPOINTS", "ENABLE") + return TEST_ENTRYPOINTS == "DISABLE" def should_skip_kernels_test_groups(): - TEST_KERNELS = os.getenv("TEST_KERNELS", "1") - return TEST_KERNELS == "0" + TEST_KERNELS = os.getenv("TEST_KERNELS", "ENABLE") + return TEST_KERNELS == "DISABLE" def should_skip_lora_test_group(): - TEST_LORA = os.getenv("TEST_LORA", "1") - return TEST_LORA == "0" + TEST_LORA = os.getenv("TEST_LORA", "ENABLE") + return TEST_LORA == "DISABLE" def should_skip_metrics_test_group(): - TEST_METRICS = os.getenv("TEST_METRICS", "1") - return TEST_METRICS == "0" + TEST_METRICS = os.getenv("TEST_METRICS", "ENABLE") + return TEST_METRICS == "DISABLE" def should_skip_model_executor_test_group(): - TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "1") - return TEST_MODEL_EXECUTOR == "0" + TEST_MODEL_EXECUTOR = os.getenv("TEST_MODEL_EXECUTOR", "ENABLE") + return TEST_MODEL_EXECUTOR == "DISABLE" def should_skip_models_test_group(): - TEST_MODELS = os.getenv("TEST_MODELS", "0") - return TEST_MODELS != "1" + TEST_MODELS = os.getenv("TEST_MODELS", "ENABLE") + return TEST_MODELS == "DISABLE" def should_skip_models_core_test_group(): - TEST_MODELS_CORE = os.getenv("TEST_MODELS_CORE", "0") - return TEST_MODELS_CORE != "1" + TEST_MODELS_CORE = os.getenv("TEST_MODELS_CORE", "ENABLE") + return TEST_MODELS_CORE == "DISABLE" def should_skip_prefix_caching_test_group(): 
- TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "0") - return TEST_PREFIX_CACHING != "1" + TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "ENABLE") + return TEST_PREFIX_CACHING == "DISABLE" def should_skip_quantization_test_group(): - TEST_QUANTIZATION = os.getenv("TEST_QUANTIZATION", "0") - return TEST_QUANTIZATION != "1" + TEST_QUANTIZATION = os.getenv("TEST_QUANTIZATION", "ENABLE") + return TEST_QUANTIZATION == "DISABLE" def should_skip_samplers_test_group(): - TEST_SAMPLERS = os.getenv("TEST_SAMPLERS", "0") - return TEST_SAMPLERS != "1" + TEST_SAMPLERS = os.getenv("TEST_SAMPLERS", "ENABLE") + return TEST_SAMPLERS == "DISABLE" def should_skip_spec_decode_test_group(): - TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "0") - return TEST_SPEC_DECODE != "1" + TEST_SPEC_DECODE = os.getenv("TEST_SPEC_DECODE", "ENABLE") + return TEST_SPEC_DECODE == "DISABLE" def should_skip_tensorizer_loader_test_group(): - TEST_TENSORIZER_LOADER = os.getenv("TEST_TENSORIZER_LOADER", "0") - return TEST_TENSORIZER_LOADER != "1" + TEST_TENSORIZER_LOADER = os.getenv("TEST_TENSORIZER_LOADER", "ENABLE") + return TEST_TENSORIZER_LOADER == "DISABLE" def should_skip_tokenization_test_group(): - TEST_TOKENIZATION = os.getenv("TEST_TOKENIZATION", "0") - return TEST_TOKENIZATION != "1" + TEST_TOKENIZATION = os.getenv("TEST_TOKENIZATION", "ENABLE") + return TEST_TOKENIZATION == "DISABLE" def should_skip_worker_test_group(): - TEST_WORKER = os.getenv("TEST_WORKER", "0") - return TEST_WORKER != "1" + TEST_WORKER = os.getenv("TEST_WORKER", "ENABLE") + return TEST_WORKER == "DISABLE" MAP = { From 397cfe219ca7520f071f6866d8ca1df4430d8908 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:32:29 +0000 Subject: [PATCH 143/154] readded --- neuralmagic/tests/test_skip_env_vars/full.txt | 19 +++++++++++++++++++ .../tests/test_skip_env_vars/smoke.txt | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 neuralmagic/tests/test_skip_env_vars/full.txt create mode 100644 neuralmagic/tests/test_skip_env_vars/smoke.txt diff --git a/neuralmagic/tests/test_skip_env_vars/full.txt b/neuralmagic/tests/test_skip_env_vars/full.txt new file mode 100644 index 0000000000000..9c6f69cacd225 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/full.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=ENABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=ENABLE +TEST_KERNELS=ENABLE +TEST_LORA=ENABLE +TEST_METRICS=ENABLE +TEST_MODELS=ENABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=ENABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=ENABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE diff --git a/neuralmagic/tests/test_skip_env_vars/smoke.txt b/neuralmagic/tests/test_skip_env_vars/smoke.txt new file mode 100644 index 0000000000000..5c5066aaee391 --- /dev/null +++ b/neuralmagic/tests/test_skip_env_vars/smoke.txt @@ -0,0 +1,19 @@ +TEST_ACCURACY=DISABLE +TEST_ASYNC_ENGINE=ENABLE +TEST_BASIC_CORRECTNESS=DISABLE +TEST_CORE=ENABLE +TEST_DISTRIBUTED=DISABLE +TEST_ENGINE=ENABLE +TEST_ENTRYPOINTS=DISABLE +TEST_KERNELS=DISABLE +TEST_LORA=DISABLE +TEST_METRICS=ENABLE +TEST_MODELS=DISABLE +TEST_MODELS_CORE=ENABLE +TEST_PREFIX_CACHING=ENABLE +TEST_QUANTIZATION=ENABLE +TEST_SAMPLERS=DISABLE +TEST_SPEC_DECODE=DISABLE +TEST_TENSORIZER_LOADER=DISABLE +TEST_TOKENIZATION=ENABLE +TEST_WORKER=ENABLE From 8c6d1f32b5fae2b6c0ad0c38145b3643281ac07f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:33:54 +0000 Subject: [PATCH
144/154] convert workflows to use new files --- .github/workflows/nm-nightly.yml | 8 ++++---- .github/workflows/nm-release.yml | 8 ++++---- .github/workflows/nm-remote-push.yml | 8 ++++---- .github/workflows/nm-weekly.yml | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 67c99e3a86ed8..b3ed3e42c251a 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -27,7 +27,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -45,7 +45,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -63,7 +63,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -81,7 +81,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/nightly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml index 9db1a402678a1..f5c9056cbc5d7 100644 --- a/.github/workflows/nm-release.yml +++ b/.github/workflows/nm-release.yml @@ -23,7 +23,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -41,7 +41,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -59,7 +59,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt @@ -77,7 +77,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/release.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt diff --git 
a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 7142128539dd9..3c1fe246756a4 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -21,7 +21,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -37,7 +37,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -53,7 +53,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt @@ -69,7 +69,7 @@ jobs: test_label_solo: gcp-k8s-l4-solo test_label_multi: ignore test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/remote-push.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml index d56e9687cb461..d92a2619ef359 100644 --- a/.github/workflows/nm-weekly.yml +++ b/.github/workflows/nm-weekly.yml @@ -27,7 +27,7 @@ jobs: test_label_solo: aws-avx2-32G-a10g-24G test_label_multi: aws-avx2-192G-4-a10g-96G test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/weekly.txt + test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: aws-avx2-32G-a10g-24G benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt From e093e6175891665e4015eaf4596f73e715de59e2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:43:16 +0000 Subject: [PATCH 145/154] updated each comment --- tests/accuracy/test_lm_eval_correctness.py | 2 +- tests/async_engine/test_api_server.py | 2 +- tests/async_engine/test_async_llm_engine.py | 2 +- tests/async_engine/test_chat_template.py | 2 +- tests/async_engine/test_openapi_server_ray.py | 2 +- tests/async_engine/test_request_tracker.py | 2 +- tests/basic_correctness/test_basic_correctness.py | 2 +- tests/basic_correctness/test_chunked_prefill.py | 2 +- tests/basic_correctness/test_preemption.py | 2 +- tests/core/test_block_manager.py | 2 +- tests/core/test_chunked_prefill_scheduler.py | 2 +- tests/core/test_scheduler.py | 2 +- tests/distributed/test_basic_distributed_correctness.py | 2 +- tests/distributed/test_chunked_prefill_distributed.py | 2 +- tests/distributed/test_comm_ops.py | 2 +- tests/distributed/test_custom_all_reduce.py | 2 +- tests/distributed/test_pynccl.py | 2 +- tests/engine/output_processor/test_multi_step.py | 2 +- tests/engine/output_processor/test_stop_checker.py | 2 +- tests/engine/test_computed_prefix_blocks.py | 2 +- tests/engine/test_detokenization.py | 2 +- tests/engine/test_multiproc_workers.py | 2 +- tests/engine/test_skip_tokenizer_init.py | 2 +- 
tests/engine/test_stop_reason.py | 2 +- tests/engine/test_stop_strings.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 2 +- tests/entrypoints/test_guided_processors.py | 2 +- tests/entrypoints/test_llm_encode.py | 2 +- tests/entrypoints/test_llm_generate.py | 2 +- tests/entrypoints/test_openai_run_batch.py | 2 +- tests/entrypoints/test_openai_server.py | 2 +- tests/entrypoints/test_server_oot_registration.py | 2 +- tests/kernels/test_activation.py | 2 +- tests/kernels/test_attention.py | 2 +- tests/kernels/test_attention_selector.py | 2 +- tests/kernels/test_blocksparse_attention.py | 2 +- tests/kernels/test_cache.py | 2 +- tests/kernels/test_cutlass.py | 2 +- tests/kernels/test_flash_attn.py | 2 +- tests/kernels/test_int8_quant.py | 2 +- tests/kernels/test_layernorm.py | 2 +- tests/kernels/test_marlin_gemm.py | 2 +- tests/kernels/test_moe.py | 2 +- tests/kernels/test_pos_encoding.py | 2 +- tests/kernels/test_prefix_prefill.py | 2 +- tests/kernels/test_rand.py | 2 +- tests/kernels/test_sampler.py | 2 +- tests/lora/test_baichuan.py | 2 +- tests/lora/test_chatglm3.py | 2 +- tests/lora/test_gemma.py | 2 +- tests/lora/test_layer_variation.py | 2 +- tests/lora/test_layers.py | 2 +- tests/lora/test_llama.py | 2 +- tests/lora/test_long_context.py | 2 +- tests/lora/test_lora.py | 2 +- tests/lora/test_lora_checkpoints.py | 2 +- tests/lora/test_lora_manager.py | 2 +- tests/lora/test_mixtral.py | 2 +- tests/lora/test_phi.py | 2 +- tests/lora/test_punica.py | 2 +- tests/lora/test_quant_model.py | 2 +- tests/lora/test_tokenizer_group.py | 2 +- tests/lora/test_utils.py | 2 +- tests/lora/test_worker.py | 2 +- tests/metrics/test_metrics.py | 2 +- tests/model_executor/weight_utils.py | 2 +- tests/models/test_aqlm.py | 2 +- tests/models/test_big_models.py | 2 +- tests/models/test_embedding.py | 2 +- tests/models/test_fp8.py | 2 +- tests/models/test_gptq_marlin.py | 2 +- tests/models/test_gptq_marlin_24.py | 2 +- tests/models/test_llava.py | 2 +- tests/models/test_marlin.py | 2 +- tests/models/test_mistral.py | 2 +- tests/models/test_models.py | 2 +- tests/models/test_models_logprobs.py | 2 +- tests/models/test_oot_registration.py | 2 +- tests/models/test_registry.py | 2 +- tests/models_core/test_llm_logprobs.py | 2 +- tests/models_core/test_magic_wand.py | 2 +- tests/models_core/test_server_logprobs.py | 2 +- tests/prefix_caching/test_disable_sliding_window.py | 2 +- tests/prefix_caching/test_prefix_caching.py | 2 +- tests/quantization/test_compressed_tensors.py | 2 +- tests/quantization/test_configs.py | 2 +- tests/quantization/test_fp8.py | 2 +- tests/samplers/test_beam_search.py | 2 +- tests/samplers/test_ignore_eos.py | 2 +- tests/samplers/test_logits_processor.py | 2 +- tests/samplers/test_logprobs.py | 2 +- tests/samplers/test_ranks.py | 2 +- tests/samplers/test_rejection_sampler.py | 2 +- tests/samplers/test_sampler.py | 2 +- tests/samplers/test_seeded_generate.py | 2 +- tests/spec_decode/e2e/test_compatibility.py | 2 +- tests/spec_decode/e2e/test_integration.py | 2 +- tests/spec_decode/e2e/test_integration_dist.py | 2 +- tests/spec_decode/e2e/test_logprobs.py | 2 +- tests/spec_decode/e2e/test_multistep_correctness.py | 2 +- tests/spec_decode/e2e/test_ngram_correctness.py | 2 +- tests/spec_decode/test_batch_expansion.py | 2 +- tests/spec_decode/test_dynamic_spec_decode.py | 2 +- tests/spec_decode/test_metrics.py | 2 +- tests/spec_decode/test_multi_step_worker.py | 2 +- tests/spec_decode/test_ngram_worker.py | 2 +- tests/spec_decode/test_spec_decode_worker.py | 2 +- 
tests/spec_decode/test_utils.py | 2 +- tests/tensorizer_loader/test_tensorizer.py | 2 +- tests/tokenization/test_cached_tokenizer.py | 2 +- tests/tokenization/test_detokenize.py | 2 +- tests/tokenization/test_tokenizer.py | 2 +- tests/tokenization/test_tokenizer_group.py | 2 +- tests/worker/test_model_runner.py | 2 +- tests/worker/test_swap.py | 2 +- 115 files changed, 115 insertions(+), 115 deletions(-) diff --git a/tests/accuracy/test_lm_eval_correctness.py b/tests/accuracy/test_lm_eval_correctness.py index 4539011916051..4c1ac9638a10a 100644 --- a/tests/accuracy/test_lm_eval_correctness.py +++ b/tests/accuracy/test_lm_eval_correctness.py @@ -11,7 +11,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ACCURACY"): - pytest.skip("TEST_ACCURACY=0, skipping accuracy test group", + pytest.skip("TEST_ACCURACY=DISABLE, skipping accuracy test group", allow_module_level=True) if TYPE_CHECKING: diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index e08956836fbbc..e2cddf228cce7 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -10,7 +10,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", allow_module_level=True) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index efa0e62a0e218..77801437e7581 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -7,7 +7,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", allow_module_level=True) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 0eaba87444bfb..5e21ed2061a89 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", allow_module_level=True) chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 3de088f798da3..776dc45fa9a56 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -8,7 +8,7 @@ from tests.utils import ServerRunner if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", allow_module_level=True) # any model with a chat template should work here diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 010962f4c293e..d217db1ba7068 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -5,7 +5,7 @@ from vllm.outputs import RequestOutput if 
should_skip_test_group(group_name="TEST_ASYNC_ENGINE"): - pytest.skip("TEST_ASYNC_ENGINE=0, skipping async engine test group", + pytest.skip("TEST_ASYNC_ENGINE=DISABLE, skipping async engine test group", allow_module_level=True) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 74c03455a9ec1..43bb230f53700 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -12,7 +12,7 @@ if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): pytest.skip( - "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", allow_module_level=True) MODELS = [ diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index afa9a6a89dcce..47ace46e1abdc 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -12,7 +12,7 @@ if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): pytest.skip( - "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", allow_module_level=True) MODELS = [ diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 5ef2d5ddd9b37..5606085dae865 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -15,7 +15,7 @@ if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"): pytest.skip( - "TEST_BASIC_CORRECTNESS=0, skipping basic correctness test group", + "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group", allow_module_level=True) MODELS = [ diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index ee96b88286f2c..17c7f91f01eed 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -18,7 +18,7 @@ from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index e01efaa1f376d..8fce7b4364298 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -12,7 +12,7 @@ from .utils import create_dummy_prompt if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index aa47bc89b7b90..b7960435a6d69 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -16,7 +16,7 @@ from .utils import create_dummy_prompt if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 7ff61e4520583..de2aedb46e6c1 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -24,7 +24,7 @@ 
from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_DISTRIBUTED"): - pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) MODELS = [ diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index bf4921c1b6a14..28fb1bcb644c3 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -23,7 +23,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_DISTRIBUTED"): - pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) MODELS = [ diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index b285c0853cf14..894938e874092 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -16,7 +16,7 @@ tensor_model_parallel_all_reduce) if should_skip_test_group(group_name="TEST_DISTRIBUTED"): - pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index bf40654efe8e9..5f77ed7539979 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -15,7 +15,7 @@ get_tp_ca_communicator) if should_skip_test_group(group_name="TEST_DISTRIBUTED"): - pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) random.seed(42) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 41f20fcb98c65..b5dca7f8a82fd 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -15,7 +15,7 @@ from vllm.utils import update_environment_variables if should_skip_test_group(group_name="TEST_DISTRIBUTED"): - pytest.skip("TEST_DISTRIBUTED=0, skipping distributed test group", + pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group", allow_module_level=True) diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 2b419842f40d2..99189af1b0076 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -17,7 +17,7 @@ from ...core.utils import create_seq_group if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index 7837fd2cec84b..e6af7a3257d44 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -9,7 +9,7 @@ from vllm.sequence import Logprob, Sequence, SequenceStatus if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git 
a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index bbcc07bfb54f5..1f2c7fd14c4cc 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -6,7 +6,7 @@ from vllm.sampling_params import SamplingParams if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index 8b55d711b0975..ae1d5779d72c1 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -5,7 +5,7 @@ from vllm.sampling_params import SamplingParams if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index a103a5fee4477..8b3f68c843cc1 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -11,7 +11,7 @@ ResultHandler, WorkerMonitor) if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 418c9e9566a06..438ae0fc71477 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -5,7 +5,7 @@ from vllm.sampling_params import SamplingParams if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index 2420111f6329e..491849ef2eeff 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -13,7 +13,7 @@ from vllm import SamplingParams if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) MODEL = "facebook/opt-350m" diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index c6456a4c5090c..a416419822cbb 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -6,7 +6,7 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams if should_skip_test_group(group_name="TEST_ENGINE"): - pytest.skip("TEST_ENGINE=0, skipping distributed test group", + pytest.skip("TEST_ENGINE=DISABLE, skipping engine test group", allow_module_level=True) MODEL = "meta-llama/llama-2-7b-hf" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 1033f02bea771..e3168f67e001f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -7,7 +7,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) MODEL_NAME = 
"openai-community/gpt2" diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index e4e596898c940..3e6fa703865db 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -12,7 +12,7 @@ JSONLogitsProcessor, RegexLogitsProcessor) if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) TEST_SCHEMA = { diff --git a/tests/entrypoints/test_llm_encode.py b/tests/entrypoints/test_llm_encode.py index c142b242bba1a..12a0a1a269ede 100644 --- a/tests/entrypoints/test_llm_encode.py +++ b/tests/entrypoints/test_llm_encode.py @@ -7,7 +7,7 @@ from vllm import LLM, EmbeddingRequestOutput, PoolingParams if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) from ..conftest import cleanup diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index 1cea00a96e950..96b47fb5e170b 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -9,7 +9,7 @@ from ..conftest import cleanup if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) MODEL_NAME = "facebook/opt-125m" diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/test_openai_run_batch.py index 2068d5e878623..6ce7bc08b6cb2 100644 --- a/tests/entrypoints/test_openai_run_batch.py +++ b/tests/entrypoints/test_openai_run_batch.py @@ -8,7 +8,7 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) # ruff: noqa: E501 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e554bd10ef3a2..8e98b96e9ebdd 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) # any model with a chat template should work here diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 1d8e69b4b3aec..394594fcbf085 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -12,7 +12,7 @@ from vllm.utils import get_open_port if should_skip_test_group(group_name="TEST_ENTRYPOINTS"): - pytest.skip("TEST_ENTRYPOINTS=0, skipping entrypoints group", + pytest.skip("TEST_ENTRYPOINTS=DISABLE, skipping entrypoints group", allow_module_level=True) pytestmark = pytest.mark.openai diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 6ce27d6097d48..e0824937081b8 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -10,7 +10,7 @@ from 
.allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index cacd06af072fe..458226ce38ccd 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -13,7 +13,7 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 03a71949559d1..36146fc4d8a99 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -8,7 +8,7 @@ from vllm.attention.selector import which_attn_to_use if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 8f2151a958404..8a4b7e62a053b 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -13,7 +13,7 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index c0413b0d56ac1..f7aec1cb5b677 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -8,7 +8,7 @@ from vllm import _custom_ops as ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index f1a0d47f90220..9ff434dd49e3f 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,7 +11,7 @@ from vllm import _custom_ops as ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) CUDA_DEVICES = [ diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index e218dd3cbd345..4437a5ddc8d7a 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -7,7 +7,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) NUM_HEADS = [(16, 16), (32, 8), (64, 8)] diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 87fff0753e9a3..1eee2b9652bb5 100644 --- 
a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -5,7 +5,7 @@ from vllm._C import ops if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 98e8f65f0334f..1fcbb36a9a74b 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -5,7 +5,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 0fa2bf12ea8db..1e59513a1ae48 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -22,7 +22,7 @@ gptq_pack, quantize_weights, sort_weights) if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) ACT_ORDER_OPTS = [False, True] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 8b7a2cd173f79..7fad6e53ee754 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -13,7 +13,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 4b841970eb13e..b400dbb88f699 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ from .allclose_default import get_default_atol, get_default_rtol if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) IS_NEOX_STYLE = [True, False] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 4eead346c232d..630cf77cd4b51 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -12,7 +12,7 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) NUM_HEADS = [64] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index 75cc0ffb6a54e..737467e5f6252 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -8,7 +8,7 @@ from vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 95bed09f04d72..cdb3785badad5 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -13,7 +13,7 @@ from 
vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_KERNELS"): - pytest.skip("TEST_KERNELS=0, skipping kernel group", + pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", allow_module_level=True) SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 4b2b50819970d..3ac53524893bc 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -7,7 +7,7 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "baichuan-inc/Baichuan-7B" diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index d7406fa8f7307..9ceb627c5cc8e 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "THUDM/chatglm3-6b" diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 212bfb4548279..16d9c036ace5f 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "google/gemma-7b" diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index deb39aa6fd6c1..ad523417df588 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -13,7 +13,7 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 2d922e82e6731..03f3f0bc206a5 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -39,7 +39,7 @@ from .utils import DummyLoRAManager if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) TOLERANCES = { torch.float16: (5e-3, 5e-3), diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 667f5fdcfdec6..28ebe97f8f694 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -8,7 +8,7 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 4c6b06ec0b25e..b0a74c2fc1fc6 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -15,7 +15,7 @@ from .data.long_context_test_data import prompts_and_responses if 
should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) context_len_to_scaling_factor = { "16k": 4, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index eae74f6e5e37e..b7116508706e6 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -7,7 +7,7 @@ from .utils import DummyLoRAManager if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] QKV_TENSOR_SIZES = [ diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 35acf0b1a4cab..8601cd2c1f2a9 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,7 +5,7 @@ from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index d58d60145d527..084a9bbedf185 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.linear import RowParallelLinear if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 80500f32f41b5..31ca2ac6ee3f0 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 258ff193f0db8..6efdb82003689 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) MODEL_PATH = "microsoft/phi-2" diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 30bf7dcb80c82..23ca3f2c09bbe 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -7,7 +7,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) def assert_close(a, b): diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index a1381e9513ef1..35de893584cbd 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py 
@@ -12,7 +12,7 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) @dataclass diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 396bc95c56db1..20372d5e390bf 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -9,7 +9,7 @@ from ..conftest import get_tokenizer_pool_config if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) @pytest.mark.asyncio diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index c05317b7c5925..c2d602458ebe8 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -8,7 +8,7 @@ from vllm.utils import LRUCache if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) def test_parse_fine_tuned_lora_name(): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index ea82af40cf9c3..e57cbdf0c4e85 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -13,7 +13,7 @@ from vllm.worker.worker import Worker if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=0, skipping kernel group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) @patch.dict(os.environ, {"RANK": "0"}) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 2b04c94690040..709ebd7fe6a23 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -10,7 +10,7 @@ from vllm.sampling_params import SamplingParams if should_skip_test_group(group_name="TEST_METRICS"): - pytest.skip("TEST_METRICS=0, skipping metrics test group", + pytest.skip("TEST_METRICS=DISABLE, skipping metrics test group", allow_module_level=True) MODELS = [ diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 1958186b2e111..5b7c5441e1539 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -10,7 +10,7 @@ download_weights_from_hf, enable_hf_transfer) if should_skip_test_group(group_name="TEST_MODEL_EXECUTOR"): - pytest.skip("TEST_MODEL_EXECUTOR=0, skipping model executor test group", + pytest.skip("TEST_MODEL_EXECUTOR=DISABLE, skipping model executor test group", allow_module_level=True) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 200f00e0b9929..f29c06bf5f445 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index adbb09e9b4d5f..4fe2358befc1c 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if 
should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index cee279f5f5622..599291fb1058c 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -9,7 +9,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 17e417ff9fb27..55c37f95e4bde 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 374af8adda6ba..979fb8af04848 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 9c0e9905fd2ab..ed7834fab16c7 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index b3e378d55c72e..2efabb39d7277 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -11,7 +11,7 @@ from vllm.config import VisionLanguageConfig if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) model_and_vl_config = [ diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 0c46995e532bd..3e2e6fa448edf 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index d56d523688470..91915af2fad48 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -9,7 +9,7 @@ from .utils import check_logprobs_close if 
should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models.py b/tests/models/test_models.py index ac170dfda2beb..485907d8261cf 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) MODELS = [ diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index df449f4400006..a41e7a1154fef 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -8,7 +8,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index c6713831e87e0..fa3f058ed8035 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -7,7 +7,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 0d0dde3b82339..b44e93b9d4fef 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -4,7 +4,7 @@ from vllm.model_executor.models import _MODELS, ModelRegistry if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=0, skipping model test group", + pytest.skip("TEST_MODELS=DISABLE, skipping model test group", allow_module_level=True) diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py index 1cc349084ea05..be776637c87f6 100644 --- a/tests/models_core/test_llm_logprobs.py +++ b/tests/models_core/test_llm_logprobs.py @@ -11,7 +11,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS_CORE"): - pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", allow_module_level=True) MODEL_MAX_LEN = 1024 diff --git a/tests/models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py index 116fc2f815813..5a51efaa2ae08 100644 --- a/tests/models_core/test_magic_wand.py +++ b/tests/models_core/test_magic_wand.py @@ -12,7 +12,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS_CORE"): - pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", allow_module_level=True) MAX_MODEL_LEN = 1024 diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py index 385629f2afb84..1477192c0ced7 100644 --- a/tests/models_core/test_server_logprobs.py +++ b/tests/models_core/test_server_logprobs.py @@ -18,7 +18,7 @@ from tests.nm_utils.utils_skip import 
should_skip_test_group if should_skip_test_group(group_name="TEST_MODELS_CORE"): - pytest.skip("TEST_MODELS_CORE=0, skipping core model test group", + pytest.skip("TEST_MODELS_CORE=DISABLE, skipping core model test group", allow_module_level=True) # Silence warning. diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 181a504f0f1a4..b68422ac4b0da 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -9,7 +9,7 @@ from vllm import LLM if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): - pytest.skip("TEST_PREFIX_CACHING=0, skipping prefix caching test group", + pytest.skip("TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", allow_module_level=True) MODEL_LEN_LEN = [ diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index a290a75d96aa4..a9c296d9d3ca5 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -9,7 +9,7 @@ from vllm.utils import Device if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): - pytest.skip("TEST_PREFIX_CACHING=0, skipping prefix caching test group", + pytest.skip("TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", allow_module_level=True) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 73ac6ca947d49..0a93caf885106 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -11,7 +11,7 @@ CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor) if should_skip_test_group(group_name="TEST_QUANTIZATION"): - pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", allow_module_level=True) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index da02d3d631b46..3b7dcbc5983fc 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -11,7 +11,7 @@ from vllm.config import ModelConfig if should_skip_test_group(group_name="TEST_QUANTIZATION"): - pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", allow_module_level=True) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 8c10768e42142..f731fbe9b58db 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod if should_skip_test_group(group_name="TEST_QUANTIZATION"): - pytest.skip("TEST_QUANTIZATION=0, skipping quantization test group", + pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", allow_module_level=True) capability = torch.cuda.get_device_capability() diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 257a2668c63b1..9afa2510dc3e9 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,7 +10,7 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) # FIXME(zhuohan): The test can not pass if we: diff --git 
a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 74f8e62ea7369..fb26415b9c3cd 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -9,7 +9,7 @@ from vllm import SamplingParams if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index a30567553c1e7..2bf538f24106a 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -5,7 +5,7 @@ from vllm import SamplingParams if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 128326123f855..b157b2b9e656d 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -7,7 +7,7 @@ from ..conftest import VllmRunner if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 8fc88af774b80..90b7139a008ef 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -4,7 +4,7 @@ from vllm import SamplingParams if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 1fec93400269e..f7ce4d1d0c694 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -10,7 +10,7 @@ from vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) CUDA_DEVICES = [ diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 5e65bfdfa7755..03708e173ea33 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -16,7 +16,7 @@ from vllm.utils import Counter, is_pin_memory_available if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index d2aa9f7848c19..786b5ddeae7ab 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -13,7 +13,7 @@ from vllm.model_executor.utils import set_random_seed if should_skip_test_group(group_name="TEST_SAMPLERS"): - pytest.skip("TEST_SAMPLERS=0, skipping sampler group", + pytest.skip("TEST_SAMPLERS=DISABLE, skipping sampler test group", allow_module_level=True) MODEL = "facebook/opt-125m" diff --git a/tests/spec_decode/e2e/test_compatibility.py 
b/tests/spec_decode/e2e/test_compatibility.py index ceb11ff77792a..5600272de9adb 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -6,7 +6,7 @@ from .conftest import get_output_from_llm_generator if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index a58873c9379bf..14d4e3f33eb7e 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -9,7 +9,7 @@ from .conftest import run_greedy_equality_correctness_test if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 4d0f3204dca49..80cfb7eb7b7d9 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -11,7 +11,7 @@ from .conftest import run_greedy_equality_correctness_test if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index ee37622ca3648..881e85c70fc3f 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -9,7 +9,7 @@ from .conftest import get_logprobs_from_llm_generator if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 60eeb18806513..e9814d81d8f8d 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -40,7 +40,7 @@ run_greedy_equality_correctness_test) if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 4f57f00dfdc3a..1dbdc2c82447d 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -31,7 +31,7 @@ from .conftest import run_greedy_equality_correctness_test if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index e6cf14c5a6784..0b9ebe4e63556 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,7 +7,7 @@ from .utils import create_seq_group_metadata_from_prompts, mock_worker if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - 
pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index d9d4dcf1624ab..45e85acbf8e45 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -14,7 +14,7 @@ from .utils import create_batch, mock_worker if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 6c4890fc7232a..d1141d67c38f6 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -8,7 +8,7 @@ from vllm.spec_decode.metrics import AsyncMetricsCollector if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 145d9cf3320d0..ae5a271ae7824 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -16,7 +16,7 @@ patch_execute_model_with_seeds, zero_kv_cache) if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 0578bf1611422..d1e324f882ce6 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -9,7 +9,7 @@ from .utils import create_seq_group_metadata_from_prompts, create_worker if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index dc4b2509bbe5c..4c098246ab1a4 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -19,7 +19,7 @@ from .utils import create_batch, create_sampler_output_list, mock_worker if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 82e1ee6908894..bdc72346ab011 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -7,7 +7,7 @@ from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len if should_skip_test_group(group_name="TEST_SPEC_DECODE"): - pytest.skip("TEST_SPEC_DECODE=0, skipping spec decode group", + pytest.skip("TEST_SPEC_DECODE=DISABLE, skipping spec decode group", allow_module_level=True) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 8c87c624fdd6e..c32dad4fd2c4c 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ 
-24,7 +24,7 @@ if should_skip_test_group(group_name="TEST_TENSORIZER_LOADER"): - pytest.skip("TEST_TENSORIZER=0, skipping tensorizer group", + pytest.skip("TEST_TENSORIZER=DISABLE, skipping tensorizer group", allow_module_level=True) prompts = [ diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index 549a8ea8ebe36..dbd17b88282a0 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -7,7 +7,7 @@ from vllm.transformers_utils.tokenizer import get_cached_tokenizer if should_skip_test_group(group_name="TEST_TOKENIZATION"): - pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group", allow_module_level=True) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 67d8abb31513b..a48cfe6fed01f 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer_group import get_tokenizer_group if should_skip_test_group(group_name="TEST_TOKENIZATION"): - pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group", allow_module_level=True) TRUTH = [ diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index 3273e2fa55e53..119fbd2d02e4f 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -5,7 +5,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer if should_skip_test_group(group_name="TEST_TOKENIZATION"): - pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group", allow_module_level=True) TOKENIZER_NAMES = [ diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 801fb86644030..e18ee99494f65 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -15,7 +15,7 @@ from ..conftest import get_tokenizer_pool_config if should_skip_test_group(group_name="TEST_TOKENIZATION"): - pytest.skip("TEST_TOKENIZATION=0, skipping tokenization test group", + pytest.skip("TEST_TOKENIZATION=DISABLE, skipping tokenization test group", allow_module_level=True) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index bec7b9f91c74c..e40de0dc027d8 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -10,7 +10,7 @@ from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size if should_skip_test_group(group_name="TEST_WORKER"): - pytest.skip("TEST_WORKER=0, skipping worker test group", + pytest.skip("TEST_WORKER=DISABLE, skipping worker test group", allow_module_level=True) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index a822372913e0a..cd3807a133cd4 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -8,7 +8,7 @@ from vllm.worker.worker import Worker if should_skip_test_group(group_name="TEST_WORKER"): - pytest.skip("TEST_WORKER=0, skipping worker test group", + pytest.skip("TEST_WORKER=DISABLE, skipping worker test group", allow_module_level=True) From e95ad95a0524e758c8a76ac2b508c51ad515d7de Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:44:49 +0000 Subject: [PATCH 146/154] updated missed core files --- 
tests/core/block/test_block_manager_v2.py | 2 +- tests/core/block/test_block_table.py | 2 +- tests/core/block/test_common.py | 2 +- tests/core/block/test_cpu_gpu_block_allocator.py | 2 +- tests/core/block/test_naive_block.py | 2 +- tests/core/block/test_prefix_caching_block.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 11a74d902676a..4d5cb4f6c0de9 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -11,7 +11,7 @@ from ..utils import create_seq_group, create_seq_group_encoder_decoder if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index a68fafabda16f..2a1c9945b93dc 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -6,7 +6,7 @@ from vllm.utils import Device, cdiv, chunk_list if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index 71dfffbe2e350..957465a2b56dc 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -6,7 +6,7 @@ from vllm.core.block.common import RefCounter if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index 000d4fa4eab33..a70310906e2f1 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -5,7 +5,7 @@ from vllm.utils import Device, chunk_list if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 821c8f67c1eff..4e619ee433f85 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -7,7 +7,7 @@ from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 5b8425fe32ab4..c300345dd7da6 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -11,7 +11,7 @@ PrefixCachingBlockAllocator) if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) From fe0be9ee1b6b9f064c313d10fd6ac6eab4f055e6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:45:22 +0000 Subject: [PATCH 147/154] updated test core --- tests/core/block/e2e/test_correctness.py | 2 +- 
tests/core/block/e2e/test_correctness_sliding_window.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 469f8ec90bcd1..a4777a489ed78 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -8,7 +8,7 @@ from .conftest import get_token_ids_from_llm_generator if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 0f17d8a9e7b3d..37e9af1116c9d 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -9,7 +9,7 @@ from .conftest import get_text_from_llm_generator if should_skip_test_group(group_name="TEST_CORE"): - pytest.skip("TEST_CORE=0, skipping core test group", + pytest.skip("TEST_CORE=DISABLE, skipping core test group", allow_module_level=True) # relatively small model with 4k sliding window From 4fabe985e8507bbf05be3a1651e46719649e6c17 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:46:03 +0000 Subject: [PATCH 148/154] format --- tests/lora/test_baichuan.py | 3 ++- tests/lora/test_chatglm3.py | 3 ++- tests/lora/test_gemma.py | 3 ++- tests/lora/test_layer_variation.py | 3 ++- tests/lora/test_layers.py | 3 ++- tests/lora/test_llama.py | 3 ++- tests/lora/test_long_context.py | 3 ++- tests/lora/test_lora.py | 3 ++- tests/lora/test_lora_checkpoints.py | 3 ++- tests/lora/test_lora_manager.py | 3 ++- tests/lora/test_mixtral.py | 3 ++- tests/lora/test_phi.py | 3 ++- tests/lora/test_punica.py | 3 ++- tests/lora/test_quant_model.py | 3 ++- tests/lora/test_tokenizer_group.py | 3 ++- tests/lora/test_utils.py | 3 ++- tests/lora/test_worker.py | 3 ++- tests/model_executor/weight_utils.py | 5 +++-- tests/prefix_caching/test_disable_sliding_window.py | 5 +++-- tests/prefix_caching/test_prefix_caching.py | 5 +++-- 20 files changed, 43 insertions(+), 23 deletions(-) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 3ac53524893bc..825f26ad28892 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -7,7 +7,8 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "baichuan-inc/Baichuan-7B" diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index 9ceb627c5cc8e..9cee24c90f972 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -5,7 +5,8 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "THUDM/chatglm3-6b" diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 16d9c036ace5f..0c31726dc0fd0 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -5,7 +5,8 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", 
allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "google/gemma-7b" diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index ad523417df588..712f822d9bed9 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -13,7 +13,8 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 03f3f0bc206a5..1dc2f28e0c372 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -39,7 +39,8 @@ from .utils import DummyLoRAManager if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) TOLERANCES = { torch.float16: (5e-3, 5e-3), diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 28ebe97f8f694..a14368a9bbf02 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -8,7 +8,8 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index b0a74c2fc1fc6..de9ea550dc235 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -15,7 +15,8 @@ from .data.long_context_test_data import prompts_and_responses if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) context_len_to_scaling_factor = { "16k": 4, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index b7116508706e6..34c6941140754 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -7,7 +7,8 @@ from .utils import DummyLoRAManager if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] QKV_TENSOR_SIZES = [ diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 8601cd2c1f2a9..9c9a0fea5cb6c 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,7 +5,8 @@ from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 084a9bbedf185..09a27c90f4768 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -20,7 +20,8 @@ from 
vllm.model_executor.layers.linear import RowParallelLinear if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 31ca2ac6ee3f0..f7541f271fd98 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -6,7 +6,8 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 6efdb82003689..13636b9be5140 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -5,7 +5,8 @@ from vllm.lora.request import LoRARequest if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) MODEL_PATH = "microsoft/phi-2" diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 23ca3f2c09bbe..29b4f9c411e1d 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -7,7 +7,8 @@ from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) def assert_close(a, b): diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 35de893584cbd..278acd2dcdb89 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -12,7 +12,8 @@ from .conftest import cleanup if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) @dataclass diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 20372d5e390bf..ce72a63016732 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -9,7 +9,8 @@ from ..conftest import get_tokenizer_pool_config if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) @pytest.mark.asyncio diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index c2d602458ebe8..a89c6251dfb59 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -8,7 +8,8 @@ from vllm.utils import LRUCache if should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) def test_parse_fine_tuned_lora_name(): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index e57cbdf0c4e85..8c45e15b50d0c 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -13,7 +13,8 @@ from vllm.worker.worker import Worker if 
should_skip_test_group(group_name="TEST_LORA"): - pytest.skip("TEST_LORA=DISABLE, skipping lora test group", allow_module_level=True) + pytest.skip("TEST_LORA=DISABLE, skipping lora test group", + allow_module_level=True) @patch.dict(os.environ, {"RANK": "0"}) diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 5b7c5441e1539..4bd0afdc8ca68 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -10,8 +10,9 @@ download_weights_from_hf, enable_hf_transfer) if should_skip_test_group(group_name="TEST_MODEL_EXECUTOR"): - pytest.skip("TEST_MODEL_EXECUTOR=DISABLE, skipping model executor test group", - allow_module_level=True) + pytest.skip( + "TEST_MODEL_EXECUTOR=DISABLE, skipping model executor test group", + allow_module_level=True) def test_hf_transfer_auto_activation(): diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index b68422ac4b0da..1e2dc9197b403 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -9,8 +9,9 @@ from vllm import LLM if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): - pytest.skip("TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", - allow_module_level=True) + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) MODEL_LEN_LEN = [ # Example models with sliding window. diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index a9c296d9d3ca5..7c3be3a1367b2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -9,8 +9,9 @@ from vllm.utils import Device if should_skip_test_group(group_name="TEST_PREFIX_CACHING"): - pytest.skip("TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", - allow_module_level=True) + pytest.skip( + "TEST_PREFIX_CACHING=DISABLE, skipping prefix caching test group", + allow_module_level=True) @pytest.mark.parametrize("block_size", [16]) From 14dedf1b21d135b5d9e9a47f0afddbcdf43d9f63 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:58:00 +0000 Subject: [PATCH 149/154] fix bad merge llm_generate --- tests/entrypoints/test_llm_generate.py | 114 ------------------------- 1 file changed, 114 deletions(-) diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index cf38972eb0e30..96b47fb5e170b 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -1,9 +1,6 @@ import weakref from typing import List -import weakref -from typing import List - import pytest from tests.nm_utils.utils_skip import should_skip_test_group @@ -34,109 +31,6 @@ pytestmark = pytest.mark.llm -@pytest.fixture(scope="module") -def llm(): - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) - - with llm.deprecate_legacy_api(): - yield weakref.proxy(llm) - - del llm - - cleanup() - - -def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): - assert [o.outputs for o in o1] == [o.outputs for o in o2] - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt', PROMPTS) -def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): - sampling_params = 
SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.generate(prompts=prompt, - sampling_params=sampling_params) - - v2_output = llm.generate(prompt, sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.generate({"prompt": prompt}, - sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) -def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, - prompt_token_ids): - sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.generate(prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params) - - v2_output = llm.generate({"prompt_token_ids": prompt_token_ids}, - sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - -@pytest.mark.skip_global_cleanup -def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): - sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.generate(prompts=PROMPTS, - sampling_params=sampling_params) - - v2_output = llm.generate(PROMPTS, sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.generate( - [{ - "prompt": p - } for p in PROMPTS], - sampling_params=sampling_params, - ) - assert_outputs_equal(v1_output, v2_output) - - -@pytest.mark.skip_global_cleanup -def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): - sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.generate(prompt_token_ids=TOKEN_IDS, - sampling_params=sampling_params) - - v2_output = llm.generate( - [{ - "prompt_token_ids": p - } for p in TOKEN_IDS], - sampling_params=sampling_params, - ) - assert_outputs_equal(v1_output, v2_output) - - -@pytest.mark.skip_global_cleanup -def test_multiple_sampling_params(llm: LLM): -TOKEN_IDS = [ - [0], - [0, 1], - [0, 2, 1], - [0, 3, 1, 2], -] - -pytestmark = pytest.mark.llm - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to @@ -240,24 +134,16 @@ def test_multiple_sampling_params(llm: LLM): # Multiple SamplingParams should be matched with each prompt outputs = llm.generate(PROMPTS, sampling_params=sampling_params) assert len(PROMPTS) == len(outputs) - outputs = llm.generate(PROMPTS, sampling_params=sampling_params) - assert len(PROMPTS) == len(outputs) # Exception raised, if the size of params does not match the size of prompts with pytest.raises(ValueError): outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3]) - outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3]) # Single SamplingParams should be applied to every prompt single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params) assert len(PROMPTS) == len(outputs) - outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params) - assert len(PROMPTS) == len(outputs) # sampling_params is None, default params should be applied outputs = llm.generate(PROMPTS, sampling_params=None) assert len(PROMPTS) == len(outputs) - - outputs = llm.generate(PROMPTS, sampling_params=None) - assert len(PROMPTS) == len(outputs) From 
4b078bdf5338fddad2ce516918057a39fb75e4ba Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 00:58:45 +0000 Subject: [PATCH 150/154] fix bad merge oot_registration --- tests/entrypoints/test_server_oot_registration.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index e221c57a6c86d..8c445689d4f4c 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -36,8 +36,6 @@ def server_function(port): sys.argv = ["placeholder.py"] + \ ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " f"--dtype float32 --api-key token-abc123 --port {port}").split() - ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " - f"--dtype float32 --api-key token-abc123 --port {port}").split() import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') @@ -46,8 +44,6 @@ def test_oot_registration_for_api_server(): port = get_open_port() ctx = torch.multiprocessing.get_context() server = ctx.Process(target=server_function, args=(port, )) - ctx = torch.multiprocessing.get_context() - server = ctx.Process(target=server_function, args=(port, )) server.start() client = OpenAI( base_url=f"http://localhost:{port}/v1", From 05c57023b0d8777f5331a3b32fe2b85133f43348 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 01:03:43 +0000 Subject: [PATCH 151/154] duplicate mark --- tests/entrypoints/openai/test_serving_chat.py | 2 -- tests/entrypoints/test_server_oot_registration.py | 1 - tests/kernels/test_rand.py | 4 ---- tests/spec_decode/test_dynamic_spec_decode.py | 1 - 4 files changed, 8 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index dbb13c355b8e9..e3168f67e001f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -15,8 +15,6 @@ pytestmark = pytest.mark.openai -pytestmark = pytest.mark.openai - @dataclass class MockModelConfig: diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 8c445689d4f4c..394594fcbf085 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -1,7 +1,6 @@ import sys import time -import pytest import pytest import torch from openai import OpenAI, OpenAIError diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index 2bf10651c0aa0..737467e5f6252 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -12,10 +12,6 @@ allow_module_level=True) -@pytest.mark.skip("C compiler not installed in NM automation. " - "This codepath follows a triton pathway, which " - "JITs using clang or gcc. Since neither are installed " - "in our test instances, we need to skip this for now.") @pytest.mark.skip("C compiler not installed in NM automation. " "This codepath follows a triton pathway, which " "JITs using clang or gcc. 
Since neither are installed " diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index dddcd0010ccad..ea8d7493c5f1c 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,5 +1,4 @@ from unittest.mock import MagicMock, patch -from unittest.mock import MagicMock, patch import pytest import torch From 62f62837562e3e66cb08cee69f3567dd4b8863dc Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 11 Jun 2024 01:43:23 +0000 Subject: [PATCH 152/154] yapf on models core --- tests/models_core/test_magic_wand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py index 5a51efaa2ae08..a24618ec09a4a 100644 --- a/tests/models_core/test_magic_wand.py +++ b/tests/models_core/test_magic_wand.py @@ -67,7 +67,7 @@ def test_magic_wand( f"Test{model_name}: Sparse model KV cache size {sparse_num_kv_blocks} " f"not bigger than dense model KV cache size {dense_num_kv_blocks} + " f"expected num_extra_blocks {num_extra_blocks}") - + # Confirm the generations are similar. check_logprobs_close( outputs_0_lst=dense_outputs, From 9b2d02f691e38ef504bd9af1d861765c5e49fd2d Mon Sep 17 00:00:00 2001 From: Domenic Barbuzzi Date: Thu, 13 Jun 2024 18:49:43 +0000 Subject: [PATCH 153/154] Replace '0' with 'ENABLE' --- tests/nm_utils/utils_skip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nm_utils/utils_skip.py b/tests/nm_utils/utils_skip.py index f1a9bd7d9e4f1..cca1b85d87049 100644 --- a/tests/nm_utils/utils_skip.py +++ b/tests/nm_utils/utils_skip.py @@ -1,6 +1,6 @@ """Checks environment variables to skip various test groups. The functions here are imported by each test file. -The .github/actions/nm-test-skipping-env-setup sets these +The .github/actions/nm-test-skipping-env-setup sets these variables in the testing automation. 
""" @@ -73,7 +73,7 @@ def should_skip_models_core_test_group(): def should_skip_prefix_caching_test_group(): - TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "0") + TEST_PREFIX_CACHING = os.getenv("TEST_PREFIX_CACHING", "ENABLE") return TEST_PREFIX_CACHING == "DISABLE" From ef38251751c1219a5285796521a2f754096db64d Mon Sep 17 00:00:00 2001 From: Domenic Barbuzzi Date: Thu, 13 Jun 2024 19:06:44 +0000 Subject: [PATCH 154/154] Small fixes from conflict resolution --- tests/kernels/test_int8_quant.py | 2 +- tests/models/test_llava.py | 36 ------------------- tests/quantization/test_compressed_tensors.py | 1 - 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 3142ffb6625dc..679b21fc74606 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -1,9 +1,9 @@ import pytest import torch -from tests.nm_utils.utils_skip import should_skip_test_group # ruff: noqa: F401 import vllm._C +from tests.nm_utils.utils_skip import should_skip_test_group if should_skip_test_group(group_name="TEST_KERNELS"): pytest.skip("TEST_KERNELS=DISABLE, skipping kernels test group", diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index e5443c263a71e..9e288b8d854c0 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -42,42 +42,6 @@ def iter_llava_configs(model_name: str): image_processor_revision=None)) -from ..conftest import IMAGE_FILES - -pytestmark = pytest.mark.llava - -# The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - "\nUSER: What's the content of the image?\nASSISTANT:", - "\nUSER: What is the season?\nASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) - - -def iter_llava_configs(model_name: str): - image_hw_to_feature_size = { - (336, 336): 576, - } - - for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield (model_name, - VisionLanguageConfig(image_input_type=input_type, - image_feature_size=f, - image_token_id=32000, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None)) - - -if should_skip_test_group(group_name="TEST_MODELS"): - pytest.skip("TEST_MODELS=DISABLE, skipping model test group", - allow_module_level=True) - model_and_vl_config = [ *iter_llava_configs("llava-hf/llava-1.5-7b-hf"), ] diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 3607246ac8a94..510175146910d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -12,7 +12,6 @@ CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor) - if should_skip_test_group(group_name="TEST_QUANTIZATION"): pytest.skip("TEST_QUANTIZATION=DISABLE, skipping quantization test group", allow_module_level=True)