From 1fe3a5a01123dcfea8a7981fc5aea212d42c1299 Mon Sep 17 00:00:00 2001 From: YangXiuyu Date: Thu, 8 Dec 2022 16:27:01 +0800 Subject: [PATCH] feat: add fp16 inference support (torch/onnx) (#871) * feat: add fp16 inference in clip_torch * Revert "feat: add fp16 inference in clip_torch" This reverts commit 326e2655f423a67a5d6ea5c28eed8d81da037df0. * feat: add fp16 inference in clip_torch * fix: device * fix: str to torch.dtype * fix: layernorm * feat: add fp16 inference in clip_trt * feat: add fp16 inference in clip_onnx * fix: housekeeping * fix: ci * fix: ci * fix: ci * fix: ci and get test path * fix: dtype amp and gpu test dependency * fix: layernorm * fix: cast dtype in visiontransformer * fix: clip_onnx * fix: clip_onnx * fix: convert onnx to fp16 * fix: dtype in preproc images * fix: dtype in preproc images * fix: typo * fix: dtype in clip_torch and fp16 in trt * fix: remove plain text in trt_test * fix: test * fix: typo * fix: stash * Revert "fix: stash" This reverts commit f72fd99f4333443e998edbf4be352fe81557f611. * fix: for test * fix: onnx * fix: for test * fix: for test * fix: trt * fix: convert onnx to fp16 before convert trt * fix: discard changes in trt * fix: optimize fp16 test * fix: move __cast_dtype__ * Revert "fix: move __cast_dtype__" This reverts commit edf46292f144aee1150cd1f16565a2236a5ce5fe. * fix: ci --- .github/workflows/ci.yml | 6 ++++-- server/clip_server/executors/clip_onnx.py | 20 ++++++++++-------- server/clip_server/executors/clip_torch.py | 24 +++++++++++++++++----- server/clip_server/executors/helper.py | 9 ++++++-- server/clip_server/helper.py | 4 ++++ server/clip_server/model/clip_onnx.py | 23 +++++++++++++++++++-- server/clip_server/model/model.py | 14 +++++++++++-- server/setup.py | 1 + tests/test_simple.py | 2 ++ 9 files changed, 82 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cea783a0..7a64bfa6c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -113,12 +113,11 @@ jobs: pip install --no-cache-dir "server/[onnx]" pip install --no-cache-dir "server/[transformers]" pip install --no-cache-dir "server/[search]" - pip install --no-cache-dir "server/[transformers]" - name: Test id: test run: | pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \ - -v -s -m "not gpu" ${{ matrix.test-path }} + -v -s ${{ matrix.test-path }} echo "::set-output name=codecov_flag::cas" timeout-minutes: 30 - name: Check codecov file @@ -158,6 +157,7 @@ jobs: python -m pip install wheel pytest pytest-cov nvidia-pyindex pip install -e "client/[test]" pip install -e "server/[tensorrt]" + pip install -e "server/[onnx]" { pip install -e "server/[flash-attn]" } || { @@ -168,6 +168,8 @@ jobs: run: | pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \ -v -s -m "gpu" ./tests/test_tensorrt.py + pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \ + -v -s -m "gpu" ./tests/test_simple.py echo "::set-output name=codecov_flag::cas" timeout-minutes: 30 env: diff --git a/server/clip_server/executors/clip_onnx.py b/server/clip_server/executors/clip_onnx.py index 3262f0a8d..bb589f912 100644 --- a/server/clip_server/executors/clip_onnx.py +++ b/server/clip_server/executors/clip_onnx.py @@ -27,6 +27,7 @@ def __init__( minibatch_size: int = 32, access_paths: str = '@r', model_path: Optional[str] = None, + dtype: Optional[str] = None, **kwargs, ): """ @@ -41,8 +42,17 @@ def __init__( :param model_path: The path to the model to be used. If not specified, the model will be downloaded or loaded from the local cache. Visit https://clip-as-service.jina.ai/user-guides/server/#use-custom-model-for-onnx to learn how to finetune custom models. + :param dtype: inference data type, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. """ super().__init__(**kwargs) + import torch + + if not device: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self._device = device + if not dtype: + dtype = 'fp32' if self._device in ('cpu', torch.device('cpu')) else 'fp16' + self._dtype = dtype self._minibatch_size = minibatch_size self._access_paths = access_paths @@ -55,18 +65,11 @@ def __init__( self._num_worker_preprocess = num_worker_preprocess self._pool = ThreadPool(processes=num_worker_preprocess) - self._model = CLIPOnnxModel(name, model_path) + self._model = CLIPOnnxModel(name, model_path, dtype) self._tokenizer = Tokenizer(name) self._image_transform = clip._transform_ndarray(self._model.image_size) - import torch - - if not device: - self._device = 'cuda' if torch.cuda.is_available() else 'cpu' - else: - self._device = device - # define the priority order for the execution providers providers = ['CPUExecutionProvider'] @@ -116,6 +119,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool): preprocess_fn=self._image_transform, return_np=True, drop_image_content=drop_image_content, + dtype=self._dtype, ) def _preproc_texts(self, docs: 'DocumentArray'): diff --git a/server/clip_server/executors/clip_torch.py b/server/clip_server/executors/clip_torch.py index 8b740d526..7ed895eb2 100644 --- a/server/clip_server/executors/clip_torch.py +++ b/server/clip_server/executors/clip_torch.py @@ -2,7 +2,7 @@ import warnings from functools import partial from multiprocessing.pool import ThreadPool -from typing import Dict, Optional +from typing import Dict, Union, Optional import numpy as np import torch @@ -12,6 +12,7 @@ set_rank, split_img_txt_da, ) +from clip_server.helper import __cast_dtype__ from clip_server.model import clip from clip_server.model.clip_model import CLIPModel from clip_server.model.tokenization import Tokenizer @@ -28,6 +29,7 @@ def __init__( num_worker_preprocess: int = 4, minibatch_size: int = 32, access_paths: str = '@r', + dtype: Optional[Union[str, torch.dtype]] = None, **kwargs, ): """ @@ -40,6 +42,7 @@ def __init__( number if you encounter OOM errors. :param access_paths: The access paths to traverse on the input documents to get the images and texts to be processed. Visit https://docarray.jina.ai/fundamentals/documentarray/access-elements for more details. + :param dtype: inference data type, if None defaults to torch.float32 if device == 'cpu' else torch.float16. """ super().__init__(**kwargs) @@ -52,9 +55,17 @@ def __init__( self._access_paths = kwargs['traversal_paths'] if not device: - self._device = 'cuda' if torch.cuda.is_available() else 'cpu' - else: - self._device = device + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self._device = device + if isinstance(dtype, str): + dtype = __cast_dtype__.get(dtype) + elif not dtype: + dtype = ( + torch.float32 + if self._device in ('cpu', torch.device('cpu')) + else torch.float16 + ) + self._dtype = dtype if not self._device.startswith('cuda') and ( 'OMP_NUM_THREADS' not in os.environ @@ -77,7 +88,9 @@ def __init__( self._num_worker_preprocess = num_worker_preprocess self._pool = ThreadPool(processes=num_worker_preprocess) - self._model = CLIPModel(name, device=self._device, jit=jit, **kwargs) + self._model = CLIPModel( + name, device=self._device, jit=jit, dtype=dtype, **kwargs + ) self._tokenizer = Tokenizer(name) self._image_transform = clip._transform_ndarray(self._model.image_size) @@ -96,6 +109,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool): device=self._device, return_np=False, drop_image_content=drop_image_content, + dtype=self._dtype, ) def _preproc_texts(self, docs: 'DocumentArray'): diff --git a/server/clip_server/executors/helper.py b/server/clip_server/executors/helper.py index c7ac6a555..594cd88a5 100644 --- a/server/clip_server/executors/helper.py +++ b/server/clip_server/executors/helper.py @@ -1,8 +1,9 @@ -from typing import Tuple, List, Callable, Any, Dict +from typing import Tuple, List, Callable, Any, Dict, Union import torch import numpy as np from docarray import Document, DocumentArray from docarray.math.distance.numpy import cosine +from clip_server.helper import __cast_dtype__ from clip_server.model.tokenization import Tokenizer @@ -22,8 +23,12 @@ def preproc_image( device: str = 'cpu', return_np: bool = False, drop_image_content: bool = False, + dtype: Union[str, torch.dtype] = torch.float32, ) -> Tuple['DocumentArray', Dict]: + if isinstance(dtype, str): + dtype = __cast_dtype__.get(dtype) + tensors_batch = [] for d in da: @@ -42,7 +47,7 @@ def preproc_image( if drop_image_content: d.pop('blob', 'tensor') - tensors_batch = torch.stack(tensors_batch).type(torch.float32) + tensors_batch = torch.stack(tensors_batch).type(dtype) if return_np: tensors_batch = tensors_batch.cpu().numpy() diff --git a/server/clip_server/helper.py b/server/clip_server/helper.py index 16983c1de..d5108ffc0 100644 --- a/server/clip_server/helper.py +++ b/server/clip_server/helper.py @@ -2,6 +2,7 @@ import os import sys import threading +import torch from packaging.version import Version from urllib.request import Request, urlopen @@ -19,6 +20,9 @@ ) +__cast_dtype__ = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16} + + def _version_check(package: str = None, github_repo: str = None): try: diff --git a/server/clip_server/model/clip_onnx.py b/server/clip_server/model/clip_onnx.py index d93f258d1..2d61307ad 100644 --- a/server/clip_server/model/clip_onnx.py +++ b/server/clip_server/model/clip_onnx.py @@ -1,5 +1,5 @@ import os -from typing import Dict +from typing import Dict, Optional from clip_server.model.pretrained_models import ( download_model, @@ -201,8 +201,11 @@ class CLIPOnnxModel(BaseCLIPModel): - def __init__(self, name: str, model_path: str = None): + def __init__( + self, name: str, model_path: str = None, dtype: Optional[str] = 'fp32' + ): super().__init__(name) + self._dtype = dtype if name in _MODELS: if not model_path: cache_dir = os.path.expanduser( @@ -237,6 +240,22 @@ def __init__(self, name: str, model_path: str = None): f'The given model path {model_path} should be a folder containing both ' f'`textual.onnx` and `visual.onnx`.' ) + if dtype == 'fp16': + import onnx + from onnxmltools.utils import float16_converter + + _textual_model_fp16 = ( + float16_converter.convert_float_to_float16_model_path( + self._textual_path + ) + ) + _visual_model_fp16 = ( + float16_converter.convert_float_to_float16_model_path( + self._visual_path + ) + ) + onnx.save_model(_textual_model_fp16, self._textual_path) + onnx.save_model(_visual_model_fp16, self._visual_path) else: raise RuntimeError( 'CLIP model {} not found or not supports ONNX backend; below is a list of all available models:\n{}'.format( diff --git a/server/clip_server/model/model.py b/server/clip_server/model/model.py index 12de55e5b..28adc7296 100644 --- a/server/clip_server/model/model.py +++ b/server/clip_server/model/model.py @@ -15,6 +15,7 @@ from dataclasses import dataclass from typing import Tuple, Union, Optional from copy import deepcopy +from clip_server.helper import __cast_dtype__ from open_clip.transformer import QuickGELU, LayerNorm, LayerNormFp32, Attention from open_clip.timm_model import TimmModel from open_clip.factory import _MODEL_CONFIGS @@ -81,6 +82,11 @@ def __init__( super().__init__(image_size, patch_size, output_dim=output_dim, **kwargs) self.transformer = Transformer(dtype=dtype, **kwargs) + def forward(self, x: torch.Tensor): + dtype = self.transformer.get_cast_dtype() + x = x.to(dtype) + return super().forward(x) + class TextTransformer(_TextTransformer): def __init__( @@ -435,7 +441,9 @@ def load_openai_model( preprocess : Callable[[PIL.Image], torch.Tensor] A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input """ - if dtype is None: + if isinstance(dtype, str): + dtype = __cast_dtype__.get(dtype, 'amp') + elif dtype is None: dtype = ( torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16 ) @@ -550,7 +558,9 @@ def load_openclip_model( pretrained_image: bool = False, dtype: Optional[Union[str, torch.dtype]] = None, ): - if dtype is None: + if isinstance(dtype, str): + dtype = __cast_dtype__.get(dtype) + elif dtype is None: dtype = ( torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16 ) diff --git a/server/setup.py b/server/setup.py index 901709dc4..5d9e9511e 100644 --- a/server/setup.py +++ b/server/setup.py @@ -53,6 +53,7 @@ 'onnx': [ 'onnxruntime', 'onnx', + 'onnxmltools', ] + (['onnxruntime-gpu>=1.8.0'] if sys.platform != 'darwin' else []), 'tensorrt': ['nvidia-tensorrt'], diff --git a/tests/test_simple.py b/tests/test_simple.py index dcc8ef2c6..8611395f4 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -27,6 +27,7 @@ def test_protocols(port_generator, protocol, jit, pytestconfig): c.profile(content=f'{pytestconfig.rootdir}/tests/img/00000.jpg') +@pytest.mark.gpu @pytest.mark.parametrize( 'inputs', [ @@ -48,6 +49,7 @@ def test_plain_inputs(make_flow, inputs): ) +@pytest.mark.gpu @pytest.mark.parametrize( 'inputs', [