From 44869b4c62ba8737975e64a6b1ecb94ae08734dd Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 8 May 2024 16:02:18 +0200 Subject: [PATCH] update --- onnxtr/contrib/base.py | 2 +- .../models/classification/models/mobilenet.py | 2 +- onnxtr/models/classification/zoo.py | 19 +--- .../models/differentiable_binarization.py | 9 +- onnxtr/models/detection/postprocessor/base.py | 4 +- onnxtr/models/engine.py | 3 +- onnxtr/models/preprocessor/base.py | 1 - onnxtr/transforms/base.py | 106 ++++++++---------- 8 files changed, 53 insertions(+), 93 deletions(-) diff --git a/onnxtr/contrib/base.py b/onnxtr/contrib/base.py index f974294..08eb449 100644 --- a/onnxtr/contrib/base.py +++ b/onnxtr/contrib/base.py @@ -101,5 +101,5 @@ def __call__(self, inputs: List[np.ndarray]) -> Any: np.array([self.preprocess(img) for img in batch], dtype=np.float32) for batch in batched_inputs ] - outputs = [self.run(None, {model_inputs[0].name: batch}) for batch in processed_batches] + outputs = [self.session.run(None, {model_inputs[0].name: batch}) for batch in processed_batches] return self.postprocess(outputs, batched_inputs) diff --git a/onnxtr/models/classification/models/mobilenet.py b/onnxtr/models/classification/models/mobilenet.py index e979042..02f59c4 100644 --- a/onnxtr/models/classification/models/mobilenet.py +++ b/onnxtr/models/classification/models/mobilenet.py @@ -23,7 +23,7 @@ "std": (0.299, 0.296, 0.301), "input_shape": (3, 256, 256), "classes": [0, -90, 180, 90], - "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_crop_orientation-f0847a18.pt&src=0", + "url": "https://github.com/felixdittrich92/OnnxTR/releases/download/v0.0.1/mobilenet_v3_small_crop_orientation-5620cf7e.onnx", }, "mobilenet_v3_small_page_orientation": { "mean": (0.694, 0.695, 0.693), diff --git a/onnxtr/models/classification/zoo.py b/onnxtr/models/classification/zoo.py index 35b077f..e83fd3d 100644 --- a/onnxtr/models/classification/zoo.py +++ b/onnxtr/models/classification/zoo.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -from typing import Any, Dict, List +from typing import Any, List from .. import classification from ..preprocessor import PreProcessor @@ -13,23 +13,6 @@ ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_crop_orientation", "mobilenet_v3_small_page_orientation"] -default_cfgs: Dict[str, Dict[str, Any]] = { - "mobilenet_v3_small_crop_orientation": { - "mean": (0.694, 0.695, 0.693), - "std": (0.299, 0.296, 0.301), - "input_shape": (3, 256, 256), - "classes": [0, -90, 180, 90], - "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_crop_orientation-f0847a18.pt&src=0", # TODO - }, - "mobilenet_v3_small_page_orientation": { - "mean": (0.694, 0.695, 0.693), - "std": (0.299, 0.296, 0.301), - "input_shape": (3, 512, 512), - "classes": [0, -90, 180, 90], - "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_page_orientation-8e60325c.pt&src=0", # TODO - }, -} - def _orientation_predictor(arch: str, **kwargs: Any) -> OrientationPredictor: if arch not in ORIENTATION_ARCHS: diff --git a/onnxtr/models/detection/models/differentiable_binarization.py b/onnxtr/models/detection/models/differentiable_binarization.py index c98063d..5597729 100644 --- a/onnxtr/models/detection/models/differentiable_binarization.py +++ b/onnxtr/models/detection/models/differentiable_binarization.py @@ -51,7 +51,7 @@ class DBNet(Engine): def __init__( self, model_path, - bin_thresh: float = 0.1, + bin_thresh: float = 0.3, box_thresh: float = 0.1, assume_straight_pages: bool = True, cfg: Optional[Dict[str, Any]] = None, @@ -59,7 +59,6 @@ def __init__( super().__init__(url=model_path) self.cfg = cfg self.assume_straight_pages = assume_straight_pages - print(f"ASSUME STRAIGHT PAGES: {assume_straight_pages}") self.postprocessor = GeneralDetectionPostProcessor( assume_straight_pages=self.assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) @@ -78,12 +77,6 @@ def __call__( if return_model_output: out["out_map"] = prob_map - print(prob_map.shape) - import cv2 - - cv2.imwrite("prob_map.jpg", prob_map[0, 0, :, :] * 255) - print(prob_map) - out["preds"] = [ dict(zip(["words"], preds)) for preds in self.postprocessor(np.transpose(prob_map, (0, 2, 3, 1))) ] diff --git a/onnxtr/models/detection/postprocessor/base.py b/onnxtr/models/detection/postprocessor/base.py index ac6831f..ab6dd4a 100644 --- a/onnxtr/models/detection/postprocessor/base.py +++ b/onnxtr/models/detection/postprocessor/base.py @@ -29,12 +29,12 @@ class GeneralDetectionPostProcessor(DetectionPostProcessor): def __init__( self, - bin_thresh: float = 0.3, + bin_thresh: float = 0.1, box_thresh: float = 0.1, assume_straight_pages: bool = True, ) -> None: super().__init__(box_thresh, bin_thresh, assume_straight_pages) - self.unclip_ratio = 1.0 + self.unclip_ratio = 1.5 def polygon_to_box( self, diff --git a/onnxtr/models/engine.py b/onnxtr/models/engine.py index f1e33f8..a9a6a41 100644 --- a/onnxtr/models/engine.py +++ b/onnxtr/models/engine.py @@ -27,7 +27,6 @@ def __init__(self, url: str, **kwargs: Any) -> None: ) def run(self, inputs: np.ndarray) -> List[np.ndarray]: - # inputs = np.transpose(inputs, (0, 3, 1, 2)).astype(np.float32) # TODO: Can we remove this maybe ? + inputs = np.transpose(inputs, (0, 3, 1, 2)).astype(np.float32) logits = self.runtime.run(["logits"], {"input": inputs})[0] - print(f"logits: {logits.shape}") return logits diff --git a/onnxtr/models/preprocessor/base.py b/onnxtr/models/preprocessor/base.py index 9ca2029..b332561 100644 --- a/onnxtr/models/preprocessor/base.py +++ b/onnxtr/models/preprocessor/base.py @@ -6,7 +6,6 @@ import math from typing import Any, List, Tuple, Union -import cv2 import numpy as np from onnxtr.transforms import Normalize, Resize diff --git a/onnxtr/transforms/base.py b/onnxtr/transforms/base.py index 52758f7..f176bb6 100644 --- a/onnxtr/transforms/base.py +++ b/onnxtr/transforms/base.py @@ -3,8 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -import math -from typing import Tuple, Union, Optional +from typing import Tuple, Union import cv2 import numpy as np @@ -12,74 +11,69 @@ __all__ = ["Resize", "Normalize"] -from torch.nn.functional import pad -from torchvision.transforms import functional as F -from torchvision.transforms import transforms as T - -import torch - - -class Resize(T.Resize): # TODO: Translate me correct !!! +class Resize: """Resize the input image to the given size""" def __init__( self, size: Union[int, Tuple[int, int]], - interpolation=F.InterpolationMode.BILINEAR, + interpolation=cv2.INTER_LINEAR, preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: - super().__init__(size, interpolation, antialias=True) + super().__init__() + self.size = size + self.interpolation = interpolation self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad if not isinstance(self.size, (int, tuple, list)): raise AssertionError("size should be either a tuple, a list or an int") - def forward( + def __call__( self, - img: torch.Tensor, - target: Optional[np.ndarray] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, np.ndarray]]: - img = np.transpose(img, (2, 0, 1)) - print(img.shape) - img = torch.from_numpy(img) - if isinstance(self.size, int): - target_ratio = img.shape[-2] / img.shape[-1] - else: - target_ratio = self.size[0] / self.size[1] - actual_ratio = img.shape[-2] / img.shape[-1] + img: np.ndarray, + ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: + h, w = img.shape[:2] + sh, sw = self.size + + # Calculate aspect ratio of the image + aspect = w / h - if not self.preserve_aspect_ratio or (target_ratio == actual_ratio and (isinstance(self.size, (tuple, list)))): - return super().forward(img).numpy() + # Compute scaling and padding sizes + if self.preserve_aspect_ratio: + if aspect > 1: # Horizontal image + new_w = sw + new_h = int(sw / aspect) + elif aspect < 1: # Vertical image + new_h = sh + new_w = int(sh * aspect) + else: # Square image + new_h, new_w = sh, sw + + img_resized = cv2.resize(img, (new_w, new_h), interpolation=self.interpolation) + + # Calculate padding + pad_top = max((sh - new_h) // 2, 0) + pad_bottom = max(sh - new_h - pad_top, 0) + pad_left = max((sw - new_w) // 2, 0) + pad_right = max(sw - new_w - pad_left, 0) + + # Pad the image + img_resized = cv2.copyMakeBorder( + img_resized, pad_top, pad_bottom, pad_left, pad_right, borderType=cv2.BORDER_CONSTANT, value=0 + ) + + # Ensure the image matches the target size by resizing it again if needed + img_resized = cv2.resize(img_resized, (sw, sh), interpolation=self.interpolation) else: - # Resize - if isinstance(self.size, (tuple, list)): - if actual_ratio > target_ratio: - tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1)) - else: - tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1]) - elif isinstance(self.size, int): # self.size is the longest side, infer the other - if img.shape[-2] <= img.shape[-1]: - tmp_size = (max(int(self.size * actual_ratio), 1), self.size) - else: - tmp_size = (self.size, max(int(self.size / actual_ratio), 1)) - - # Scale image - img = F.resize(img, tmp_size, self.interpolation, antialias=True) - raw_shape = img.shape[-2:] - if isinstance(self.size, (tuple, list)): - # Pad (inverted in pytorch) - _pad = (0, self.size[1] - img.shape[-1], 0, self.size[0] - img.shape[-2]) - if self.symmetric_pad: - half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2)) - _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1]) - img = pad(img, _pad) - - return img.numpy() + # Resize the image without preserving aspect ratio + img_resized = cv2.resize(img, (sw, sh), interpolation=self.interpolation) + + return img_resized def __repr__(self) -> str: - interpolate_str = self.interpolation.value + interpolate_str = self.interpolation _repr = f"output_size={self.size}, interpolation='{interpolate_str}'" if self.preserve_aspect_ratio: _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" @@ -107,15 +101,7 @@ def __call__( img: np.ndarray, ) -> np.ndarray: # Normalize image - print(self.mean, self.std) - print(img.shape) - img = np.transpose(img, (0, 3, 2, 1)) - mean = np.array(self.mean).astype(img.dtype) - std = np.array(self.std).astype(img.dtype) - img = (img - mean) / std - img = np.transpose(img, (0, 3, 2, 1)) - print(img.shape) - return img + return (img - np.array(self.mean).astype(img.dtype)) / np.array(self.std).astype(img.dtype) def __repr__(self) -> str: _repr = f"mean={self.mean}, std={self.std}"