From 44869b4c62ba8737975e64a6b1ecb94ae08734dd Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 8 May 2024 16:02:18 +0200
Subject: [PATCH] Replace torch-based Resize with OpenCV, use ONNX weights URL, remove debug prints

---
 onnxtr/contrib/base.py                        |   2 +-
 .../models/classification/models/mobilenet.py |   2 +-
 onnxtr/models/classification/zoo.py           |  19 +---
 .../models/differentiable_binarization.py     |   9 +-
 onnxtr/models/detection/postprocessor/base.py |   4 +-
 onnxtr/models/engine.py                       |   3 +-
 onnxtr/models/preprocessor/base.py            |   1 -
 onnxtr/transforms/base.py                     | 106 ++++++++----------
 8 files changed, 53 insertions(+), 93 deletions(-)

diff --git a/onnxtr/contrib/base.py b/onnxtr/contrib/base.py
index f974294..08eb449 100644
--- a/onnxtr/contrib/base.py
+++ b/onnxtr/contrib/base.py
@@ -101,5 +101,5 @@ def __call__(self, inputs: List[np.ndarray]) -> Any:
             np.array([self.preprocess(img) for img in batch], dtype=np.float32) for batch in batched_inputs
         ]
 
-        outputs = [self.run(None, {model_inputs[0].name: batch}) for batch in processed_batches]
+        outputs = [self.session.run(None, {model_inputs[0].name: batch}) for batch in processed_batches]
         return self.postprocess(outputs, batched_inputs)
diff --git a/onnxtr/models/classification/models/mobilenet.py b/onnxtr/models/classification/models/mobilenet.py
index e979042..02f59c4 100644
--- a/onnxtr/models/classification/models/mobilenet.py
+++ b/onnxtr/models/classification/models/mobilenet.py
@@ -23,7 +23,7 @@
         "std": (0.299, 0.296, 0.301),
         "input_shape": (3, 256, 256),
         "classes": [0, -90, 180, 90],
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_crop_orientation-f0847a18.pt&src=0",
+        "url": "https://github.com/felixdittrich92/OnnxTR/releases/download/v0.0.1/mobilenet_v3_small_crop_orientation-5620cf7e.onnx",
     },
     "mobilenet_v3_small_page_orientation": {
         "mean": (0.694, 0.695, 0.693),
diff --git a/onnxtr/models/classification/zoo.py b/onnxtr/models/classification/zoo.py
index 35b077f..e83fd3d 100644
--- a/onnxtr/models/classification/zoo.py
+++ b/onnxtr/models/classification/zoo.py
@@ -3,7 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any, Dict, List
+from typing import Any, List
 
 from .. import classification
 from ..preprocessor import PreProcessor
@@ -13,23 +13,6 @@
 
 ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_crop_orientation", "mobilenet_v3_small_page_orientation"]
 
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    "mobilenet_v3_small_crop_orientation": {
-        "mean": (0.694, 0.695, 0.693),
-        "std": (0.299, 0.296, 0.301),
-        "input_shape": (3, 256, 256),
-        "classes": [0, -90, 180, 90],
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_crop_orientation-f0847a18.pt&src=0",  # TODO
-    },
-    "mobilenet_v3_small_page_orientation": {
-        "mean": (0.694, 0.695, 0.693),
-        "std": (0.299, 0.296, 0.301),
-        "input_shape": (3, 512, 512),
-        "classes": [0, -90, 180, 90],
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_page_orientation-8e60325c.pt&src=0",  # TODO
-    },
-}
-
 
 def _orientation_predictor(arch: str, **kwargs: Any) -> OrientationPredictor:
     if arch not in ORIENTATION_ARCHS:
diff --git a/onnxtr/models/detection/models/differentiable_binarization.py b/onnxtr/models/detection/models/differentiable_binarization.py
index c98063d..5597729 100644
--- a/onnxtr/models/detection/models/differentiable_binarization.py
+++ b/onnxtr/models/detection/models/differentiable_binarization.py
@@ -51,7 +51,7 @@ class DBNet(Engine):
     def __init__(
         self,
         model_path,
-        bin_thresh: float = 0.1,
+        bin_thresh: float = 0.3,
         box_thresh: float = 0.1,
         assume_straight_pages: bool = True,
         cfg: Optional[Dict[str, Any]] = None,
@@ -59,7 +59,6 @@ def __init__(
         super().__init__(url=model_path)
         self.cfg = cfg
         self.assume_straight_pages = assume_straight_pages
-        print(f"ASSUME STRAIGHT PAGES: {assume_straight_pages}")
         self.postprocessor = GeneralDetectionPostProcessor(
             assume_straight_pages=self.assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh
         )
@@ -78,12 +77,6 @@ def __call__(
         if return_model_output:
             out["out_map"] = prob_map
 
-        print(prob_map.shape)
-        import cv2
-
-        cv2.imwrite("prob_map.jpg", prob_map[0, 0, :, :] * 255)
-        print(prob_map)
-
         out["preds"] = [
             dict(zip(["words"], preds)) for preds in self.postprocessor(np.transpose(prob_map, (0, 2, 3, 1)))
         ]
diff --git a/onnxtr/models/detection/postprocessor/base.py b/onnxtr/models/detection/postprocessor/base.py
index ac6831f..ab6dd4a 100644
--- a/onnxtr/models/detection/postprocessor/base.py
+++ b/onnxtr/models/detection/postprocessor/base.py
@@ -29,12 +29,12 @@ class GeneralDetectionPostProcessor(DetectionPostProcessor):
 
     def __init__(
         self,
-        bin_thresh: float = 0.3,
+        bin_thresh: float = 0.1,
         box_thresh: float = 0.1,
         assume_straight_pages: bool = True,
     ) -> None:
         super().__init__(box_thresh, bin_thresh, assume_straight_pages)
-        self.unclip_ratio = 1.0
+        self.unclip_ratio = 1.5  # presumably the expansion ratio used by polygon_to_box; TODO confirm
 
     def polygon_to_box(
         self,
diff --git a/onnxtr/models/engine.py b/onnxtr/models/engine.py
index f1e33f8..a9a6a41 100644
--- a/onnxtr/models/engine.py
+++ b/onnxtr/models/engine.py
@@ -27,7 +27,6 @@ def __init__(self, url: str, **kwargs: Any) -> None:
         )
 
     def run(self, inputs: np.ndarray) -> List[np.ndarray]:
-        # inputs = np.transpose(inputs, (0, 3, 1, 2)).astype(np.float32)  # TODO: Can we remove this maybe ?
+        inputs = np.transpose(inputs, (0, 3, 1, 2)).astype(np.float32)  # presumably NHWC -> NCHW; TODO confirm
         logits = self.runtime.run(["logits"], {"input": inputs})[0]
-        print(f"logits: {logits.shape}")
         return logits
diff --git a/onnxtr/models/preprocessor/base.py b/onnxtr/models/preprocessor/base.py
index 9ca2029..b332561 100644
--- a/onnxtr/models/preprocessor/base.py
+++ b/onnxtr/models/preprocessor/base.py
@@ -6,7 +6,6 @@
 import math
 from typing import Any, List, Tuple, Union
 
-import cv2
 import numpy as np
 
 from onnxtr.transforms import Normalize, Resize
diff --git a/onnxtr/transforms/base.py b/onnxtr/transforms/base.py
index 52758f7..f176bb6 100644
--- a/onnxtr/transforms/base.py
+++ b/onnxtr/transforms/base.py
@@ -3,8 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import math
-from typing import Tuple, Union, Optional
+from typing import Tuple, Union
 
 import cv2
 import numpy as np
@@ -12,74 +11,69 @@
 __all__ = ["Resize", "Normalize"]
 
 
-from torch.nn.functional import pad
-from torchvision.transforms import functional as F
-from torchvision.transforms import transforms as T
-
-import torch
-
-
-class Resize(T.Resize):  # TODO: Translate me correct !!!
+class Resize:
     """Resize the input image to the given size"""
 
     def __init__(
         self,
         size: Union[int, Tuple[int, int]],
-        interpolation=F.InterpolationMode.BILINEAR,
+        interpolation=cv2.INTER_LINEAR,
         preserve_aspect_ratio: bool = False,
         symmetric_pad: bool = False,
     ) -> None:
-        super().__init__(size, interpolation, antialias=True)
+        super().__init__()
+        self.size = size  # int or (height, width); type-checked below
+        self.interpolation = interpolation  # OpenCV interpolation flag forwarded to cv2.resize
         self.preserve_aspect_ratio = preserve_aspect_ratio
         self.symmetric_pad = symmetric_pad
 
         if not isinstance(self.size, (int, tuple, list)):
             raise AssertionError("size should be either a tuple, a list or an int")
 
-    def forward(
+    def __call__(
         self,
-        img: torch.Tensor,
-        target: Optional[np.ndarray] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, np.ndarray]]:
-        img = np.transpose(img, (2, 0, 1))
-        print(img.shape)
-        img = torch.from_numpy(img)
-        if isinstance(self.size, int):
-            target_ratio = img.shape[-2] / img.shape[-1]
-        else:
-            target_ratio = self.size[0] / self.size[1]
-        actual_ratio = img.shape[-2] / img.shape[-1]
+        img: np.ndarray,
+    ) -> np.ndarray:
+        """Resize an image to the target size, optionally keeping its aspect ratio.
+
+        With `preserve_aspect_ratio`, the image is scaled to fit inside the target and zero-padded
+        to the exact target size; otherwise it is stretched to the target size directly.
+
+        Args:
+            img: image array whose first two axes are (height, width)
+
+        Returns:
+            the resized (and, if needed, padded) image
+        """
+        h, w = img.shape[:2]
+        # An int size means a square target; a tuple/list is (height, width)
+        sh, sw = (self.size, self.size) if isinstance(self.size, int) else self.size
 
-        if not self.preserve_aspect_ratio or (target_ratio == actual_ratio and (isinstance(self.size, (tuple, list)))):
-            return super().forward(img).numpy()
+        if self.preserve_aspect_ratio and h * sw != w * sh:
+            # Fit the image inside the target: the side that fills it first sets the scale
+            if h * sw > w * sh:
+                new_h, new_w = sh, max(int(sh * w / h), 1)
+            else:
+                new_h, new_w = max(int(sw * h / w), 1), sw
+            img_resized = cv2.resize(img, (new_w, new_h), interpolation=self.interpolation)
+
+            # Pad with zeros up to the target size: evenly on both sides when symmetric_pad is set
+            # (the extra pixel, if any, goes top/left), otherwise on the bottom/right only
+            pad_h, pad_w = sh - new_h, sw - new_w
+            pad_top = -(-pad_h // 2) if self.symmetric_pad else 0  # ceil(pad_h / 2)
+            pad_left = -(-pad_w // 2) if self.symmetric_pad else 0  # ceil(pad_w / 2)
+            pad_bottom, pad_right = pad_h - pad_top, pad_w - pad_left
+            img_resized = cv2.copyMakeBorder(
+                img_resized, pad_top, pad_bottom, pad_left, pad_right, borderType=cv2.BORDER_CONSTANT, value=0
+            )
         else:
-            # Resize
-            if isinstance(self.size, (tuple, list)):
-                if actual_ratio > target_ratio:
-                    tmp_size = (self.size[0], max(int(self.size[0] / actual_ratio), 1))
-                else:
-                    tmp_size = (max(int(self.size[1] * actual_ratio), 1), self.size[1])
-            elif isinstance(self.size, int):  # self.size is the longest side, infer the other
-                if img.shape[-2] <= img.shape[-1]:
-                    tmp_size = (max(int(self.size * actual_ratio), 1), self.size)
-                else:
-                    tmp_size = (self.size, max(int(self.size / actual_ratio), 1))
-
-            # Scale image
-            img = F.resize(img, tmp_size, self.interpolation, antialias=True)
-            raw_shape = img.shape[-2:]
-            if isinstance(self.size, (tuple, list)):
-                # Pad (inverted in pytorch)
-                _pad = (0, self.size[1] - img.shape[-1], 0, self.size[0] - img.shape[-2])
-                if self.symmetric_pad:
-                    half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2))
-                    _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1])
-                img = pad(img, _pad)
-
-            return img.numpy()
+            # Aspect ratio ignored or already matching: resize straight to the target
+            img_resized = cv2.resize(img, (sw, sh), interpolation=self.interpolation)
+
+        return img_resized
 
     def __repr__(self) -> str:
-        interpolate_str = self.interpolation.value
+        interpolate_str = self.interpolation
         _repr = f"output_size={self.size}, interpolation='{interpolate_str}'"
         if self.preserve_aspect_ratio:
             _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}"
@@ -107,15 +101,7 @@ def __call__(
         img: np.ndarray,
     ) -> np.ndarray:
         # Normalize image
-        print(self.mean, self.std)
-        print(img.shape)
-        img = np.transpose(img, (0, 3, 2, 1))
-        mean = np.array(self.mean).astype(img.dtype)
-        std = np.array(self.std).astype(img.dtype)
-        img = (img - mean) / std
-        img = np.transpose(img, (0, 3, 2, 1))
-        print(img.shape)
-        return img
+        return (img - np.array(self.mean).astype(img.dtype)) / np.array(self.std).astype(img.dtype)
 
     def __repr__(self) -> str:
         _repr = f"mean={self.mean}, std={self.std}"