Merge pull request #167 from JSv4/JSv4/fix-parsing-issue-improve-tests-add-ocr-detection

Dynamically Apply OCR, Improve PDF Utilities and Tests
JSv4 · Jul 22, 2024 · 9092561 · 9092561
2 parents e7be2db + d972338
commit 9092561
Show file tree

Hide file tree

Showing 6 changed files with 115 additions and 6 deletions.
diff --git a/config/settings/base.py b/config/settings/base.py
@@ -462,7 +462,9 @@
 NLM_INGESTOR_ACTIVE = env.bool(
     "NLM_INGESTOR_ACTIVE", False
 )  # Use nlm-ingestor where this is True... otherwise PAWLs
-NLM_INGEST_USE_OCR = False  # IF True, always tell nlm-ingestor to use OCR (Tesseract)
+NLM_INGEST_USE_OCR = (
+    True  # IF True, allow ingestor to use OCR when no text found in pdf.
+)
 NLM_INGEST_HOSTNAME = (
     "http://nlm-ingestor:5001"  # Hostname to send nlm-ingestor REST requests to
 )

diff --git a/opencontractserver/tasks/doc_tasks.py b/opencontractserver/tasks/doc_tasks.py
@@ -36,6 +36,7 @@
 from opencontractserver.types.enums import PermissionTypes
 from opencontractserver.utils.etl import build_document_export, pawls_bbox_to_funsd_box
 from opencontractserver.utils.pdf import (
+    check_if_pdf_needs_ocr,
     extract_pawls_from_pdfs_bytes,
     split_pdf_into_images,
 )
@@ -236,6 +237,10 @@ def nlm_ingest_pdf(user_id: int, doc_id: int) -> list[tuple[int, str]]:
     doc_path = doc.pdf_file.name
     doc_file = default_storage.open(doc_path, mode="rb")
 
+    # Check if OCR is needed
+    needs_ocr = check_if_pdf_needs_ocr(doc_file)
+    logger.debug(f"Document {doc_id} needs OCR: {needs_ocr}")
+
     if settings.NLM_INGEST_API_KEY is not None:
         headers = {"API_KEY": settings.NLM_INGEST_API_KEY}
     else:
@@ -244,7 +249,7 @@ def nlm_ingest_pdf(user_id: int, doc_id: int) -> list[tuple[int, str]]:
     files = {"file": doc_file}
     params = {
         "calculate_opencontracts_data": "yes",
-        "applyOcr": "yes" if settings.NLM_INGEST_USE_OCR else "no",
+        "applyOcr": "yes" if needs_ocr and settings.NLM_INGEST_USE_OCR else "no",
     }  # Ensures calculate_opencontracts_data is set to True
 
     response = requests.post(

diff --git a/opencontractserver/tests/fixtures/__init__.py b/opencontractserver/tests/fixtures/__init__.py
@@ -28,6 +28,7 @@
 
 # files for nlm ingestor pipeline test
 NLM_INGESTOR_SAMPLE_PDF = pathlib.Path(__file__).parent / "sample.pdf"
+NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR = pathlib.Path(__file__).parent / "needs_ocr.pdf"
 NLM_INGESTOR_EXPECTED_JSON = (
     pathlib.Path(__file__).parent / "nlm_ingestor_output_for_sample_pdf.json"
 )

diff --git a/opencontractserver/tests/fixtures/needs_ocr.pdf b/opencontractserver/tests/fixtures/needs_ocr.pdf
diff --git a/opencontractserver/tests/test_pdf_utils.py b/opencontractserver/tests/test_pdf_utils.py
@@ -0,0 +1,70 @@
+import io
+import os
+import tempfile
+
+from django.test import TestCase
+
+from opencontractserver.tests.fixtures import (
+    NLM_INGESTOR_SAMPLE_PDF,
+    NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR,
+)
+from opencontractserver.utils.pdf import (
+    base_64_encode_bytes,
+    check_if_pdf_needs_ocr,
+    convert_hex_to_rgb_tuple,
+    createHighlight,
+    split_pdf_into_images,
+)
+
+
+class PDFUtilsTestCase(TestCase):
+    """Unit tests for the helpers imported from opencontractserver.utils.pdf."""
+
+    def setUp(self):
+        # Load the raw bytes of the two fixture PDFs once per test: one the
+        # tests expect to contain extractable text, one they expect to need OCR.
+        self.sample_pdf_content = NLM_INGESTOR_SAMPLE_PDF.read_bytes()
+        self.need_ocr_pdf_content = NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR.read_bytes()
+
+    def test_check_if_pdf_needs_ocr_with_text(self):
+        """The sample fixture should be reported as not needing OCR."""
+        needs_ocr = check_if_pdf_needs_ocr(io.BytesIO(self.sample_pdf_content))
+        self.assertFalse(needs_ocr)
+
+    def test_check_if_pdf_needs_ocr_without_text(self):
+        """The needs_ocr fixture should be reported as needing OCR."""
+        needs_ocr = check_if_pdf_needs_ocr(io.BytesIO(self.need_ocr_pdf_content))
+        self.assertTrue(needs_ocr)
+
+    def test_base_64_encode_bytes(self):
+        """Encoding known bytes yields the expected base64 string."""
+        test_bytes = b"Hello, World!"
+        encoded = base_64_encode_bytes(test_bytes)
+        self.assertEqual(encoded, "SGVsbG8sIFdvcmxkIQ==")
+
+    def test_convert_hex_to_rgb_tuple(self):
+        """A six-digit hex color converts to its (R, G, B) integer tuple."""
+        hex_color = "FF8000"
+        rgb_tuple = convert_hex_to_rgb_tuple(hex_color)
+        self.assertEqual(rgb_tuple, (255, 128, 0))
+
+    def test_create_highlight(self):
+        """createHighlight returns a highlight annotation carrying the meta."""
+        highlight = createHighlight(
+            x1=10,
+            y1=20,
+            x2=30,
+            y2=40,
+            meta={"author": "Test Author", "contents": "Test Contents"},
+            color=(1.0, 0.5, 0.0),
+        )
+        self.assertEqual(highlight["/Type"], "/Annot")
+        self.assertEqual(highlight["/Subtype"], "/Highlight")
+        self.assertEqual(highlight["/T"], "Test Author")
+        self.assertEqual(highlight["/Contents"], "Test Contents")
+
+    def test_split_pdf_into_images(self):
+        """Splitting the needs_ocr fixture writes one .png file to disk."""
+        with tempfile.TemporaryDirectory() as temp_dir:
+            result = split_pdf_into_images(self.need_ocr_pdf_content, temp_dir)
+
+            # This test expects exactly one page image with a .png extension.
+            self.assertEqual(len(result), 1)
+            self.assertTrue(all(path.endswith(".png") for path in result))
+
+            # Verify that the files were actually created on disk.
+            for path in result:
+                self.assertTrue(os.path.exists(path))
diff --git a/opencontractserver/utils/pdf.py b/opencontractserver/utils/pdf.py
@@ -6,6 +6,7 @@
 from io import BytesIO
 
 from django.conf import settings
+from PyPDF2 import PdfReader
 from PyPDF2.generic import (
     ArrayObject,
     DictionaryObject,
@@ -135,8 +136,19 @@ def split_pdf_into_images(
     page_paths = []
 
     try:
+
+        # Ensure target_format is uppercase
+        target_format = target_format.upper()
+        if target_format not in ["PNG", "JPEG"]:
+            raise ValueError(f"Unsupported target format: {target_format}")
+
         # TODO - make sure target image resolution is compatible with PAWLS x,y coord system
         images = convert_from_bytes(pdf_bytes, size=(754, 1000))
+        print(f"PDF images: {len(images)}")
+
+        # Determine file extension and content type
+        file_extension = ".png" if target_format == "PNG" else ".jpg"
+        content_type = f"image/{target_format.lower()}"
 
         if settings.USE_AWS:
             import boto3
@@ -153,24 +165,43 @@ def split_pdf_into_images(
             img.save(img_bytes_stream, target_format)
 
             if settings.USE_AWS:
-                page_path = f"{storage_path}/{uuid.uuid4()}.pdf"
+                import boto3
+
+                s3 = boto3.client("s3")
+                page_path = f"{storage_path}/{uuid.uuid4()}{file_extension}"
                 s3.put_object(
                     Key=page_path,
                     Bucket=settings.AWS_STORAGE_BUCKET_NAME,
                     Body=img_bytes_stream.getvalue(),
+                    ContentType=content_type,
                 )
             else:
                 pdf_fragment_folder_path = pathlib.Path(storage_path)
                 pdf_fragment_folder_path.mkdir(parents=True, exist_ok=True)
-                pdf_fragment_path = pdf_fragment_folder_path / f"{uuid.uuid4()}.pdf"
+                pdf_fragment_path = (
+                    pdf_fragment_folder_path / f"{uuid.uuid4()}{file_extension}"
+                )
                 with pdf_fragment_path.open("wb") as f:
                     f.write(img_bytes_stream.getvalue())
-
-                page_path = pdf_fragment_path.resolve().__str__()
+                page_path = str(pdf_fragment_path.resolve())
 
             page_paths.append(page_path)
 
     except Exception as e:
         logger.error(f"split_pdf_into_images() - failed due to unexpected error: {e}")
 
     return page_paths
+
+
+def check_if_pdf_needs_ocr(file_object, threshold=10):
+    """
+    Report whether a PDF has so little extractable text that it likely needs OCR.
+
+    Text is extracted page by page with PyPDF2's PdfReader. The file pointer is
+    rewound to the start before returning (or before an exception propagates)
+    so the caller can reuse the same file object.
+
+    Args:
+        file_object: Seekable binary file-like object holding the PDF.
+        threshold: Number of extracted characters (surrounding whitespace
+            stripped) at or above which the PDF is treated as having text.
+
+    Returns:
+        True if fewer than ``threshold`` characters were extracted, else False.
+    """
+    total_text = ""
+
+    try:
+        pdf_reader = PdfReader(file_object)
+
+        for page in pdf_reader.pages:
+            # Defensive: treat a None/empty extraction result as no text.
+            total_text += page.extract_text() or ""
+
+            # Enough text found already; no need to parse the remaining pages.
+            if len(total_text.strip()) >= threshold:
+                return False
+    finally:
+        # Reset file pointer to the beginning for subsequent use, even when
+        # parsing fails part-way through.
+        file_object.seek(0)
+
+    # If the total extracted text is less than the threshold, it likely needs OCR
+    return len(total_text.strip()) < threshold