Skip to content

Commit

Permalink
Merge pull request #167 from JSv4/JSv4/fix-parsing-issue-improve-test…
Browse files Browse the repository at this point in the history
…s-add-ocr-detection

Dynamically Apply OCR, Improve PDF Utilities and Tests
  • Loading branch information
JSv4 authored Jul 22, 2024
2 parents e7be2db + d972338 commit 9092561
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 6 deletions.
4 changes: 3 additions & 1 deletion config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,9 @@
NLM_INGESTOR_ACTIVE = env.bool(
"NLM_INGESTOR_ACTIVE", False
) # Use nlm-ingestor where this is True... otherwise PAWLs
NLM_INGEST_USE_OCR = False # IF True, always tell nlm-ingestor to use OCR (Tesseract)
NLM_INGEST_USE_OCR = (
True # IF True, allow ingestor to use OCR when no text found in pdf.
)
NLM_INGEST_HOSTNAME = (
"http://nlm-ingestor:5001" # Hostname to send nlm-ingestor REST requests to
)
Expand Down
7 changes: 6 additions & 1 deletion opencontractserver/tasks/doc_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from opencontractserver.types.enums import PermissionTypes
from opencontractserver.utils.etl import build_document_export, pawls_bbox_to_funsd_box
from opencontractserver.utils.pdf import (
check_if_pdf_needs_ocr,
extract_pawls_from_pdfs_bytes,
split_pdf_into_images,
)
Expand Down Expand Up @@ -236,6 +237,10 @@ def nlm_ingest_pdf(user_id: int, doc_id: int) -> list[tuple[int, str]]:
doc_path = doc.pdf_file.name
doc_file = default_storage.open(doc_path, mode="rb")

# Check if OCR is needed
needs_ocr = check_if_pdf_needs_ocr(doc_file)
logger.debug(f"Document {doc_id} needs OCR: {needs_ocr}")

if settings.NLM_INGEST_API_KEY is not None:
headers = {"API_KEY": settings.NLM_INGEST_API_KEY}
else:
Expand All @@ -244,7 +249,7 @@ def nlm_ingest_pdf(user_id: int, doc_id: int) -> list[tuple[int, str]]:
files = {"file": doc_file}
params = {
"calculate_opencontracts_data": "yes",
"applyOcr": "yes" if settings.NLM_INGEST_USE_OCR else "no",
"applyOcr": "yes" if needs_ocr and settings.NLM_INGEST_USE_OCR else "no",
} # Ensures calculate_opencontracts_data is set to True

response = requests.post(
Expand Down
1 change: 1 addition & 0 deletions opencontractserver/tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

# files for nlm ingestor pipeline test
NLM_INGESTOR_SAMPLE_PDF = pathlib.Path(__file__).parent / "sample.pdf"
NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR = pathlib.Path(__file__).parent / "needs_ocr.pdf"
NLM_INGESTOR_EXPECTED_JSON = (
pathlib.Path(__file__).parent / "nlm_ingestor_output_for_sample_pdf.json"
)
Expand Down
Binary file added opencontractserver/tests/fixtures/needs_ocr.pdf
Binary file not shown.
70 changes: 70 additions & 0 deletions opencontractserver/tests/test_pdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import io
import os
import tempfile

from django.test import TestCase

from opencontractserver.tests.fixtures import (
NLM_INGESTOR_SAMPLE_PDF,
NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR,
)
from opencontractserver.utils.pdf import (
base_64_encode_bytes,
check_if_pdf_needs_ocr,
convert_hex_to_rgb_tuple,
createHighlight,
split_pdf_into_images,
)


class PDFUtilsTestCase(TestCase):
    """Unit tests for the PDF helpers in ``opencontractserver.utils.pdf``."""

    def setUp(self):
        # Load the raw bytes of the two PDF fixtures. The tests below expect the
        # first to contain extractable text and the second to require OCR.
        self.sample_pdf_content = NLM_INGESTOR_SAMPLE_PDF.read_bytes()
        self.need_ocr_pdf_content = NLM_INGESTOR_SAMPLE_PDF_NEEDS_OCR.read_bytes()

    def test_check_if_pdf_needs_ocr_with_text(self):
        pdf_stream = io.BytesIO(self.sample_pdf_content)
        needs_ocr = check_if_pdf_needs_ocr(pdf_stream)
        self.assertFalse(needs_ocr)
        # The helper rewinds the stream so the caller can re-read/upload it.
        self.assertEqual(pdf_stream.tell(), 0)

    def test_check_if_pdf_needs_ocr_without_text(self):
        # Fixture PDF that is expected to yield (almost) no extractable text.
        pdf_stream = io.BytesIO(self.need_ocr_pdf_content)
        needs_ocr = check_if_pdf_needs_ocr(pdf_stream)
        self.assertTrue(needs_ocr)
        # The helper rewinds the stream so the caller can re-read/upload it.
        self.assertEqual(pdf_stream.tell(), 0)

    def test_base_64_encode_bytes(self):
        test_bytes = b"Hello, World!"
        encoded = base_64_encode_bytes(test_bytes)
        self.assertEqual(encoded, "SGVsbG8sIFdvcmxkIQ==")

    def test_convert_hex_to_rgb_tuple(self):
        hex_color = "FF8000"
        rgb_tuple = convert_hex_to_rgb_tuple(hex_color)
        self.assertEqual(rgb_tuple, (255, 128, 0))

    def test_create_highlight(self):
        highlight = createHighlight(
            x1=10,
            y1=20,
            x2=30,
            y2=40,
            meta={"author": "Test Author", "contents": "Test Contents"},
            color=(1.0, 0.5, 0.0),
        )
        self.assertEqual(highlight["/Type"], "/Annot")
        self.assertEqual(highlight["/Subtype"], "/Highlight")
        self.assertEqual(highlight["/T"], "Test Author")
        self.assertEqual(highlight["/Contents"], "Test Contents")

    def test_split_pdf_into_images(self):
        with tempfile.TemporaryDirectory() as temp_dir:
            result = split_pdf_into_images(self.need_ocr_pdf_content, temp_dir)

            # The fixture is expected to produce exactly one PNG page image.
            self.assertEqual(len(result), 1)
            self.assertTrue(all(path.endswith(".png") for path in result))

            # Verify that the files were actually written to disk.
            for path in result:
                self.assertTrue(os.path.exists(path))
39 changes: 35 additions & 4 deletions opencontractserver/utils/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from io import BytesIO

from django.conf import settings
from PyPDF2 import PdfReader
from PyPDF2.generic import (
ArrayObject,
DictionaryObject,
Expand Down Expand Up @@ -135,8 +136,19 @@ def split_pdf_into_images(
page_paths = []

try:

# Ensure target_format is uppercase
target_format = target_format.upper()
if target_format not in ["PNG", "JPEG"]:
raise ValueError(f"Unsupported target format: {target_format}")

# TODO - make sure target image resolution is compatible with PAWLS x,y coord system
images = convert_from_bytes(pdf_bytes, size=(754, 1000))
print(f"PDF images: {len(images)}")

# Determine file extension and content type
file_extension = ".png" if target_format == "PNG" else ".jpg"
content_type = f"image/{target_format.lower()}"

if settings.USE_AWS:
import boto3
Expand All @@ -153,24 +165,43 @@ def split_pdf_into_images(
img.save(img_bytes_stream, target_format)

if settings.USE_AWS:
page_path = f"{storage_path}/{uuid.uuid4()}.pdf"
import boto3

s3 = boto3.client("s3")
page_path = f"{storage_path}/{uuid.uuid4()}{file_extension}"
s3.put_object(
Key=page_path,
Bucket=settings.AWS_STORAGE_BUCKET_NAME,
Body=img_bytes_stream.getvalue(),
ContentType=content_type,
)
else:
pdf_fragment_folder_path = pathlib.Path(storage_path)
pdf_fragment_folder_path.mkdir(parents=True, exist_ok=True)
pdf_fragment_path = pdf_fragment_folder_path / f"{uuid.uuid4()}.pdf"
pdf_fragment_path = (
pdf_fragment_folder_path / f"{uuid.uuid4()}{file_extension}"
)
with pdf_fragment_path.open("wb") as f:
f.write(img_bytes_stream.getvalue())

page_path = pdf_fragment_path.resolve().__str__()
page_path = str(pdf_fragment_path.resolve())

page_paths.append(page_path)

except Exception as e:
logger.error(f"split_pdf_into_images() - failed due to unexpected error: {e}")

return page_paths


def check_if_pdf_needs_ocr(file_object, threshold=10):
    """Return True when the PDF has too little extractable text and likely needs OCR.

    Text is extracted page by page with PyPDF2 and concatenated. The PDF is
    considered to need OCR when the whitespace-stripped length of that text is
    below ``threshold``.

    Args:
        file_object: A seekable binary file-like object containing the PDF.
        threshold: Minimum number of extracted characters (after stripping
            leading/trailing whitespace) for the PDF to count as having text.

    Returns:
        bool: True if fewer than ``threshold`` characters could be extracted.

    The file pointer is always reset to the start before returning (or before
    an exception propagates) so the caller can reuse ``file_object``.
    """
    try:
        pdf_reader = PdfReader(file_object)
        extracted_text = ""

        for page in pdf_reader.pages:
            # Defensive: treat an empty/None extraction result as no text.
            extracted_text += page.extract_text() or ""

            # The stripped length can only grow as more pages are appended, so
            # once the threshold is reached the answer is settled and the
            # remaining pages need not be parsed.
            if len(extracted_text.strip()) >= threshold:
                return False

        # Covers PDFs with no pages or with too little text overall.
        return len(extracted_text.strip()) < threshold
    finally:
        # Reset file pointer to the beginning for subsequent use, even on error.
        file_object.seek(0)

0 comments on commit 9092561

Please sign in to comment.