Merge pull request #435 from VikParuchuri/dev

Dev
VikParuchuri · Jan 2, 2025 · ab22a43 · ab22a43
2 parents a1708e8 + b42f10d
commit ab22a43
Show file tree

Hide file tree

Showing 46 changed files with 2,450 additions and 780 deletions.
diff --git a/README.md b/README.md
@@ -5,10 +5,11 @@ Marker converts PDFs to markdown, JSON, and HTML quickly and accurately.
 - Supports a wide range of documents
 - Supports all languages
 - Removes headers/footers/other artifacts
-- Formats tables and code blocks
+- Formats tables, forms, and code blocks
 - Extracts and saves images along with the markdown
 - Converts equations to latex
 - Easily extensible with your own formatting and logic
+- Optionally boost accuracy with an LLM
 - Works on GPU, CPU, or MPS
 
 ## How it works
@@ -18,6 +19,7 @@ Marker is a pipeline of deep learning models:
 - Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
 - Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
 - Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [tabled](https://github.com/VikParuchuri/tabled))
+- Optionally use an LLM to improve quality
 - Combine blocks and postprocess complete text
 
 It only uses models where necessary, which improves speed and accuracy.
@@ -65,6 +67,8 @@ PDF is a tricky format, so marker will not always work perfectly.  Here are some
 - Forms are not converted optimally
 - Very complex layouts, with nested tables and forms, may not work
 
+Note: Passing the `--use_llm` flag will largely resolve these issues.
+
 # Installation
 
 You'll need python 3.10+ and PyTorch.  You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine.  See [here](https://pytorch.org/get-started/locally/) for more details.
@@ -99,13 +103,16 @@ marker_single /path/to/file.pdf
 
 Options:
 - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
-- `--debug`: Enable debug mode for additional logging and diagnostic information.
 - `--output_format [markdown|json|html]`: Specify the format for the output results.
+- `--use_llm`: Uses an LLM to improve accuracy.  You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
+- `--disable_image_extraction`: Don't extract images from the PDF.  If you also specify `--use_llm`, then images will be replaced with a description.
 - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
 - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
+- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
+- `--debug`: Enable debug mode for additional logging and diagnostic information.
 - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
 - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
-- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "eng,fra,deu"` for English, French, and German.
+- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
 - `config --help`: List all available builders, processors, and converters, and their associated configuration.  These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
 
 The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you don't need OCR, marker can work with any language.
@@ -127,7 +134,6 @@ NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
 
 - `NUM_DEVICES` is the number of GPUs to use.  Should be `2` or greater.
 - `NUM_WORKERS` is the number of parallel processes to run on each GPU.
-- 
 
 ## Use from python
 
@@ -149,7 +155,7 @@ text, _, images = text_from_rendered(rendered)
 
 ### Custom configuration
 
-You can also pass configuration using the `ConfigParser`:
+You can pass configuration using the `ConfigParser`:
 
 ```python
 from marker.converters.pdf import PdfConverter
@@ -171,6 +177,26 @@ converter = PdfConverter(
 rendered = converter("FILEPATH")
 ```
 
+### Extract blocks
+
+Each document consists of one or more pages.  Pages contain blocks, which can themselves contain other blocks.  It's possible to programmatically manipulate these blocks.
+
+Here's an example of extracting all forms from a document:
+
+```python
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.schema import BlockTypes
+
+converter = PdfConverter(
+    artifact_dict=create_model_dict(),
+)
+document = converter.build_document("FILEPATH")
+forms = document.contained_blocks((BlockTypes.Form,))
+```
+
+Look at the processors for more examples of extracting and manipulating blocks.
+
 # Output Formats
 
 ## Markdown
@@ -312,6 +338,7 @@ Note that this is not a very robust API, and is only intended for small-scale us
 
 There are some settings that you may find useful if things aren't working the way you expect:
 
+- If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality.  You must set `GOOGLE_API_KEY` to a Gemini API key for this to work.
 - Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document.
 - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
 - If you're getting out of memory errors, decrease worker count.  You can also try splitting up long PDFs into multiple files.

diff --git a/convert.py b/convert.py
@@ -1,5 +1,7 @@
 import os
 
+os.environ["GRPC_VERBOSITY"] = "ERROR"
+os.environ["GLOG_minloglevel"] = "2"
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
 os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
 

diff --git a/convert_single.py b/convert_single.py
@@ -1,9 +1,10 @@
 import os
 
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
+os.environ["GRPC_VERBOSITY"] = "ERROR"
+os.environ["GLOG_minloglevel"] = "2"
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
 
 import time
-
 import click
 
 from marker.config.parser import ConfigParser

diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -5,6 +5,10 @@
 from surya.schema import LayoutResult
 from surya.model.layout.encoderdecoder import SuryaLayoutModel
 
+from surya.ocr_error import batch_ocr_error_detection
+from surya.schema import OCRErrorDetectionResult
+from surya.model.ocr_error.model import DistilBertForSequenceClassification
+
 from marker.settings import settings
 from marker.builders import BaseBuilder
 from marker.providers import ProviderOutput, ProviderPageLines
@@ -37,15 +41,21 @@ class LayoutBuilder(BaseBuilder):
         document_ocr_threshold (float):
             The minimum ratio of pages that must pass the layout coverage check
             to avoid OCR. Default is 0.8.
+
+        error_model_segment_length (int):
+            The maximum number of characters to send to the OCR error model.
+            Default is 512.
     """
     batch_size = None
     layout_coverage_min_lines = 1
     layout_coverage_threshold = .1
     document_ocr_threshold = .8
+    error_model_segment_length = 512
     excluded_for_coverage = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
 
-    def __init__(self, layout_model: SuryaLayoutModel, config=None):
+    def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
         self.layout_model = layout_model
+        self.ocr_error_model = ocr_error_model
 
         super().__init__(config)
 
@@ -71,15 +81,41 @@ def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
         )
         return layout_results
 
+    def surya_ocr_error_detection(self, pages:List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
+        page_texts = []
+        for document_page in pages:
+            page_text = ''
+            provider_lines = provider_page_lines.get(document_page.page_id, [])
+            for line in provider_lines:
+                page_text += ' '.join([s.text for s in line.spans])
+
+            # Sample text from the middle
+            if len(page_text) > 0:
+                page_text_middle = len(page_text) // 2
+                page_text_start = max(0, page_text_middle - self.error_model_segment_length // 2)
+                page_text_end = page_text_start + self.error_model_segment_length
+                page_text = page_text[page_text_start:page_text_end]
+
+            page_texts.append(page_text)
+
+        ocr_error_detection_results = batch_ocr_error_detection(
+            page_texts,
+            self.ocr_error_model,
+            self.ocr_error_model.tokenizer,
+            batch_size=int(self.get_batch_size())       #TODO Better Multiplier
+        )
+        return ocr_error_detection_results
+
     def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[LayoutResult]):
         for page, layout_result in zip(pages, layout_results):
             layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
             provider_page_size = page.polygon.size
-            page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
+            page.layout_sliced = layout_result.sliced  # This indicates if the page was sliced by the layout model
             for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                 block_cls = get_block_class(BlockTypes[bbox.label])
                 layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon))
                 layout_block.polygon = layout_block.polygon.rescale(layout_page_size, provider_page_size)
+                layout_block.top_k = {BlockTypes[label]: prob for (label, prob) in bbox.top_k.items()}
                 page.add_structure(layout_block)
 
             # Ensure page has non-empty structure
@@ -91,16 +127,17 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
                 page.children = []
 
     def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines):
+        ocr_error_detection_labels = self.surya_ocr_error_detection(document_pages, provider_page_lines).labels
+
         good_pages = []
-        for document_page in document_pages:
+        for (document_page, ocr_error_detection_label) in zip(document_pages, ocr_error_detection_labels):
             provider_lines = provider_page_lines.get(document_page.page_id, [])
-            good_pages.append(self.check_layout_coverage(document_page, provider_lines))
+            good_pages.append(bool(provider_lines) and self.check_layout_coverage(document_page, provider_lines) and (ocr_error_detection_label != "bad"))
 
         ocr_document = sum(good_pages) / len(good_pages) < self.document_ocr_threshold
         for idx, document_page in enumerate(document_pages):
             provider_lines = provider_page_lines.get(document_page.page_id, [])
             needs_ocr = not good_pages[idx]
-
             if needs_ocr and ocr_document:
                 document_page.text_extraction_method = "surya"
                 continue
@@ -128,7 +165,7 @@ def check_layout_coverage(
             total_blocks += 1
             intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0)
 
-            if intersecting_lines > self.layout_coverage_min_lines:
+            if intersecting_lines >= self.layout_coverage_min_lines:
                 covered_blocks += 1
 
             if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text:
@@ -141,4 +178,3 @@ def check_layout_coverage(
         if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
             text_okay = True
         return text_okay
-