add: docling documentloader

obot-platform · Nov 11, 2024 · eb98b32 · eb98b32
1 parent 810f1c8
commit eb98b32
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 0 deletions.
diff --git a/knowledge/documentloaders/docling/load.py b/knowledge/documentloaders/docling/load.py
@@ -0,0 +1,28 @@
+import os
+
+from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
+from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+# INPUT and OUTPUT paths come from the environment. Indexing os.environ fails
+# fast with a KeyError naming the missing variable, before any conversion runs.
+source = os.environ["INPUT"]
+output_path = os.environ["OUTPUT"]
+
+# Value assigned to the PDF pipeline's images_scale option below.
+IMAGE_RESOLUTION_SCALE = 2.0
+
+pipeline_options = PdfPipelineOptions()
+pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
+pipeline_options.generate_picture_images = True
+
+converter = DocumentConverter(
+    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
+)
+result = converter.convert(source)
+
+# TODO: send embedded images to VLM for captioning
+# Write as UTF-8 explicitly so the output does not depend on the locale encoding.
+with open(output_path, "w", encoding="utf-8") as f:
+    f.write(result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED))
diff --git a/knowledge/documentloaders/docling/requirements.txt b/knowledge/documentloaders/docling/requirements.txt
@@ -0,0 +1 @@
+docling
diff --git a/knowledge/documentloaders/docling/tool.gpt b/knowledge/documentloaders/docling/tool.gpt
@@ -0,0 +1,8 @@
+Name: Docling-Parse PDF Document Loader
+Params: input: Path to input PDF file
+Params: output: Path to output file where the extracted markdown content should be written to. Should end with .md
+metadata: supported-file-type: pdf
+
+#!/usr/bin/env python3 ${GPTSCRIPT_TOOL_DIR}/load.py
+
+