Skip to content

Commit

Permalink
add: docling documentloader
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed Nov 11, 2024
1 parent 810f1c8 commit eb98b32
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
28 changes: 28 additions & 0 deletions knowledge/documentloaders/docling/load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Convert a single PDF to Markdown with docling.
#
# The script is configured entirely through environment variables:
#   INPUT  - path of the PDF document to convert
#   OUTPUT - path of the Markdown file to write
#
# Both variables are validated up front so that a misconfigured invocation
# fails with a clear message *before* the (expensive) conversion runs.
source = os.getenv("INPUT")
if not source:
    raise SystemExit("INPUT environment variable must be set to the input PDF path")

output_path = os.getenv("OUTPUT")
if not output_path:
    raise SystemExit("OUTPUT environment variable must be set to the output file path")


# Value assigned to docling's `images_scale` pipeline option below.
IMAGE_RESOLUTION_SCALE = 2.0

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
# Picture images are requested because the Markdown export below uses
# ImageRefMode.EMBEDDED.
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)
result = converter.convert(source)

# TODO: send embedded images to VLM for captioning


# Write with an explicit encoding: the exported Markdown may contain arbitrary
# Unicode text, and the platform default encoding is not guaranteed to be UTF-8.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED))
1 change: 1 addition & 0 deletions knowledge/documentloaders/docling/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
docling
docling-parse
8 changes: 8 additions & 0 deletions knowledge/documentloaders/docling/tool.gpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Name: Docling-Parse PDF Document Loader
Params: input: Path to input PDF file
Params: output: Path to the output file where the extracted markdown content should be written. Should end with .md
metadata: supported-file-type: pdf

#!/usr/bin/env python3 ${GPTSCRIPT_TOOL_DIR}/load.py


0 comments on commit eb98b32

Please sign in to comment.