Skip to content

Commit

Permalink
refactor: 使用lazy_load方式加载pdf
Browse files (browse the repository at this point in the history)
  • Loading branch information
liuruibin committed Aug 16, 2024
1 parent c332a6c commit 0d59ab2
Showing 1 changed file with 3 additions and 6 deletions.
9 changes: 3 additions & 6 deletions apps/common/handle/impl/pdf_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@
re.compile("(?<!\n)\n\n+")]


def number_to_text(pdf_document, page_number):
    """Return the text content of a single page of a loaded PDF.

    Args:
        pdf_document: Indexable collection of page objects, each exposing a
            ``page_content`` attribute (at the call site this is the value
            returned by ``loader.load()``).
        page_number: Index of the page within ``pdf_document``; the caller
            iterates ``range(len(pdf_document))``, so it is zero-based.

    Returns:
        The ``page_content`` attribute of the selected page.
    """
    return pdf_document[page_number].page_content


def check_pdf_is_image(pdf_path):
try:
# 打开PDF文件
Expand Down Expand Up @@ -64,8 +60,9 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
loader = PyPDFLoader(temp_file_path, extract_images=True)
else:
loader = PyPDFLoader(temp_file_path, extract_images=False)
pdf_document = loader.load()
content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])

content = "\n".join([page.page_content for page in loader.lazy_load()])

if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
else:
Expand Down

0 comments on commit 0d59ab2

Please sign in to comment.