Skip to content

Commit

Permalink
Merge pull request #55 from gomate-community/pipeline
Browse files Browse the repository at this point in the history
Pipeline
  • Loading branch information
yanqiangmiffy authored Sep 7, 2024
2 parents 38886f9 + 41dc5ea commit 28f9fa5
Show file tree
Hide file tree
Showing 12 changed files with 59 additions and 40 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ data/docs
data/nltk_data
huqie
dist
build
build
data/huqie.zip
data/nltk_data.zip
examples/retrievers/indexs
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ GoMate框架的设计核心在于其**高度的可配置性和模块化**,使

## 🏗️ 更新记录

- 添加MinerU文档解析:一站式开源高质量数据提取工具,支持PDF/网页/多格式电子书提取`[20240907] `
- 添加[MinerU文档解析](https://github.com/gomate-community/GoMate/blob/main/docs/mineru.md):一站式开源高质量数据提取工具,支持PDF/网页/多格式电子书提取`[20240907] `
- RAPTOR:递归树检索器实现
- 支持多种文件解析并且模块化目前支持解析的文件类型包括:`text`,`docx`,`ppt`,`excel`,`html`,`pdf`,`md`
- 优化了`DenseRetriever`,支持索引构建,增量追加以及索引保存,保存内容包括文档、向量以及索引
Expand Down
10 changes: 2 additions & 8 deletions docs/mineru.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,8 @@ PDF文档中包含大量知识信息,然而提取高质量的PDF内容并非

![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)

返回内容字段包括:dict_keys(['layout', 'info', 'content'])
返回内容字段包括:dict_keys(['layout', 'info', 'content_list', 'md_content'])
其中content_list是一个字典列表:
```json
{
'type': 'text',
'text': '现在我们知道:价值实体就是劳动;劳动量的尺度就是劳动持续时间。',
'page_idx': 5
}
```



8 changes: 7 additions & 1 deletion examples/parsers/common_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from gomate.modules.document.common_parser import CommonParser


# cp=CommonParser()
# content=cp.parse('../../data/docs/计算所现行制度汇编202406/计算所现行制度汇编202406/综合处/中国科学院计算技术研究所综合安全管理制度_20240531修订版.pdf')
# print(content)



cp=CommonParser()
content=cp.parse('/data/users/searchgpt/yq/GoMate_dev/data/docs/Agent.docx')
content=cp.parse('H:/2024-Xfyun-RAG/data/corpus.txt')
print(content)
2 changes: 1 addition & 1 deletion examples/parsers/pdfparser_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# for chunk in chunks:
# print(chunk)

chunks = parser.parse(fnm="../../data/docs/paper/16400599.pdf")
chunks = parser.parse(fnm="../../data/docs/计算所现行制度汇编202406/计算所现行制度汇编202406/综合处/中国科学院计算技术研究所综合安全管理制度_20240531修订版.pdf")
print(chunks)
print(len(chunks))
for chunk in chunks:
Expand Down
8 changes: 5 additions & 3 deletions examples/parsers/textparser_exmaple.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
max_chunk_size=512
)

chunks=text_parser.get_chunks(
filepath="../../data/docs/制度汇编.txt"
# chunks=text_parser.get_chunks(
# filepath="../../data/docs/制度汇编.txt"
# )
chunks = text_parser.get_chunks(
filepath="H:/2024-Xfyun-RAG/data/corpus.txt/corpus.txt"
)

print(len(chunks))

for chunk in chunks:
Expand Down
14 changes: 7 additions & 7 deletions examples/retrievers/bm25sretrever_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,18 @@

from gomate.modules.document.common_parser import CommonParser
from gomate.modules.retrieval.bm25s_retriever import BM25Retriever

from gomate.modules.document.utils import PROJECT_BASE
if __name__ == '__main__':


corpus = []

new_files = [
r'/data/users/searchgpt/yq/GoMate_dev/data/docs/伊朗.txt',
r'/data/users/searchgpt/yq/GoMate_dev/data/docs/伊朗总统罹难事件.txt',
r'/data/users/searchgpt/yq/GoMate_dev/data/docs/伊朗总统莱希及多位高级官员遇难的直升机事故.txt',
r'/data/users/searchgpt/yq/GoMate_dev/data/docs/伊朗问题.txt',
r'/data/users/searchgpt/yq/GoMate_dev/data/docs/新冠肺炎疫情.pdf',
f'{PROJECT_BASE}/data/docs/伊朗.txt',
f'{PROJECT_BASE}/data/docs/伊朗总统罹难事件.txt',
f'{PROJECT_BASE}/data/docs/伊朗总统莱希及多位高级官员遇难的直升机事故.txt',
f'{PROJECT_BASE}/data/docs/伊朗问题.txt',
f'{PROJECT_BASE}/data/docs/汽车操作手册.pdf',
]
parser = CommonParser()
for filename in new_files:
Expand All @@ -32,6 +32,6 @@
index_path="indexs/description_bm25.index",
rebuild_index=True,
corpus=corpus)
query = "新冠疫情"
query = "伊朗总统莱希"
search_docs = bm25_retriever.retrieve(query)
print(search_docs)
4 changes: 2 additions & 2 deletions gomate/modules/document/chunk.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re

from tqdm import tqdm
from gomate.modules.document import rag_tokenizer


Expand Down Expand Up @@ -80,7 +80,7 @@ def chunk_sentences(self, paragraphs, chunk_size):
current_chunk = []
current_chunk_tokens = 0

for sentence in sentences:
for sentence in tqdm(sentences, desc='Chunking'):
tokens = self.tokenizer.tokenize(sentence)
if current_chunk_tokens + len(tokens) <= chunk_size:
current_chunk.append(sentence)
Expand Down
4 changes: 2 additions & 2 deletions gomate/modules/document/pdf_parser_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
@description: coding..
"""
import re
from io import BytesIO

import fitz
from langchain_core.documents.base import Document
from tqdm import tqdm
from io import BytesIO


class PdfParserUsingPyMuPDF():
"""
Expand Down
4 changes: 2 additions & 2 deletions gomate/modules/document/txt_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import chardet

from tqdm import tqdm
from gomate.modules.document.utils import find_codec


Expand All @@ -19,7 +19,7 @@ def parse(self, fnm, from_page=0, to_page=100000, **kwargs):
txt = f.read()

sections = []
for sec in txt.split("\n"):
for sec in tqdm(txt.split("\n"),desc="Parsing"):
sections.append(sec)
return sections

Expand Down
22 changes: 17 additions & 5 deletions gomate/modules/document/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,25 @@
@description: coding..
"""
import os
import pathlib
import re

import tiktoken

PROJECT_BASE = '/data/users/searchgpt/yq/GoMate'
# 获取当前文件所在的路径
current_path = pathlib.Path(__file__).resolve()

# 找到根目录,这里假设项目的根目录为 'GoMate'
project_root = current_path
while project_root.name != 'GoMate':
project_root = project_root.parent

# 在 Windows 中输出带反斜杠的路径
project_root_str = str(project_root)

print(f"项目根目录为: {project_root_str}")

PROJECT_BASE = project_root_str
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
'cp037', 'cp273', 'cp424', 'cp437',
Expand Down Expand Up @@ -123,8 +137,9 @@ def findMaxTm(fnm):
except Exception as e:
pass
return m


# https://stackoverflow.com/questions/76106366/how-to-use-tiktoken-in-offline-mode-computer
import tiktoken_ext.openai_public
tiktoken_cache_dir = "/data/users/searchgpt/yq/GoMate/data/docs"
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
encoder = tiktoken.get_encoding("cl100k_base")
Expand All @@ -139,6 +154,3 @@ def num_tokens_from_string(string: str) -> int:
def truncate(string: str, max_len: int) -> int:
"""Returns truncated text if the length of text exceed max_len."""
return encoder.decode(encoder.encode(string)[:max_len])



16 changes: 9 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ langchain-text-splitters==0.2.1
langchain-huggingface==0.0.3
langchain-openai==0.1.8
langsmith>=0.1.61
transformers == 4.41.1
torch == 2.2.0
pandas == 2.0.0
transformers
pandas
nltk == 3.8.1
sentencepiece==0.2.0
PyPDF2==3.0.1
Expand All @@ -41,8 +40,11 @@ tortoise==0.1.1
python-magic==0.4.27
html_text==0.6.2
readability==0.3.1
PyMuPDF==1.24.5
PyMuPDF
hanziconv==0.3.2
PyPDF2==3.0.1
gradio===3.50.2
loguru==0.5.3
PyPDF2
gradio
loguru
xgboost
bm25s
jieba

0 comments on commit 28f9fa5

Please sign in to comment.