Pipeline #64

Merged · merged 2 commits on Sep 23, 2024
Changes from all commits
3 changes: 2 additions & 1 deletion .gitignore
@@ -21,4 +21,5 @@ data/huqie.zip
 data/nltk_data.zip
 examples/retrievers/indexs
 output
-examples/rag/indexs
+examples/rag/indexs
+examples/rag/mobile_rag.py
11 changes: 6 additions & 5 deletions gomate/modules/citation/match_citation.py
@@ -64,6 +64,7 @@ def ground_response(
             ]
         },
         """
+        print(selected_docs)
        sentences = self.cut(response)
        final_response = []
        selected_idx = [i - 1 for i in selected_idx]
@@ -96,14 +97,14 @@ def ground_response(
             highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence,evidence)
             quote_list.append(
                 {
-                    "doc_id": 90564,  # file id
-                    "chk_id": best_idx,  # chunk index (0-based)
+                    "doc_id": selected_docs[i]["doc_id"],  # file id
+                    "chk_id": selected_docs[i]["chk_id"],  # chunk index (0-based)
                     # not returned for knowledge sets without in-text provenance
-                    "doc_source": "新闻来源",
+                    "doc_source": selected_docs[i]["newsinfo"]["source"],
                     # news date; not returned for knowledge sets without in-text provenance
-                    "doc_date": "2021-10-19",
+                    "doc_date": selected_docs[i]["newsinfo"]["date"],
                     # not returned for knowledge sets without in-text provenance
-                    "doc_title": "新闻标题",
+                    "doc_title": selected_docs[i]["newsinfo"]["title"],
                     # not returned for knowledge sets without in-text provenance
                     "chk_content": evidence,
                     "best_ratio": best_ratio,
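
The replacement fields assume each selected_docs entry carries its own metadata rather than the hard-coded placeholders. A minimal sketch of the shape this lookup expects, with key names taken from the diff and every value hypothetical:

    # One hypothetical selected_docs entry with the keys the new code reads:
    # doc_id, chk_id, and a nested newsinfo dict with source/date/title.
    selected_docs = [
        {
            "doc_id": 90564,
            "chk_id": 3,
            "newsinfo": {
                "source": "Example News",     # hypothetical value
                "date": "2021-10-19",         # hypothetical value
                "title": "Example headline",  # hypothetical value
            },
        },
    ]
    doc = selected_docs[0]
    print(doc["doc_id"], doc["newsinfo"]["source"])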
2 changes: 1 addition & 1 deletion gomate/modules/document/common_parser.py
@@ -45,5 +45,5 @@ def parse(self, file_path):
                 "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
         contents = parser.parse(content)
         # loguru.logger.info(contents)
-        contents = self.tc.chunk_sentences(contents, chunk_size=512)
+        # contents = self.tc.chunk_sentences(contents, chunk_size=512)
         return contents
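
With the chunk_sentences call commented out, parse() now returns the raw parser output, so any fixed-size splitting has to happen downstream. A minimal stand-in for the disabled step, assuming contents is a list of sentence strings (an illustrative greedy packer, not the project's own chunker):

    def chunk_sentences(sentences, chunk_size=512):
        # Greedily pack sentences into chunks of at most chunk_size characters.
        chunks, current = [], ""
        for s in sentences:
            if current and len(current) + len(s) > chunk_size:
                chunks.append(current)
                current = s
            else:
                current += s
        if current:
            chunks.append(current)
        return chunks

    contents = ["First sentence.", "Second sentence.", "A third one."]
    print(chunk_sentences(contents, chunk_size=20))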
2 changes: 1 addition & 1 deletion gomate/modules/document/pdf_parser.py
@@ -1173,5 +1173,5 @@ def remove_tag(txt):

 if __name__ == "__main__":
     pp=PdfSimParser()
-    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/docs/新冠肺炎疫情.pdf')
+    contents=pp.parse('/data/users/searchgpt/yq/GoMate_dev/data/competitions/df/A_document/AZ06.pdf')
     print(contents)
4 changes: 2 additions & 2 deletions gomate/modules/document/rag_tokenizer.py
@@ -26,10 +26,10 @@

 class RagTokenizer:
     def key_(self, line):
-        return str(line.lower().encode("utf-8"))[2:-1]
+        return str(line.lower().encode("utf-8", 'ignore'))[2:-1]

     def rkey_(self, line):
-        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
+        return str(("DD" + (line[::-1].lower())).encode("utf-8", 'ignore'))[2:-1]

     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)
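
Passing errors='ignore' only changes behavior for text UTF-8 refuses to encode, chiefly lone surrogates left behind by lossy decoding of PDFs and web pages; without it, key_ raises UnicodeEncodeError on such input. A standalone sketch of the behavior:

    def key_(line):
        # str(bytes) renders "b'...'", so [2:-1] strips the b' prefix and the
        # trailing quote, leaving an escaped-ASCII trie key.
        return str(line.lower().encode("utf-8", "ignore"))[2:-1]

    print(key_("GoMate"))         # gomate
    print(key_("新闻"))           # \xe6\x96\xb0\xe9\x97\xbb
    print(key_("bad\ud800text"))  # badtext -- the lone surrogate is dropped
    # With plain .encode("utf-8") the last call raises UnicodeEncodeError.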
20 changes: 17 additions & 3 deletions gomate/modules/generator/llm.py
@@ -71,7 +71,19 @@
 Question: {question}

 Related material: {context}
-"""
+""",
+    DF_PROMPT_TEMPLATE="""Answer the user's question using the referenced context, ensuring the answer is accurate, comprehensive, and authoritative. If the context cannot support the question or contains no relevant information, state clearly that the question cannot be answered; do not fabricate information.
+Output only the answer, including key terms where possible; do not add extra content, over-explain, or append unrelated or ornamental text.
+
+If the given context does not let you answer, reply exactly "Cannot answer." and output nothing else.
+
+Question: {question}
+Context for reference:
+···
+{context}
+···
+Concise and accurate answer:
+""",

 )
@@ -153,8 +165,10 @@ def chat(self, prompt: str, history: List = [], content: str = '', llm_only: bool
         if llm_only:
             prompt = prompt
         else:
-            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE'].format(question=prompt, context=content)
+            prompt = PROMPT_TEMPLATE['Xunfei_PROMPT_TEMPLATE2'].format(question=prompt, context=content)
+        prompt=prompt.encode("utf-8", 'ignore').decode('utf-8','ignore')
         print(prompt)
+
         inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": prompt}],
                                                     add_generation_prompt=True,
                                                     tokenize=True,
@@ -163,7 +177,7 @@
                                                     )

         inputs = inputs.to('cuda')
-        gen_kwargs = {"max_length": 16000, "do_sample": False, "top_k": 1}
+        gen_kwargs = {"max_length": 20000, "do_sample": False, "top_k": 1}
         with torch.no_grad():
             outputs = self.model.generate(**inputs, **gen_kwargs)
             outputs = outputs[:, inputs['input_ids'].shape[1]:]
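
The added encode/decode round trip drops anything UTF-8 cannot represent (again, mainly lone surrogates from upstream parsing) before the prompt reaches the tokenizer, matching the rag_tokenizer change, while raising max_length to 20000 leaves more room for retrieved context. A standalone sketch of the sanitizing round trip:

    raw = "question about \ud800 broken text"
    clean = raw.encode("utf-8", "ignore").decode("utf-8", "ignore")
    print(clean)  # "question about  broken text" -- the surrogate is gone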
82 changes: 41 additions & 41 deletions requirements.txt
@@ -1,47 +1,47 @@
-tqdm >= 4.23.4
-hyperopt >= 0.1.1
-h5py >= 2.8.0
-coverage >= 4.3.4
-codecov >= 2.0.15
-pytest >= 3.7.4
-pytest-cov >= 2.4.0
-flake8 == 7.0.0
-flake8_docstrings == 1.7.0
-pydocstyle == 2.1
-openai == 1.33.0
-datasets == 2.16.1
-langchain==0.2.0
-langchain-core==0.2.5
-langchain-text-splitters==0.2.1
-langchain-huggingface==0.0.3
-langchain-openai==0.1.8
-langsmith>=0.1.61
+tqdm
+hyperopt
+h5py
+coverage
+codecov
+pytest
+pytest-cov
+flake8
+flake8_docstrings
+pydocstyle
+openai
+datasets
+langchain
+langchain-core
+langchain-text-splitters
+langchain-huggingface
+langchain-openai
+langsmith
 transformers
 pandas
-nltk == 3.8.1
-sentencepiece==0.2.0
-PyPDF2==3.0.1
-html2text==2024.2.26
-beautifulsoup4==4.12.3
-faiss-cpu==1.8.0
-umap-learn==0.5.5
-sentence_transformers==3.0.0
-threadpoolctl==3.5.0
-PyMuPDF==1.24.5
-hanziconv==0.3.2
-datrie==0.8.2
-xpinyin==0.7.6
-python-pptx==0.6.23
-pdfplumber==0.11.0
-readability==0.3.1
-html_text==0.6.2
-python-docx==1.1.2
-tortoise==0.1.1
-python-magic==0.4.27
-html_text==0.6.2
-readability==0.3.1
+nltk
+sentencepiece
+PyPDF2
+html2text
+beautifulsoup4
+faiss-cpu
+umap-learn
+sentence_transformers
+threadpoolctl
+PyMuPDF
+hanziconv
+datrie
+xpinyin
+python-pptx
+pdfplumber
+readability
+html_text
+python-docx
+tortoise
+python-magic
+html_text
+readability
 PyMuPDF
-hanziconv==0.3.2
+hanziconv
 PyPDF2
 gradio
 loguru
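
Removing every version pin means pip now resolves each package to its latest compatible release, which eases installation but gives up reproducible environments. If a known-good set is needed later, pip's constraints mechanism can reapply pins without editing requirements.txt; a sketch reusing a few of the pins removed above (any subset works):

    # constraints.txt -- optional pins, taken from the versions removed above
    langchain==0.2.0
    faiss-cpu==1.8.0
    sentence_transformers==3.0.0

    # install with the constraints applied:
    # pip install -r requirements.txt -c constraints.txt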