import streamlit as st
from streamlit.delta_generator import DeltaGenerator
from module.llm_client import get_client
from module.conversation import postprocess_text, Conversation, Role, FILE_TEMPLATE
from module.utils import extract_pdf, extract_docx, extract_pptx, extract_text
from module.vector_base import VectorStore
from module.utils import ReadFiles
from module.memory import Abstract, History_Management
from module.PROMPT_TEMPLATE import RAG_PROMPT_TEMPLATE, OTHER_PROMPT_TEMPLATE
from module.tools import web_search_response
import json
from datetime import datetime, timedelta
import random
import base64
import os
import shutil
import tempfile
from uuid import uuid4
# Set up dates, file paths and other shared components
today = datetime.now().strftime('%Y-%m-%d')
now = datetime.now()
yesterday = now - timedelta(days=1)
CHAT_HISTORY_FILE = f'./memory_storage/miao_memory/chat_history/{today}_chat_history.txt'
WEB_SEARCH_WORDS = "./module/web_search_words.txt"
TMP_FOLDER_PATH_DOC = "./memory_storage/rag_memory/tmp"
TIME_CHINESE_FORMAT = "{0}年{1}月{2}日 {3}时{4}分{5}秒".format(
    now.year, now.month, now.day, now.hour, now.minute, now.second
)
CONFIG_PATH = './module/config.json'
with open(WEB_SEARCH_WORDS, "r", encoding="utf-8") as f:
    web_search_words = f.read().split("\n")
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)
Miao_Name = config["Miao_Name"]
Miao_Nick_Name = config["Miao_Nick_Name"]
Miao_Info_Brief = config["Miao_Info_Brief"]
User_Identity = config["User_Identity"]
eco_mode = config["ECO_MODE"]
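# Economy mode falls back to the lighter glm-4-flash model; otherwise use the long-context glm-4-long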
if eco_mode:
MODEL_LONG = "glm-4-flash"
else:
MODEL_LONG = "glm-4-long"
client = get_client(MODEL_LONG)
Abstractor = Abstract()
History_Manager = History_Management()
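# Remove any files left in the temporary document folder from a previous session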
def tmp_folder_clean():
for item in os.listdir(TMP_FOLDER_PATH_DOC):
item_path = os.path.join(TMP_FOLDER_PATH_DOC, item)
if os.path.isfile(item_path):
os.remove(item_path)
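# Append the generated document abstract to today's chat history log, prefixed with the current time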
def save_abstract_to_chat_history(abstract):
    with open(CHAT_HISTORY_FILE, "a", encoding="utf-8") as f:
f.write(TIME_CHINESE_FORMAT + "\n")
f.write("[文档模式]漆小喵:" + abstract + "\n")
def main(
uploaded_files,
prompt_text: str,
system_prompt: str,
top_p: float = 0.8,
temperature: float = 0.95,
repetition_penalty: float = 1.0,
max_tokens: int = 4000,
retry: bool = False,
clear_history: bool = False,
):
tmp_folder_clean()
col1, col2 = st.columns([3, 2])
with col1:
if "uploader_key" not in st.session_state:
st.session_state.uploader_key = str(random.randint(1000, 100000000))
if "files_uploaded" not in st.session_state:
st.session_state.files_uploaded = False
if "generate_abstract" not in st.session_state:
st.session_state.generate_abstract = False
if "session_id" not in st.session_state:
st.session_state.session_id = uuid4()
placeholder = st.empty()
with placeholder.container():
if 'chat_history' not in st.session_state:
st.session_state.chat_history = []
if 'document_history' not in st.session_state:
st.session_state.document_history = []
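            # An empty prompt with no retry request resets both histories and stops here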
if prompt_text == "" and retry == False:
print("\n== Clean ==\n")
st.session_state.chat_history = []
st.session_state.document_history = []
return
history: list[Conversation] = st.session_state.chat_history
document_history = st.session_state.document_history
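            # On retry, rewind the history to the last user message and resubmit that prompt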
if retry:
print("\n== Retry ==\n")
last_user_conversation_idx = None
for idx, conversation in enumerate(history):
if conversation.role == Role.USER:
last_user_conversation_idx = idx
if last_user_conversation_idx is not None:
prompt_text = history[last_user_conversation_idx].content
del history[last_user_conversation_idx:]
first_round = len(st.session_state.chat_history) == 0
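            # Freshly uploaded files are summarised once; the flags prevent re-processing on every rerun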
if uploaded_files and not st.session_state.files_uploaded:
st.session_state.generate_abstract = True
if st.session_state.generate_abstract:
with st.spinner(f"{Miao_Nick_Name}已接收到文档啦,{User_Identity}别着急,让{Miao_Nick_Name}好好读一读哦~"):
uploaded_texts = []
file_name_list = []
for uploaded_file in uploaded_files:
file_name = uploaded_file.name
tmp_file_path = os.path.join(TMP_FOLDER_PATH_DOC,file_name)
if not os.path.exists(tmp_file_path):
with open(tmp_file_path,"wb")as f:
f.write(uploaded_file.getbuffer())
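                        # Pick a text extractor based on the file extension; anything unrecognised is read as plain text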
if file_name.endswith(".pdf"):
content = extract_pdf(tmp_file_path)
elif file_name.endswith(".docx"):
content = extract_docx(tmp_file_path)
elif file_name.endswith(".pptx"):
content = extract_pptx(tmp_file_path)
else:
content = extract_text(tmp_file_path)
uploaded_texts.append(
FILE_TEMPLATE.format(file_name=file_name, file_content=content)
)
file_name_list.append(tmp_file_path)
st.session_state.uploaded_texts = "\n\n".join(uploaded_texts)
uploaded_texts = st.session_state.get("uploaded_texts")
uploaded_texts = "<文章开始>" + uploaded_texts + "<文章结束>" + "文章已上传!"
History_Manager.append_conversation(Conversation(Role.USER, uploaded_texts), document_history,save_and_show=False)
print(history)
print(document_history)
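                # Documents up to 100,000 characters are summarised in one pass; longer ones use map-reduce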
with st.spinner(f"{Miao_Nick_Name}正在进行文档摘要!马上就给{User_Identity}送上一份,嘻嘻~"):
if len(uploaded_texts) <= 100000:
print("文档长度不足100000字,启动默认摘要模式")
prompt = RAG_PROMPT_TEMPLATE["RAG_abstract_prompt_template"].format(
Miao_Info_Brief = Miao_Info_Brief,
text = uploaded_texts,
)
file_abstract = Abstractor.default_abstract(model_name=MODEL_LONG, prompt=prompt)
else:
print("文本长度超过100000字,启动map_reduce摘要模式")
prompt_template = RAG_PROMPT_TEMPLATE["RAG_abstract_prompt_template"].format(
Miao_Info_Brief = Miao_Info_Brief,
text = "{text}"
)
file_abstract = Abstractor.map_reduce_abstract(
prompt_template = prompt_template,
text = uploaded_texts)
History_Manager.append_conversation(
conversation=Conversation(Role.ASSISTANT,postprocess_text(file_abstract),),
history=document_history,
save_and_show=False)
History_Manager.append_conversation(
conversation=Conversation(Role.ASSISTANT,postprocess_text(file_abstract),),
history=history,
save_and_show=False)
save_abstract_to_chat_history(file_abstract)
st.session_state.files_uploaded = True
st.session_state.uploaded_texts = ""
st.session_state.generate_abstract = False
for conversation in history:
conversation.show()
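            # Handle a new user prompt: record it in both histories before generating a reply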
if prompt_text:
prompt_text = prompt_text.strip()
History_Manager.append_conversation(
conversation=Conversation(Role.USER, prompt_text),
history=document_history,
save_and_show=True,
document_mode=True)
History_Manager.append_conversation(
conversation=Conversation(Role.USER, prompt_text),
history=history,
save_and_show=False)
time_prompt = OTHER_PROMPT_TEMPLATE["Time_Prompt"].format(time = TIME_CHINESE_FORMAT)
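                # If the prompt contains a web-search trigger word, resolve relative dates, fetch web results and prepend them to the latest message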
if any(word in prompt_text for word in web_search_words):
with st.spinner(f"耶嘿,{Miao_Nick_Name}正在启动网页搜索功能!"):
print("已启动网页搜索")
prompt_text = prompt_text.replace("今天", "{0}年{1}月{2}日".format(now.year, now.month, now.day))
prompt_text = prompt_text.replace("昨天", "{0}年{1}月{2}日".format(yesterday.year, yesterday.month, yesterday.day))
web_info = web_search_response(prompt_text).strip()
history[-1].content = web_info + time_prompt + history[-1].content
with st.expander(f"{User_Identity},{Miao_Nick_Name}在网上找到了这个",expanded=False,icon="📨"):
web_info_show = web_info.replace("<web_info>\n已触发联网搜索,以下是联网搜索返回结果,如果与用户提问有关,请结合搜索答案回答,如果无关,请使用模型自身能力回答,不要杜撰任何内容。","")
web_info_show = web_info_show.replace("<web_info/>\n关于这个网页信息的问题:","")
st.write(web_info_show)
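                # Stream the assistant reply token by token, rendering partial output as it arrives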
with st.spinner(f"别着急哦,{Miao_Name}的脑瓜子已经转起来啦~"):
placeholder = st.empty()
message_placeholder = placeholder.chat_message(name="assistant", avatar="😺")
markdown_placeholder = message_placeholder.empty()
output_text = ''
for response in client.generate_stream(
system_prompt,
tools=None,
history=document_history,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
):
token = response.token
if response.token.special:
print("\n==Output:==\n", output_text)
match token.text.strip():
case '<|user|>':
break
case _:
st.error(f'Unexpected special token: {token.text.strip()}')
break
output_text += response.token.text
markdown_placeholder.markdown(postprocess_text(output_text + '▌'))
# print(history)
History_Manager.append_conversation(
conversation=Conversation(Role.ASSISTANT,postprocess_text(output_text),),
history=document_history,
placeholder=markdown_placeholder,
save_and_show=True,
document_mode=True)
History_Manager.append_conversation(
conversation=Conversation(Role.ASSISTANT,postprocess_text(output_text),),
history=history,
save_and_show=False)
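            # Debug output: print the final history and the lengths reported by History_Manager for both histories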
print(history)
print("history_length:")
print(History_Manager.count_history(history))
print("document_history_length")
print(History_Manager.count_history(document_history))