Merge pull request #236 from wangxinbiao/main
feat:chunks pdf files to generate Q&A
Showing 12 changed files with 2,074 additions and 146 deletions.
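The feature merged here extracts the text of a PDF, splits it into chunks, and feeds each chunk to a Q&A generator. As a rough, standalone illustration of the extract-and-chunk step that the new pdf_handle module below is built around, here is a minimal sketch; the file name, chunk sizes, and the final print are placeholders, and it assumes pypdf, langchain, and the spaCy zh_core_web_sm model are installed.

```python
# Minimal sketch of the extract-and-chunk step used by the new pdf_handle module.
# "sample.pdf" and the chunk sizes are illustrative placeholders.
from pypdf import PdfReader
from langchain.text_splitter import SpacyTextSplitter

# 1) Extract the raw text from every page of the PDF.
reader = PdfReader("sample.pdf")
content = "".join(page.extract_text() for page in reader.pages)

# 2) Split the text into overlapping chunks sized for Q&A generation.
splitter = SpacyTextSplitter(
    separator="\n\n",
    pipeline="zh_core_web_sm",  # Chinese pipeline, as in pdf_handle.py
    chunk_size=500,             # placeholder; pdf_handle.py takes this from config or the request
    chunk_overlap=50,           # placeholder
)
chunks = splitter.split_text(content)
print(f"{len(chunks)} chunks ready for Q&A generation")
```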
data-processing/data_manipulation/file_handle/pdf_handle.py
237 changes: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
# Copyright 2023 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

###
# PDF file handling
# @author: wangxinbiao
# @date: 2023-11-01 16:43:01
# modify history
# ==== 2023-11-01 16:43:01 ====
# author: wangxinbiao
# content:
# 1) basic implementation
###

import logging
import os
import pandas as pd

from common import config
from file_handle import csv_handle
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import SpacyTextSplitter
from pypdf import PdfReader
from transform.text import clean_transform, privacy_transform, QA_transform
from utils import file_utils


logger = logging.getLogger('pdf_handle')

###
# text data processing
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) basic implementation
###


async def text_manipulate(request, opt={}):
    logger.info("pdf text manipulate!")

    """
    Processing logic:
    While handling a piece of data, if any single step (e.g. removing invisible
    characters) fails, processing stops immediately and the whole file is
    treated as failed.
    """

    try:

        file_name = opt['file_name']
        support_type = opt['support_type']

        pdf_file_path = await file_utils.get_temp_file_path()
        file_path = pdf_file_path + 'original/' + file_name

        # Extract the text content of the PDF file
        content = await get_content({
            "file_path": file_path
        })

        logger.info("start text manipulate!")

        # Data cleaning
        clean_result = await data_clean({
            'support_type': support_type,
            'file_name': file_name,
            'data': content
        })

        if clean_result['status'] != 200:
            return clean_result

        content = clean_result['data']

        # Privacy removal

        # Q&A splitting
        if 'qa_split' in support_type:
            qa_data = await generate_QA(request, {
                'support_type': support_type,
                'data': content
            })

            # Save the generated Q&A data as a CSV file
            new_file_name = await file_utils.get_file_name({
                'file_name': file_name,
                'handle_name': 'final'
            })

            file_name_without_extension = file_name.rsplit('.', 1)[0]

            await csv_handle.save_csv({
                'file_name': file_name_without_extension + '.csv',
                'phase_value': 'final',
                'data': qa_data
            })

        return {
            'status': 200,
            'message': '',
            'data': ''
        }
    except Exception as ex:
        return {
            'status': 400,
            'message': str(ex),
            'data': ''
        }

###
# abnormal data cleaning
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) basic implementation
###


async def data_clean(opt={}):
    logger.info("pdf text data clean start!")
    support_type = opt['support_type']
    data = opt['data']

    # Remove invisible characters
    if 'remove_invisible_characters' in support_type:
        result = await clean_transform.remove_invisible_characters({
            'text': data
        })

        if result['status'] != 200:
            return {
                'status': 400,
                'message': 'failed to remove invisible characters',
                'data': ''
            }

        data = result['data']

    logger.info("pdf text data clean stop!")

    return {
        'status': 200,
        'message': '',
        'data': data
    }

###
# extract PDF content
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) basic implementation
###


async def get_content(opt={}):
    file_path = opt['file_path']

    reader = PdfReader(file_path)
    number_of_pages = len(reader.pages)
    pages = reader.pages
    content = ""
    for page in pages:
        content += page.extract_text()

    return content

###
# Q&A splitting
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) basic implementation
###


async def generate_QA(request, opt={}):
    request_json = request.json

    # Split the text into chunks
    chunk_size = config.knowledge_chunk_size
    if "chunk_size" in request_json:
        chunk_size = request_json['chunk_size']

    chunk_overlap = config.knowledge_chunk_overlap
    if "chunk_overlap" in request_json:
        chunk_overlap = request_json['chunk_overlap']

    separator = "\n\n"

    text_splitter = SpacyTextSplitter(
        separator=separator,
        pipeline="zh_core_web_sm",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_text(opt['data'])

    # Generate Q&A pairs
    qa_list = [['q', 'a']]

    for item in texts:
        text = item.replace("\n", "")
        data = await QA_transform.generate_QA({
            'text': text
        })

        qa_list.extend(data)

    return qa_list
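For context, here is a hedged sketch of how another component might drive this new entry point. The SimpleNamespace request stand-in, the option values, and the file name are illustrative assumptions, not the service's actual call path; text_manipulate only reads request.json, so any object exposing that attribute works for a local test, and the named PDF must already sit under the temp directory's original/ folder.

```python
# Hypothetical caller sketch for pdf_handle.text_manipulate (illustrative only).
import asyncio
from types import SimpleNamespace

from file_handle import pdf_handle


async def main():
    # Stand-in for the framework request: only request.json is consulted.
    request = SimpleNamespace(json={"chunk_size": 500, "chunk_overlap": 50})

    result = await pdf_handle.text_manipulate(request, {
        "file_name": "sample.pdf",  # placeholder; must exist under <temp>/original/
        "support_type": ["remove_invisible_characters", "qa_split"],
    })
    print(result["status"], result["message"])


asyncio.run(main())
```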