Merge pull request #228 from GautamR-Samagra/embed_chunk_page_number
embed_chunk_page_number
Gautam-Rajeev authored Aug 2, 2023
2 parents ac1a73a + c98f559 commit 8e8d2a9
Showing 4 changed files with 89 additions and 27 deletions.
4 changes: 2 additions & 2 deletions src/chunking/MPNet/local/Dockerfile
@@ -10,6 +10,6 @@ RUN pip3 install -r requirements.txt

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
EXPOSE 7035
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
CMD ["hypercorn", "--bind", "0.0.0.0:7035", "api:app"]
Binary file removed src/chunking/MPNet/local/NEP_Final_English.pdf
34 changes: 32 additions & 2 deletions src/chunking/MPNet/local/api.py
@@ -4,6 +4,22 @@
import aiohttp
import pandas as pd
import io
from PyPDF2 import PdfReader
import os

def extract_text_from_txt(txt_path):
with open(txt_path, 'r', encoding='utf-8') as file:
return file.read()

def extract_text_from_pdf(pdf_path):
reader = PdfReader(pdf_path)
number_of_pages = len(reader.pages)
all_text = ""

for page in reader.pages:
all_text += page.extract_text()

return all_text

app = Quart(__name__)

@@ -25,10 +41,24 @@ async def embed():

if uploaded_file:
print("1- File uploaded")
text_data = uploaded_file.stream.read().decode('utf-8')
req = ModelRequest(text = text_data) # Pass the DataFrame to ModelRequest

if uploaded_file:
file_extension = os.path.splitext(uploaded_file.filename)[1].lower()

if file_extension == '.txt':
text_data = uploaded_file.stream.read().decode('utf-8')
elif file_extension == '.pdf':
pdf_file_stream = io.BytesIO(uploaded_file.stream.read())
reader = PdfReader(pdf_file_stream)
pages = [(i, page.extract_text()) for i, page in enumerate(reader.pages)] # Modified line
text_data = pages
else:
return 'Unsupported file format submitted', 400

req = ModelRequest(text = text_data)
response = await model.inference(req)


else :
req = ModelRequest(**data)
response = await model.inference(req) # Await the coroutine to get the actual response
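For orientation, below is a minimal client sketch of how the updated upload endpoint might be exercised once the container listens on port 7035. The route path (`/embed`), the multipart field name (`file`), and the plain-CSV response body are assumptions; the route decorator and request parsing sit outside the visible diff.

```python
# Hypothetical client for the chunking service; the route path, form field
# name and response format are assumptions not confirmed by the visible diff.
import asyncio
import io

import aiohttp
import pandas as pd


async def chunk_file(path: str, content_type: str = "application/pdf") -> pd.DataFrame:
    async with aiohttp.ClientSession() as session:
        with open(path, "rb") as f:
            form = aiohttp.FormData()
            form.add_field("file", f, filename=path, content_type=content_type)
            async with session.post("http://localhost:7035/embed", data=form) as resp:
                csv_text = await resp.text()
    # The service is expected to return one CSV row per chunk.
    return pd.read_csv(io.StringIO(csv_text))


if __name__ == "__main__":
    df = asyncio.run(chunk_file("NEP_Final_English.pdf"))
    print(df.head())
```

Plain-text uploads follow the same request shape; the `.txt` branch above reads the stream as UTF-8 instead of going through PyPDF2.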
78 changes: 55 additions & 23 deletions src/chunking/MPNet/local/model.py
@@ -15,65 +15,94 @@
from quart import Response

nltk.download('punkt')

class Splitter:
def __init__(self,doc,max_chunk_length):
self.doc = doc
def __init__(self, pages, max_chunk_length):
self.pages = pages
self.sentencesInDoc = []
self.max_chunk_length = max_chunk_length
self.embedding = None
self.docToSentences()

def docToSentences(self):
sentences = sent_tokenize(self.doc)
for sentence in sentences:
currSentenceSplits = textwrap.wrap(sentence.strip(), self.max_chunk_length)
self.sentencesInDoc.extend(currSentenceSplits)
for page_num, page_content in self.pages:
sentences = sent_tokenize(page_content)
for sentence in sentences:
currSentenceSplits = textwrap.wrap(sentence.strip(), self.max_chunk_length)
for split in currSentenceSplits:
self.sentencesInDoc.append((split, page_num))


def getChunksConsideringSentences(self):
## Start a chunk with a single sentence, then keep appending sentences from docToSentences until max_chunk_length is reached
chunks = []
curr_sentence = ''
for sentence in self.sentencesInDoc:
start_page = end_page = None
for sentence, page_num in self.sentencesInDoc:
if len(curr_sentence + sentence) > self.max_chunk_length:
chunks.append(curr_sentence)
chunks.append((curr_sentence, start_page, end_page))
curr_sentence = sentence
start_page = end_page = page_num
else:
curr_sentence = curr_sentence + ' ' + sentence
chunks.append(curr_sentence)
end_page = page_num
if start_page is None:
start_page = page_num
chunks.append((curr_sentence, start_page, end_page))
return chunks
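
To illustrate the new return shape, here is a small sketch that drives the page-aware splitter directly; the import path and the two-page input are assumptions made for the example.

```python
# Illustrative only: assumes model.py (above) is importable from the
# src/chunking/MPNet/local directory; the two "pages" are made up.
from model import Splitter

pages = [
    (0, "The policy was approved in 2020. It replaces the 1986 framework."),
    (1, "It proposes a new curricular structure. Rollout is phased over a decade."),
]

splitter = Splitter(pages, max_chunk_length=80)

# Each chunk now carries the page range it was built from.
for chunk, start_page, end_page in splitter.getChunksConsideringSentences():
    print(f"pages {start_page}-{end_page}: {chunk.strip()!r}")
```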


# taken from https://medium.com/@npolovinkin/how-to-chunk-text-into-paragraphs-using-python-8ae66be38ea6
def getChunksConsideringNeighbouringSimilarity(self,numberOfSentencesToCalculatedWeightedSum=10):
def getChunksConsideringNeighbouringSimilarity(self, numberOfSentencesToCalculatedWeightedSum=10):
encoding_model = SentenceTransformer('all-mpnet-base-v2')

if self.embedding is None:
print('Document Encoding Process :-')
sentences_only = [sentence for sentence, _ in self.sentencesInDoc]
self.embedding = encoding_model.encode(
self.sentencesInDoc,
sentences_only,
show_progress_bar=True,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

similarities = cosine_similarity(self.embedding)
activated_similarities = self.activate_similarities(similarities, p_size=numberOfSentencesToCalculatedWeightedSum)
minimas = argrelextrema(activated_similarities, np.less, order=2)[0].reshape(-1)
return self.getChunksWithMinimas(minimas)

def getChunksWithMinimas(self,minimas):

def getChunksWithMinimas(self, minimas):
chunks = []
chunk_start = 0
chunk_text = ''
start_page = None
end_page = None

for minima in minimas:
mergedSentence = ' '.join(self.sentencesInDoc[chunk_start:minima])
for i in range(chunk_start, minima):
sentence, page_num = self.sentencesInDoc[i]
chunk_text += ' ' + sentence
end_page = page_num
if start_page is None:
start_page = page_num

chunks.append((chunk_text.strip(), start_page, end_page))
chunk_text = ''
start_page = end_page = None
chunk_start = minima
currSentenceSplits = textwrap.wrap(mergedSentence, self.max_chunk_length)
chunks.extend(currSentenceSplits)
if(chunk_start != len(self.sentencesInDoc) - 1):
chunks.extend(self.sentencesInDoc[chunk_start:])

if chunk_start != len(self.sentencesInDoc):
for i in range(chunk_start, len(self.sentencesInDoc)):
sentence, page_num = self.sentencesInDoc[i]
chunk_text += ' ' + sentence
end_page = page_num
if start_page is None:
start_page = page_num
chunks.append((chunk_text.strip(), start_page, end_page))

return chunks
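
For intuition on the similarity-based splitter, a tiny standalone sketch of the boundary-detection step: local minima of the activated similarity curve, found with the same argrelextrema call used above, mark where one chunk ends and the next begins. The similarity values below are invented for illustration.

```python
# Standalone illustration of the boundary-detection step; the similarity
# values are made up, not produced by the MPNet embeddings.
import numpy as np
from scipy.signal import argrelextrema

activated_similarities = np.array([0.90, 0.80, 0.20, 0.70, 0.85, 0.30, 0.75, 0.80])
minimas = argrelextrema(activated_similarities, np.less, order=2)[0]
print(minimas)  # [2 5] -> sentences 0-1, 2-4 and 5-7 end up in separate chunks
```

With order=2, a point only counts as a boundary if it is lower than its two neighbours on each side, which filters out shallow dips in the similarity curve.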



def rev_sigmoid(self,x:float)->float:
return (1 / (1 + math.exp(0.5*x)))

@@ -120,10 +149,13 @@ def __new__(cls, context):

async def inference(self, request: ModelRequest):
# Modify this function according to model requirements such that inputs and outputs remain the same
splitter = Splitter(request.text, 4 * 1024)
pages = request.text
splitter = Splitter(pages, 4 * 1024)
chunks = splitter.getChunksConsideringNeighbouringSimilarity()
df = pd.DataFrame({'content': chunks})
# Convert DataFrame to a CSV string
df = pd.DataFrame({'content': [content for content, _, _ in chunks],
'start_page': [start_page for _, start_page, _ in chunks],
'end_page': [end_page for _, _, end_page in chunks]})

csv_string = df.to_csv(index=False)

# Properly escape the CSV string
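To make the payload change concrete, here is a short sketch of the CSV the inference step now emits, built from two invented chunk tuples.

```python
# Invented chunk tuples standing in for Splitter output; shows the new
# start_page / end_page columns in the returned CSV.
import pandas as pd

chunks = [("Overview of the policy ...", 0, 1), ("Curricular structure ...", 2, 2)]
df = pd.DataFrame({'content': [content for content, _, _ in chunks],
                   'start_page': [start for _, start, _ in chunks],
                   'end_page': [end for _, _, end in chunks]})
print(df.to_csv(index=False))
# content,start_page,end_page
# Overview of the policy ...,0,1
# Curricular structure ...,2,2
```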
