Merge pull request #228 from GautamR-Samagra/embed_chunk_page_number
embed_chunk_page_number
Gautam-Rajeev authored Aug 2, 2023
2 parents ac1a73a + c98f559 commit 8e8d2a9
Showing 4 changed files with 89 additions and 27 deletions.
4 changes: 2 additions & 2 deletions src/chunking/MPNet/local/Dockerfile
@@ -10,6 +10,6 @@ RUN pip3 install -r requirements.txt

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
EXPOSE 7035
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
CMD ["hypercorn", "--bind", "0.0.0.0:7035", "api:app"]
Binary file removed src/chunking/MPNet/local/NEP_Final_English.pdf
34 changes: 32 additions & 2 deletions src/chunking/MPNet/local/api.py
@@ -4,6 +4,22 @@
import aiohttp
import pandas as pd
import io
from PyPDF2 import PdfReader
import os

def extract_text_from_txt(txt_path):
with open(txt_path, 'r', encoding='utf-8') as file:
return file.read()

def extract_text_from_pdf(pdf_path):
reader = PdfReader(pdf_path)
number_of_pages = len(reader.pages)
all_text = ""

for page in reader.pages:
all_text += page.extract_text()

return all_text

app = Quart(__name__)

@@ -25,10 +41,24 @@ async def embed():

if uploaded_file:
print("1- File uploaded")
text_data = uploaded_file.stream.read().decode('utf-8')
req = ModelRequest(text = text_data) # Pass the DataFrame to ModelRequest

if uploaded_file:
file_extension = os.path.splitext(uploaded_file.filename)[1].lower()

if file_extension == '.txt':
text_data = uploaded_file.stream.read().decode('utf-8')
elif file_extension == '.pdf':
pdf_file_stream = io.BytesIO(uploaded_file.stream.read())
reader = PdfReader(pdf_file_stream)
pages = [(i, page.extract_text()) for i, page in enumerate(reader.pages)] # Modified line
text_data = pages
else:
return 'Unsupported file format submitted', 400

req = ModelRequest(text = text_data)
response = await model.inference(req)


else :
req = ModelRequest(**data)
response = await model.inference(req) # Await the coroutine to get the actual response
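For orientation, below is a minimal client sketch of how the updated upload endpoint might be exercised once the container listens on port 7035. The route path (`/embed`), the multipart field name (`file`), and the plain-CSV response body are assumptions; the route decorator and request parsing sit outside the visible diff.

```python
# Hypothetical client for the chunking service; the route path, form field
# name and response format are assumptions not confirmed by the visible diff.
import asyncio
import io

import aiohttp
import pandas as pd


async def chunk_file(path: str, content_type: str = "application/pdf") -> pd.DataFrame:
    async with aiohttp.ClientSession() as session:
        with open(path, "rb") as f:
            form = aiohttp.FormData()
            form.add_field("file", f, filename=path, content_type=content_type)
            async with session.post("http://localhost:7035/embed", data=form) as resp:
                csv_text = await resp.text()
    # The service is expected to return one CSV row per chunk.
    return pd.read_csv(io.StringIO(csv_text))


if __name__ == "__main__":
    df = asyncio.run(chunk_file("NEP_Final_English.pdf"))
    print(df.head())
```

Plain-text uploads follow the same request shape; the `.txt` branch above reads the stream as UTF-8 instead of going through PyPDF2.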
78 changes: 55 additions & 23 deletions src/chunking/MPNet/local/model.py
@@ -15,65 +15,94 @@
from quart import Response

nltk.download('punkt')

class Splitter:
def __init__(self,doc,max_chunk_length):
self.doc = doc
def __init__(self, pages, max_chunk_length):
self.pages = pages
self.sentencesInDoc = []
self.max_chunk_length = max_chunk_length
self.embedding = None
self.docToSentences()

def docToSentences(self):
sentences = sent_tokenize(self.doc)
for sentence in sentences:
currSentenceSplits = textwrap.wrap(sentence.strip(), self.max_chunk_length)
self.sentencesInDoc.extend(currSentenceSplits)
for page_num, page_content in self.pages:
sentences = sent_tokenize(page_content)
for sentence in sentences:
currSentenceSplits = textwrap.wrap(sentence.strip(), self.max_chunk_length)
for split in currSentenceSplits:
self.sentencesInDoc.append((split, page_num))


def getChunksConsideringSentences(self):
## Start a chunk with a single sentence, then keep appending sentences from docToSentences until max_chunk_length is reached
chunks = []
curr_sentence = ''
for sentence in self.sentencesInDoc:
start_page = end_page = None
for sentence, page_num in self.sentencesInDoc:
if len(curr_sentence + sentence) > self.max_chunk_length:
chunks.append(curr_sentence)
chunks.append((curr_sentence, start_page, end_page))
curr_sentence = sentence
start_page = end_page = page_num
else:
curr_sentence = curr_sentence + ' ' + sentence
chunks.append(curr_sentence)
end_page = page_num
if start_page is None:
start_page = page_num
chunks.append((curr_sentence, start_page, end_page))
return chunks
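
To illustrate the new return shape, here is a small sketch that drives the page-aware splitter directly; the import path and the two-page input are assumptions made for the example.

```python
# Illustrative only: assumes model.py (above) is importable from the
# src/chunking/MPNet/local directory; the two "pages" are made up.
from model import Splitter

pages = [
    (0, "The policy was approved in 2020. It replaces the 1986 framework."),
    (1, "It proposes a new curricular structure. Rollout is phased over a decade."),
]

splitter = Splitter(pages, max_chunk_length=80)

# Each chunk now carries the page range it was built from.
for chunk, start_page, end_page in splitter.getChunksConsideringSentences():
    print(f"pages {start_page}-{end_page}: {chunk.strip()!r}")
```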


# taken from https://medium.com/@npolovinkin/how-to-chunk-text-into-paragraphs-using-python-8ae66be38ea6
def getChunksConsideringNeighbouringSimilarity(self,numberOfSentencesToCalculatedWeightedSum=10):
def getChunksConsideringNeighbouringSimilarity(self, numberOfSentencesToCalculatedWeightedSum=10):
encoding_model = SentenceTransformer('all-mpnet-base-v2')

if self.embedding is None:
print('Document Encoding Process :-')
sentences_only = [sentence for sentence, _ in self.sentencesInDoc]
self.embedding = encoding_model.encode(
self.sentencesInDoc,
sentences_only,
show_progress_bar=True,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

similarities = cosine_similarity(self.embedding)
activated_similarities = self.activate_similarities(similarities, p_size=numberOfSentencesToCalculatedWeightedSum)
minimas = argrelextrema(activated_similarities, np.less, order=2)[0].reshape(-1)
return self.getChunksWithMinimas(minimas)

def getChunksWithMinimas(self,minimas):

def getChunksWithMinimas(self, minimas):
chunks = []
chunk_start = 0
chunk_text = ''
start_page = None
end_page = None

for minima in minimas:
mergedSentence = ' '.join(self.sentencesInDoc[chunk_start:minima])
for i in range(chunk_start, minima):
sentence, page_num = self.sentencesInDoc[i]
chunk_text += ' ' + sentence
end_page = page_num
if start_page is None:
start_page = page_num

chunks.append((chunk_text.strip(), start_page, end_page))
chunk_text = ''
start_page = end_page = None
chunk_start = minima
currSentenceSplits = textwrap.wrap(mergedSentence, self.max_chunk_length)
chunks.extend(currSentenceSplits)
if(chunk_start != len(self.sentencesInDoc) - 1):
chunks.extend(self.sentencesInDoc[chunk_start:])

if chunk_start != len(self.sentencesInDoc):
for i in range(chunk_start, len(self.sentencesInDoc)):
sentence, page_num = self.sentencesInDoc[i]
chunk_text += ' ' + sentence
end_page = page_num
if start_page is None:
start_page = page_num
chunks.append((chunk_text.strip(), start_page, end_page))

return chunks
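
For intuition on the similarity-based splitter, a tiny standalone sketch of the boundary-detection step: local minima of the activated similarity curve, found with the same argrelextrema call used above, mark where one chunk ends and the next begins. The similarity values below are invented for illustration.

```python
# Standalone illustration of the boundary-detection step; the similarity
# values are made up, not produced by the MPNet embeddings.
import numpy as np
from scipy.signal import argrelextrema

activated_similarities = np.array([0.90, 0.80, 0.20, 0.70, 0.85, 0.30, 0.75, 0.80])
minimas = argrelextrema(activated_similarities, np.less, order=2)[0]
print(minimas)  # [2 5] -> sentences 0-1, 2-4 and 5-7 end up in separate chunks
```

With order=2, a point only counts as a boundary if it is lower than its two neighbours on each side, which filters out shallow dips in the similarity curve.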



def rev_sigmoid(self,x:float)->float:
return (1 / (1 + math.exp(0.5*x)))

@@ -120,10 +149,13 @@ def __new__(cls, context):

async def inference(self, request: ModelRequest):
# Modify this function according to model requirements such that inputs and outputs remain the same
splitter = Splitter(request.text, 4 * 1024)
pages = request.text
splitter = Splitter(pages, 4 * 1024)
chunks = splitter.getChunksConsideringNeighbouringSimilarity()
df = pd.DataFrame({'content': chunks})
# Convert DataFrame to a CSV string
df = pd.DataFrame({'content': [content for content, _, _ in chunks],
'start_page': [start_page for _, start_page, _ in chunks],
'end_page': [end_page for _, _, end_page in chunks]})

csv_string = df.to_csv(index=False)

# Properly escape the CSV string
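To make the payload change concrete, here is a short sketch of the CSV the inference step now emits, built from two invented chunk tuples.

```python
# Invented chunk tuples standing in for Splitter output; shows the new
# start_page / end_page columns in the returned CSV.
import pandas as pd

chunks = [("Overview of the policy ...", 0, 1), ("Curricular structure ...", 2, 2)]
df = pd.DataFrame({'content': [content for content, _, _ in chunks],
                   'start_page': [start for _, start, _ in chunks],
                   'end_page': [end for _, _, end in chunks]})
print(df.to_csv(index=False))
# content,start_page,end_page
# Overview of the policy ...,0,1
# Curricular structure ...,2,2
```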
