Merge pull request #485 from Team-WeQuiz/feat/#354
[ML] JDK error on concurrent room creation requests
noooey authored May 20, 2024
2 parents ba41c78 + e19f23c commit 0c3ef4d
Showing 3 changed files with 16 additions and 17 deletions.
2 changes: 1 addition & 1 deletion model/app/app.py
@@ -249,7 +249,7 @@ async def generate(generate_request: GenerateRequest):
res = results[1]

# Extract keywords
- keywords = extract_keywords(keyword_split_docs, top_n=min(generate_request.num_of_quiz * 2, len(sentences) - 1))  # Generate keywords with some slack in the count.
+ keywords = await extract_keywords_async(keyword_split_docs, top_n=min(generate_request.num_of_quiz * 2, len(sentences) - 1))  # Generate keywords with some slack in the count.
log('info', f'[app.py > quiz] Extracted Keywords: {keywords}')
# queries = extract_concept_relationships(sentences, keywords, min(generate_request.num_of_quiz * 2, len(sentences) - 1))
# log('info', f'[app.py > quiz] Extracted Seed Queries: {len(queries)}')
8 changes: 7 additions & 1 deletion model/app/data/keyword.py
@@ -1,4 +1,5 @@
import string
+ import asyncio
from langdetect import detect
from konlpy.tag import Mecab
from nltk.tokenize import word_tokenize
@@ -52,4 +53,9 @@ def extract_keywords(split_doc_list, top_n):
except InsufficientException as e:
raise e
except Exception as e:
- raise Exception(f"An unexpected error occurred during keyword extraction: {str(e)}")
+ raise Exception(f"An unexpected error occurred during keyword extraction: {str(e)}")


+ async def extract_keywords_async(split_doc_list, top_n):
+     loop = asyncio.get_running_loop()
+     return await loop.run_in_executor(None, extract_keywords, split_doc_list, top_n)
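
One design note, hedged because the thread-safety of the extraction stack is not established here: loop.run_in_executor(None, ...) uses a shared multi-worker ThreadPoolExecutor, so two requests can still run extract_keywords simultaneously on different threads. If the underlying tagger (for example, a JVM-backed KoNLPy tagger) turned out not to be thread-safe, a hypothetical single-worker executor would serialize the calls while still keeping the event loop free:

```python
# Hypothetical variant, not part of this commit: at most one keyword
# extraction runs at a time, but the event loop is never blocked.
import asyncio
from concurrent.futures import ThreadPoolExecutor

_keyword_executor = ThreadPoolExecutor(max_workers=1)


async def extract_keywords_serialized(split_doc_list, top_n):
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        _keyword_executor, extract_keywords, split_doc_list, top_n
    )
```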
23 changes: 8 additions & 15 deletions model/app/data/preprocessor.py
@@ -155,13 +155,16 @@ async def parse(self, user_id, timestamp):


class TextSplitter():
+ def __init__(self):
+     self.okt = Okt()
+     self.nlp = spacy.blank("en")
+     self.nlp.add_pipe('sentencizer')

def split_sentences(self, total_text):
lang = detect(total_text[:100])
if lang == "ko":
# Analyze total_text into morphemes with Okt
- okt = Okt()
- morphs = okt.morphs(total_text, stem=True)

+ morphs = self.okt.morphs(total_text, stem=True)
# Split into sentences based on the morphological analysis
sentences = []
current_sentence = []
@@ -174,55 +177,45 @@ def split_sentences(self, total_text):
sentences.append(' '.join(current_sentence))
else:
# Split total_text into sentences with spaCy
- nlp = spacy.blank(lang)
- nlp.add_pipe('sentencizer')
- doc = nlp(total_text)
+ doc = self.nlp(total_text)
sentences = [sent.text.strip() for sent in doc.sents]

log('info', f'[preprocessor.py > TextSplitter] splitted_sentences: {len(sentences)}')

return sentences

def split_docs(self, sentences, chunk_size, sentence_overlap):
chunks = []
current_chunk = []
current_chunk_length = 0

for sentence in sentences:
# Preprocess the sentence
sentence = remove_urls(sentence)
sentence, is_valid = is_valid_doc(sentence)

# Stop building chunks when is_valid_doc detects an invalids keyword
if not is_valid:
break

# Add the sentence to the current chunk
if len(sentence.strip()) > MIN_SENTENCE_LENGTH:
sentence_length = len(sentence.strip())

# If the current chunk length plus the new sentence length exceeds chunk_size
if current_chunk_length + sentence_length > chunk_size:
# Append the current chunk to chunks
chunk = " ".join(current_chunk)
chunks.append(Document(chunk))

# Start a new chunk
if sentence_overlap > 0:
current_chunk = current_chunk[-sentence_overlap:]
else:
current_chunk = []
current_chunk_length = sum(len(s) for s in current_chunk)

# Add the sentence to the current chunk
current_chunk.append(sentence)
current_chunk_length += sentence_length

# Append the last chunk
if current_chunk:
chunk = " ".join(current_chunk)
if len(chunk.strip()) > 3:
chunks.append(Document(chunk))

log('info', f'[preprocessor.py > TextSplitter] splitted_chunks: {len(chunks)}')
return chunks
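
The preprocessor change is the part that most plausibly addresses the JDK error in the commit title, on one reading of it: KoNLPy taggers such as Okt start a JVM (via JPype) when constructed, so two concurrent requests that each built their own Okt() could race on JVM startup, while constructing it once in __init__ starts the JVM exactly once, before any concurrent traffic. A self-contained sketch of that check-then-act race, with FakeOkt as an illustrative stand-in (the real error text and mechanics may differ):

```python
import threading
import time

_jvm_started = False


class FakeOkt:
    """Stand-in for konlpy.tag.Okt, whose constructor starts a JVM."""

    def __init__(self):
        global _jvm_started
        if not _jvm_started:      # both threads can pass this check...
            time.sleep(0.01)      # ...before either finishes starting
            if _jvm_started:
                # Mirrors a "JVM is already started" style failure.
                raise RuntimeError("JVM is already started")
            _jvm_started = True

    def morphs(self, text):
        return text.split()


def handle_request(i):
    FakeOkt().morphs(f"req {i}")  # before the fix: one tagger per request


threads = [threading.Thread(target=handle_request, args=(i,)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# One thread typically errors out. Hoisting construction into __init__,
# as this diff does, makes startup happen once and removes the race.
```

One observable behavior change visible in the diff itself: the old path built spacy.blank(lang) for whatever language langdetect reported, while the refactored path always reuses the single spacy.blank("en") sentencizer pipeline.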

