From 956baf71876dc99b96abb4c58e5fc63f0bdb84db Mon Sep 17 00:00:00 2001
From: one-Alive
Date: Thu, 7 Nov 2024 13:22:02 +0530
Subject: [PATCH] nlp

---
 Extractive Text Summarization/readme.md  |  87 +++++++++++
 .../requirements.txt                     |   4 +
 Extractive Text Summarization/summary.py | 139 ++++++++++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 Extractive Text Summarization/readme.md
 create mode 100644 Extractive Text Summarization/requirements.txt
 create mode 100644 Extractive Text Summarization/summary.py

diff --git a/Extractive Text Summarization/readme.md b/Extractive Text Summarization/readme.md
new file mode 100644
index 0000000..c60f9dd
--- /dev/null
+++ b/Extractive Text Summarization/readme.md
@@ -0,0 +1,87 @@
+
+# πŸ“ Advanced Extractive Text Summarization Model
+
+Welcome to the **Advanced Extractive Text Summarization Model**! This project uses **Natural Language Processing (NLP)** techniques to automatically distill the essential points from lengthy content, making it an invaluable tool for handling reports, research papers, news articles, and more.
+
+## πŸš€ Project Overview
+
+This model leverages NLP to:
+- **Extract key sentences** from a body of text.
+- **Score sentences** by importance using features such as **TF-IDF**, sentence length, position, and the presence of named entities.
+- **Cluster related sentences** via **K-means** to highlight critical points from each thematic group.
+
+### Why It Matters
+In today's information-dense world, quickly grasping the critical points of long documents is essential. This model saves time and boosts productivity by producing concise summaries while preserving the core insights.
+
+---
+
+## πŸ“Š Features
+
+1. **Preprocessing**
+   - Cleans and prepares the text for effective summarization.
+
+2. **Scoring & Ranking**
+   - Scores sentences based on TF-IDF, sentence structure, and key entities.
+
+3. **Clustering & Key Point Extraction**
+   - Uses K-means clustering to group sentences by topic and selects a key sentence from each group.
+
+4. **Summary Generation**
+   - Combines the top-ranked sentences into a coherent, impactful summary.
+
+---
+
+## πŸ”§ How It Works
+
+1. **Data Preprocessing:** Initial cleaning (stray characters and extra whitespace are removed) and stemming; stop words are filtered later, during TF-IDF scoring.
+2. **Sentence Scoring:** Uses TF-IDF weights, sentence length and position, and named-entity recognition to evaluate sentence importance.
+3. **K-means Clustering:** Groups related sentences to capture the distinct topics within the text.
+4. **Summarization:** Selects the top-scored sentences for the summary and reports one representative sentence per cluster as a key point.
+
+---
+
+## πŸ› οΈ Installation
+
+1. **Clone the Repository:**
+   ```bash
+   git clone https://github.com/one-alive/extractive_text_summarization.git
+   cd extractive_text_summarization
+   ```
+
+2. **Install Dependencies:**
+   ```bash
+   pip install -r requirements.txt
+   ```
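+
+3. **Download NLTK Tokenizer Data (if needed):** `sent_tokenize` relies on NLTK's Punkt models, which are not bundled with the package. A minimal, one-time download (the resource is named `punkt_tab` on recent NLTK releases and `punkt` on older ones):
+   ```python
+   import nltk
+   nltk.download('punkt')      # sentence tokenizer models used by sent_tokenize
+   nltk.download('punkt_tab')  # newer NLTK versions look for this resource instead
+   ```
+   The spaCy model `en_core_web_sm` is downloaded automatically on first run if it is missing.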
+
+---
+
+## ▢️ Usage
+
+1. **Run the Model on a Sample Text:**
+   ```bash
+   python summary.py
+   ```
+
+2. **Adjust Parameters:** You can tune parameters such as the number of clusters, the sentence-selection criteria, and the summary length to suit the type of text. The summarizer can also be called from your own code, as sketched below.
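+
+3. **Use It Programmatically:** A minimal sketch of calling the summarizer from Python (assumes you run Python from the directory containing `summary.py`; the input text is just a placeholder):
+   ```python
+   from summary import TextSummarization
+
+   text = "Paste or load the long document you want to summarize here."  # placeholder input
+   summarizer = TextSummarization()
+   result = summarizer.summarize(text, num_sentences=5)
+   print(result['summary'])      # extractive summary
+   print(result['key_points'])   # representative sentences, one per K-means cluster
+   ```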
+
+---
+
+## βš™οΈ Next Steps
+
+- **Parameter Tuning:** Experiment with different clustering techniques and scoring weights.
+- **Expand Dataset Compatibility:** Optimize for specific document types such as research papers or news articles.
+- **Add Fine-Tuning:** Integrate additional NLP models to improve summarization accuracy.
+
+---
+
+## 🀝 Contributing
+
+Contributions are welcome! If you have ideas or suggestions, please open an issue or create a pull request.
+
+---
+
+## πŸ“¬ Contact
+
+If you have questions or want to explore collaboration opportunities, feel free to reach out!
+
+---
\ No newline at end of file
diff --git a/Extractive Text Summarization/requirements.txt b/Extractive Text Summarization/requirements.txt
new file mode 100644
index 0000000..c73bfd5
--- /dev/null
+++ b/Extractive Text Summarization/requirements.txt
@@ -0,0 +1,4 @@
+nltk==3.9.1
+numpy==2.1.3
+scikit_learn==1.5.2
+spacy==3.8.2
diff --git a/Extractive Text Summarization/summary.py b/Extractive Text Summarization/summary.py
new file mode 100644
index 0000000..9272342
--- /dev/null
+++ b/Extractive Text Summarization/summary.py
@@ -0,0 +1,139 @@
+import os
+import re
+import heapq
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Tuple, Union
+from nltk.tokenize import sent_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+import spacy
+from nltk.stem import PorterStemmer
+
+
+class TextSummarization:
+    def __init__(self):
+        try:
+            self.nlp = spacy.load('en_core_web_sm')
+        except OSError:
+            print("Downloading spaCy model...")
+            os.system("python -m spacy download en_core_web_sm")
+            self.nlp = spacy.load('en_core_web_sm')
+
+        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+        self.stemmer = PorterStemmer()
+
+    def clean_text(self, text: str) -> Tuple[str, str]:
+        """Clean and preprocess the text, returning both original and stemmed versions."""
+        text = re.sub(r'[^\w\s.,!?]', '', text)    # Keep sentence punctuation
+        cleaned_text = ' '.join(text.split())      # Collapse extra whitespace
+        stemmed_text = self.stem_text(cleaned_text)
+        return cleaned_text, stemmed_text
+
+    def stem_text(self, text: str) -> str:
+        """Stem the words in the text."""
+        words = text.split()
+        stemmed_words = [self.stemmer.stem(word) for word in words]
+        return ' '.join(stemmed_words)
+
+    def score_sentences(self, original_sentences: List[str], stemmed_sentences: List[str]) -> Dict[str, float]:
+        """Score sentences based on TF-IDF and structural features."""
+        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
+        sentence_scores = defaultdict(float)
+
+        for i, original_sentence in enumerate(original_sentences):
+            # Base score: sum of the sentence's TF-IDF weights
+            score = sum(tfidf_matrix[i, j] for j in tfidf_matrix[i].indices)
+            sent_doc = self.nlp(original_sentence)
+
+            # Length weighting: favour sentences of roughly 20 tokens
+            length_factor = min(
+                1.0, len(sent_doc) / 20.0) if len(sent_doc) < 20 else 20.0 / len(sent_doc)
+            score *= length_factor
+
+            # Position bonuses for sentences near the start or end of the document
+            if i < len(original_sentences) * 0.2:
+                score *= 1.2
+            elif i > len(original_sentences) * 0.8:
+                score *= 1.1
+
+            # Bonuses for named entities and important dependencies
+            if sent_doc.ents:
+                score *= 1.2
+            if any(token.dep_ in ['nsubj', 'dobj'] for token in sent_doc):
+                score *= 1.1
+
+            sentence_scores[original_sentence] = score
+
+        return sentence_scores
+
+    def extract_key_points(self, original_sentences: List[str], stemmed_sentences: List[str], num_clusters: int = 5) -> List[str]:
+        """Extract key points using K-means clustering."""
+        num_clusters = min(num_clusters, len(original_sentences))
+        if num_clusters < 1:
+            return []
+
+        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+        kmeans.fit(tfidf_matrix)
+
+        labeled_sentences = [
+            (orig, stem, label, idx)
+            for idx, (orig, stem, label) in enumerate(zip(original_sentences, stemmed_sentences, kmeans.labels_))
+        ]
+        key_points = []
+
+        for cluster in range(num_clusters):
+            cluster_sentences = [
+                item for item in labeled_sentences if item[2] == cluster]
+            if cluster_sentences:
+                # Pick the sentence whose TF-IDF vector is closest to the cluster centre
+                cluster_center = kmeans.cluster_centers_[cluster]
+                distances = [np.linalg.norm(tfidf_matrix[item[3]].toarray() - cluster_center)
+                             for item in cluster_sentences]
+                closest_item = cluster_sentences[int(np.argmin(distances))]
+                closest_sentence = closest_item[0]  # Use the original sentence
+
+                sent_doc = self.nlp(closest_sentence)
+                if len(sent_doc) >= 5:
+                    point = re.sub(r'\s+', ' ', closest_sentence.strip('., '))
+                    if len(point.split()) >= 5:
+                        # Store with the sentence's original index
+                        key_points.append((point, closest_item[3]))
+
+        # Sort key points based on their original position in the text
+        key_points.sort(key=lambda x: x[1])
+        return [point for point, _ in key_points]
+
+    def summarize(self, text: str, num_sentences: int = 5) -> Dict[str, Union[str, List[str]]]:
+        """Generate a comprehensive summary of the text."""
+        cleaned_text, _ = self.clean_text(text)
+        original_sentences = sent_tokenize(cleaned_text)
+        if not original_sentences:
+            return {'summary': '', 'key_points': []}
+
+        # Stem each sentence individually so the original and stemmed lists stay aligned
+        stemmed_sentences = [self.stem_text(sentence) for sentence in original_sentences]
+        num_sentences = min(num_sentences, len(original_sentences))
+
+        sentence_scores = self.score_sentences(
+            original_sentences, stemmed_sentences)
+        summary_sentences = heapq.nlargest(
+            num_sentences, sentence_scores.items(), key=lambda x: x[1])
+        # Restore document order
+        summary_sentences.sort(key=lambda x: original_sentences.index(x[0]))
+
+        summary = ' '.join([sentence for sentence, _ in summary_sentences])
+        key_points = self.extract_key_points(
+            original_sentences, stemmed_sentences, num_clusters=min(5, len(original_sentences)))
+
+        return {
+            'summary': summary,
+            'key_points': key_points,
+        }
+
+
+def main(text):
+    # Create summarizer instance
+    summarizer = TextSummarization()
+
+    # Generate summary and key points
+    result = summarizer.summarize(text)
+
+    print(result['summary'])
+    print(result['key_points'])
+
+
+if __name__ == "__main__":
+    sample_text = (
+        "Extractive summarization selects the most informative sentences from a document. "
+        "This script scores sentences with TF-IDF and structural features, groups them with "
+        "K-means clustering, and prints a short summary along with the key points."
+    )
+    main(sample_text)