Showing 3 changed files with 230 additions and 0 deletions.
README.md
@@ -0,0 +1,87 @@

# 📝 Advanced Extractive Text Summarization Model

Welcome to the **Advanced Extractive Text Summarization Model**! This project uses **Natural Language Processing (NLP)** techniques to automatically distill essential points from lengthy content, making it an invaluable tool for handling reports, research papers, news articles, and more.

## 🚀 Project Overview

This model leverages NLP to:
- **Extract key sentences** from a body of text.
- **Score sentences** based on their importance using features like **TF-IDF**, sentence length, position, and presence of named entities (see the sketch below).
- **Cluster related sentences** via **K-means** to highlight critical points from various thematic groups.

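At the heart of the scoring step is plain TF-IDF weighting. A minimal sketch of the idea, assuming only scikit-learn and NumPy (the full model in `summarize.py` adds length, position, and named-entity weighting on top):

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "The quarterly report shows revenue grew by ten percent.",
    "Growth was driven by strong demand in overseas markets.",
    "The office cafeteria introduced a new lunch menu.",
]

# Score each sentence by summing its TF-IDF weights over all terms.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(sentences)
scores = np.asarray(tfidf.sum(axis=1)).ravel()

for sentence, score in sorted(zip(sentences, scores), key=lambda x: -x[1]):
    print(f"{score:.2f}  {sentence}")
```
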
### Why It Matters

In today’s information-dense world, quickly understanding critical points from long documents is essential. This model saves time and boosts productivity by providing concise summaries while preserving core insights.

---

## 📊 Features

1. **Preprocessing**
   - Cleans and prepares text data for effective summarization.

2. **Scoring & Ranking**
   - Scores sentences based on TF-IDF, sentence structure, and key entities.

3. **Clustering & Key Point Extraction**
   - Uses K-means clustering to group sentences by topic and selects a representative sentence for each group (see the sketch after this list).

4. **Summary Generation**
   - Combines top-ranked sentences from each cluster to create a coherent, impactful summary.

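A minimal sketch of the clustering step under the same assumptions (scikit-learn and NumPy only): cluster the TF-IDF vectors with K-means and keep the sentence nearest each cluster centre, which is essentially what `extract_key_points` in `summarize.py` does with extra filtering:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "Solar capacity doubled over the last decade.",
    "Wind farms now supply a tenth of the grid.",
    "Festival ticket prices rose sharply this year.",
    "Organisers still expect record festival attendance.",
]

tfidf = TfidfVectorizer(stop_words='english').fit_transform(sentences)
kmeans = KMeans(n_clusters=2, random_state=42).fit(tfidf)

# For each cluster, print the sentence closest to the centroid.
for cluster in range(2):
    members = [i for i, label in enumerate(kmeans.labels_) if label == cluster]
    distances = [np.linalg.norm(tfidf[i].toarray() - kmeans.cluster_centers_[cluster])
                 for i in members]
    print(sentences[members[int(np.argmin(distances))]])
```
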
---

## 🔧 How It Works

1. **Data Preprocessing:** Initial cleaning (e.g., removing stop words and punctuation).
2. **Sentence Scoring:** Uses TF-IDF, sentence structure, and named entity recognition to evaluate sentence importance.
3. **K-means Clustering:** Groups related sentences to capture diverse perspectives within the text.
4. **Summarization:** Extracts top sentences across clusters to create a balanced summary (a usage sketch follows this list).

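In code, the whole pipeline reduces to one call. A minimal usage sketch, assuming `summarize.py` from this repository is importable:

```python
from summarize import TextSummarization

summarizer = TextSummarization()
result = summarizer.summarize("...your long document text...", num_sentences=5)

print(result['summary'])      # the extractive summary
print(result['key_points'])   # one representative sentence per theme
```
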
---

## 🛠️ Installation

1. **Clone the Repository:**
   ```bash
   git clone https://github.com/one-alive/extractive_text_summarization.git
   cd extractive_text_summarization
   ```

2. **Install Dependencies:**
   ```bash
   pip install -r requirements.txt
   ```

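Note: `summarize.py` downloads the spaCy model automatically on first run, but NLTK's sentence tokenizer data is not fetched anywhere, so `sent_tokenize` may raise a `LookupError`. A likely-needed one-time setup (our assumption; this step is not in the original instructions):

```bash
python -m nltk.downloader punkt punkt_tab
python -m spacy download en_core_web_sm  # optional: pre-download the spaCy model
```
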
---

## ▶️ Usage

1. **Run the Model on a Sample Text:**
   ```bash
   python summarize.py
   ```

2. **Adjust Parameters:** You can tune parameters such as the number of clusters, the sentence-selection criteria, and the summary length for better results based on the text type, as in the sketch below.

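For instance, `summarize.py` exposes `num_sentences` (summary length) and `num_clusters` (number of key points); a short tuning sketch, assuming `text` holds your document (other criteria, such as the scoring weights, require editing the code):

```python
from nltk.tokenize import sent_tokenize
from summarize import TextSummarization

summarizer = TextSummarization()

# Fewer sentences -> a tighter summary.
result = summarizer.summarize(text, num_sentences=3)

# More clusters -> more, finer-grained key points.
cleaned, stemmed = summarizer.clean_text(text)
key_points = summarizer.extract_key_points(
    sent_tokenize(cleaned), sent_tokenize(stemmed), num_clusters=8)
```
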
---

## ⚙️ Next Steps

- **Parameter Tuning:** Experiment with different clustering techniques and scoring weights.
- **Expand Dataset Compatibility:** Optimize for specific types of documents like research papers or news articles.
- **Add Fine-Tuning:** Integrate more NLP models to improve summarization accuracy.

---

## 🤝 Contributing

Contributions are welcome! If you have ideas or suggestions, please create a pull request or open an issue.

---

## 📬 Contact

If you have questions or want to explore collaboration opportunities, feel free to reach out!

---
requirements.txt
@@ -0,0 +1,4 @@
nltk==3.9.1
numpy==2.1.3
scikit_learn==1.5.2
spacy==3.8.2
summarize.py
@@ -0,0 +1,139 @@
import os
import re
import heapq
import numpy as np
from collections import defaultdict
from typing import List, Dict, Tuple, Union
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import spacy
from nltk.stem import PorterStemmer


class TextSummarization:
    def __init__(self):
        # Load the spaCy English model, downloading it on first use.
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading spaCy model...")
            os.system("python -m spacy download en_core_web_sm")
            self.nlp = spacy.load('en_core_web_sm')

        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.stemmer = PorterStemmer()

    def clean_text(self, text: str) -> Tuple[str, str]:
        """Clean and preprocess the text, returning both original and stemmed versions."""
        text = re.sub(r'[^\w\s.,!?]', '', text)  # Keep sentence structure
        cleaned_text = ' '.join(text.split())  # Remove extra whitespace
        stemmed_text = self.stem_text(cleaned_text)
        return cleaned_text, stemmed_text

    def stem_text(self, text: str) -> str:
        """Stem the words in the text."""
        words = text.split()
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def score_sentences(self, original_sentences: List[str], stemmed_sentences: List[str]) -> Dict[str, float]:
        """Score sentences based on TF-IDF and structural features."""
        # The two lists are assumed to align one-to-one: stemmed sentence i
        # corresponds to original sentence i.
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
        sentence_scores = defaultdict(float)

        for i, original_sentence in enumerate(original_sentences):
            # Base score: sum of the sentence's TF-IDF weights.
            score = sum(tfidf_matrix[i, j] for j in tfidf_matrix[i].indices)
            sent_doc = self.nlp(original_sentence)

            # Apply length weighting, favouring sentences near 20 tokens
            length_factor = min(
                1.0, len(sent_doc) / 20.0) if len(sent_doc) < 20 else 20.0 / len(sent_doc)
            score *= length_factor

            # Position bonuses: boost sentences in the first 20% and last
            # 20% of the document (index compared against sentence count)
            if i < len(original_sentences) * 0.2:
                score *= 1.2
            elif i > len(original_sentences) * 0.8:
                score *= 1.1

            # Bonuses for named entities and important dependencies
            if sent_doc.ents:
                score *= 1.2
            if any(token.dep_ in ['nsubj', 'dobj'] for token in sent_doc):
                score *= 1.1

            sentence_scores[original_sentence] = score

        return sentence_scores

    def extract_key_points(self, original_sentences: List[str], stemmed_sentences: List[str], num_clusters: int = 5) -> List[str]:
        """Extract key points using K-means clustering."""
        num_clusters = min(num_clusters, len(original_sentences))
        if num_clusters < 1:
            return []

        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(tfidf_matrix)

        labeled_sentences = [
            (orig, stem, label, idx)
            for idx, (orig, stem, label) in enumerate(
                zip(original_sentences, stemmed_sentences, kmeans.labels_))
        ]
        key_points = []

        for cluster in range(num_clusters):
            cluster_sentences = [
                item for item in labeled_sentences if item[2] == cluster]
            if cluster_sentences:
                # Pick the sentence closest to the cluster centre.
                cluster_center = kmeans.cluster_centers_[cluster]
                distances = [
                    np.linalg.norm(tfidf_matrix[item[3]].toarray() - cluster_center)
                    for item in cluster_sentences]
                closest = cluster_sentences[int(np.argmin(distances))]
                closest_sentence = closest[0]  # Use original sentence

                # Keep only reasonably long, well-formed sentences.
                sent_doc = self.nlp(closest_sentence)
                if len(sent_doc) >= 5:
                    point = re.sub(r'\s+', ' ', closest_sentence.strip('., '))
                    if len(point.split()) >= 5:
                        # Store with the chosen sentence's original index
                        key_points.append((point, closest[3]))

        # Sort key points based on their original position in the text
        key_points.sort(key=lambda x: x[1])
        return [point for point, _ in key_points]

    def summarize(self, text: str, num_sentences: int = 5) -> Dict[str, Union[str, List[str]]]:
        """Generate a comprehensive summary of the text."""
        cleaned_text, stemmed_text = self.clean_text(text)
        original_sentences = sent_tokenize(cleaned_text)
        stemmed_sentences = sent_tokenize(stemmed_text)
        num_sentences = min(num_sentences, len(original_sentences)) if original_sentences else 0

        # Rank all sentences, keep the top ones, and restore document order.
        sentence_scores = self.score_sentences(
            original_sentences, stemmed_sentences)
        summary_sentences = heapq.nlargest(
            num_sentences, sentence_scores.items(), key=lambda x: x[1])
        summary_sentences.sort(key=lambda x: original_sentences.index(x[0]))

        summary = ' '.join([sentence for sentence, _ in summary_sentences])
        key_points = self.extract_key_points(
            original_sentences, stemmed_sentences,
            num_clusters=min(5, len(original_sentences)))

        return {
            'summary': summary,
            'key_points': key_points,
        }


def main(text):
    # Create summarizer instance
    summarizer = TextSummarization()

    # Generate summary and key points
    summary = summarizer.summarize(text)

    print(summary)


if __name__ == "__main__":
    # Sample text for a quick demonstration; replace with your own input.
    sample_text = (
        "Extractive summarization selects the most informative sentences "
        "from a document instead of generating new text. Sentences are "
        "scored with TF-IDF and structural features, then grouped with "
        "K-means so the summary covers the main themes. The top sentences "
        "are joined in their original order to form the summary."
    )
    main(sample_text)