Project Customer Reviews #3

Open: wants to merge 1 commit into main
41,422 changes: 41,422 additions & 0 deletions 1429_1.csv

Large diffs are not rendered by default.

Binary file added Customer Reviews Sentiment Analysis.pdf
Binary file not shown.
Binary file added Project Presentation.pptx
Binary file not shown.
Binary file added SVC.pkl
Binary file not shown.
Binary file added TF-IDF_vectorizer.pkl
Binary file not shown.
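SVC.pkl and TF-IDF_vectorizer.pkl are presumably the trained classifier and the fitted TF-IDF vectorizer that streamlit_app.py loads at inference time, exported from main.ipynb. A minimal sketch of how they could have been produced; the CSV column names ('reviews.text', 'sentiment') and the default SVC hyperparameters are assumptions, not taken from the notebook:

import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# 1429_1.csv ships in this PR, but the text/label column names used here
# are assumptions for illustration.
df = pd.read_csv('1429_1.csv')
X = df['reviews.text'].astype(str)   # assumed text column
y = df['sentiment']                  # assumed label column: positive / neutral / negative

# Fit the vectorizer on the training text, then train the classifier
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)
model = SVC()
model.fit(X_tfidf, y)

# Persist both artifacts so the Streamlit app can reuse them at inference time
with open('SVC.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('TF-IDF_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)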
961 changes: 961 additions & 0 deletions main.ipynb

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions requirements_streamlit.txt
@@ -0,0 +1,49 @@
altair==5.4.1
attrs==24.2.0
blinker==1.8.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.4.0
click==8.1.7
colorama==0.4.6
gitdb==4.0.11
GitPython==3.1.43
idna==3.10
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
narwhals==1.11.0
nltk==3.9.1
numpy==2.1.2
packaging==24.1
pandas==2.2.3
pillow==10.4.0
protobuf==5.28.3
pyarrow==17.0.0
pydeck==0.9.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
pytz==2024.2
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rich==13.9.3
rpds-py==0.20.0
scikit-learn==1.5.2
scipy==1.14.1
six==1.16.0
smmap==5.0.1
streamlit==1.39.0
tenacity==9.0.0
threadpoolctl==3.5.0
toml==0.10.2
tornado==6.4.1
tqdm==4.66.5
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
watchdog==5.0.3
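These pins describe the Streamlit deployment environment; presumably the app is set up with pip install -r requirements_streamlit.txt and launched with streamlit run streamlit_app.py.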
100 changes: 100 additions & 0 deletions streamlit_app.py
@@ -0,0 +1,100 @@
import streamlit as st
import pickle
import re
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK data needed at runtime (resource names as of the pinned nltk==3.9.1)
for resource in ("punkt_tab", "stopwords", "wordnet", "averaged_perceptron_tagger_eng"):
    nltk.download(resource, quiet=True)

lemmatizer = WordNetLemmatizer()

# Streamlit renders the app inside a .stApp container, so style that
# element rather than the page body
page_bg_img = '''
<style>
.stApp {
    background-image: url("https://images.unsplash.com/photo-1542281286-9e0a16bb7366");
    background-size: cover;
}
</style>
'''

st.markdown(page_bg_img, unsafe_allow_html=True)

st.title("Reviews App")
st.write('This app will tell you if a customer review is positive, neutral or negative!')

st.header("Type a review!")
review = st.text_input("type your review here", "e.g.: This product is amazing!")
def data_cleaning(text):
    """
    Process a sentence with regex patterns, deleting characters that carry
    no sentiment signal for either computers or humans:
    - special characters
    - numerical characters/digits
    - standalone single characters
    - multiple spaces (for cleaning purposes)

    Argument: text/corpus/document/sentence; string
    """

    # Remove special characters and digits (keep only letters and whitespace)
    text_no_special_characters = re.sub(r'[^A-Za-z\s]+', ' ', str(text))

    # Remove all single characters (e.g., 'a', 'b', 'c' that appear as standalone)
    text_no_single_charac = re.sub(r'\b\w\b', '', text_no_special_characters)

    # Clean up extra spaces left after removing single characters
    text_cleaned = re.sub(r'\s+', ' ', text_no_single_charac).strip()

    # Transform data to lowercase
    text_cleaned = text_cleaned.lower()

    return text_cleaned


def get_wordnet_pos(word):
    """Map a POS tag to the first character lemmatize() accepts"""

    tag = nltk.pos_tag([word])[0][1][0]
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def data_processing(text):
    """
    Process a sentence in the following order:
    1. Tokenize the sentence into words.
    2. Drop any word that appears in the stopwords.words("english") list.
    3. Lemmatize every remaining word.
    4. Join the tokens back together to rebuild the sentence.

    Argument: text/corpus/document/sentence; string
    """
    stop_words = set(stopwords.words("english"))
    tokenized_words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word))
                        for word in tokenized_words if word not in stop_words]
    text_processed = ' '.join(lemmatized_words)  # Join the words back into a single string

    return text_processed


if st.button("Analyze review"):
    st.write("Analyzing review...")

    review_cleaned = data_cleaning(review)
    review_processed = data_processing(review_cleaned)

    # Load the trained model and fitted vectorizer from their pickle files
    with open('SVC.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('TF-IDF_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)

    # Vectorize the fully processed review, not the merely cleaned one
    review_tfidf = vectorizer.transform([review_processed])

    prediction = model.predict(review_tfidf)

    st.write(prediction[0])
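For review purposes, the pickled pipeline can be smoke-tested without launching Streamlit. A minimal sketch, assuming the artifacts sit in the working directory; the sample string is already lowercase and stopword-free for brevity, whereas real input should go through data_cleaning and data_processing first:

import pickle

# Load the artifacts added in this PR
with open('SVC.pkl', 'rb') as f:
    model = pickle.load(f)
with open('TF-IDF_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

sample = "product amazing"  # pre-processed sample input
features = vectorizer.transform([sample])
print(model.predict(features)[0])  # expected: a sentiment label such as positive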