GlobalMaksimum · irmakyucel · Mar 31, 2021 · Mar 31, 2021 · Apr 2, 2021 · Apr 7, 2021
diff --git a/sadedegel/dataset/README.md b/sadedegel/dataset/README.md
@@ -315,6 +315,31 @@ test = load_hotel_sentiment_test()
 test_label = load_hotel_sentiment_test_label()
 ```
 
+## `product_sentiment`
+
+This corpus contains 11426 instances of product reviews, each annotated with a sentiment label from the set `['POSITIVE', 'NEGATIVE', 'NEUTRAL']`. Dataset [source](https://www.kaggle.com/burhanbilenn/duygu-analizi-icin-urun-yorumlari/version/1)
+
+### Using corpus
+```python
+from sadedegel.dataset.product_sentiment import load_product_sentiment_train
+from sadedegel.dataset.product_sentiment import CLASS_VALUES
+
+import pandas as pd
+
+raw = load_product_sentiment_train()
+
+next(raw)
+
+# Out [0]: {text: "ses kalitesi ve ergonomisi rezalet, sony olduğu için aldım ama 4'de 1 fiyatına çin replika ürün alsaydım çok çok daha iyiydi, kesinlikle tavsiye etmiyorum."
+#            sentiment_class: 0}
+
+df = pd.DataFrame().from_records(raw)
+
+CLASS_VALUES[df.sentiment_class.iloc[0]]
+
+# Out [1]: 'NEGATIVE'
+```
+
 ## `categorized_product_sentiment`
 
 This corpus contains 5600 instances of customer product reviews from E-commerce sites. Reviews contain two sets of class labels. First label is `sentiment_class` which contains `[POSITIVE, NEGATIVE]` sentiment of the review. Second label is `product_category` which contains `["Kitchen", "DVD", "Books", "Electronics"]` as the category of the product being reviewed. Each product category contains 1400 instances. The dataset is material to the research [paper](https://sentic.net/wisdom2013pechenizkiy.pdf) by Demirtaş and Pechenizkiy.

diff --git a/sadedegel/prebuilt/README.md b/sadedegel/prebuilt/README.md
@@ -123,8 +123,7 @@ Comparable [benchmark](https://ieeexplore.ieee.org/document/8554037/) models has
 * `0.6925` **accuracy** score (convolutional neural networks fed with char ngrams)
 * `0.66` **accuracy** score (classical ML approach fed with bag-of-words)
 on the hold-out set.
-
-
+
 ### Turkish Customer Reviews Classification
 
 Classifier assigns each Turkish customer review text into 32 classes by using sadedegel built-in pipeline.
@@ -158,3 +157,18 @@ Current prebuilt customer review classification model has a macro-F1 score of `0
 
 If you want to compare benchmark results:
 > The model on [Kaggle](https://www.kaggle.com/savasy/multiclass-classification-data-for-turkish-tc32) where we got dataset from has F1 score of `0.84`.
+
+### Turkish Product Sentiment Classification
+The classifier assigns each Turkish product review text to one of 3 classes ('NEUTRAL','NEGATIVE','POSITIVE') using the sadedegel built-in pipeline.
+#### Loading and Predicting with the Model:
+```python
+ from sadedegel.prebuilt import product_sentiment
+ # We load our prebuilt model:
+ model = product_sentiment.load()
+ # Replace the empty list below with the review texts you want sentiment predictions for.
+ y_pred = model.predict([])
+```
+#### Accuracy
+The current prebuilt model has:
+* 3-fold cross validation F1 macro score of `(mean 0.6494, std 0.0045)`.
+* 5-fold cross validation F1 macro score of `(mean 0.655, std 0.0083)`.
diff --git a/sadedegel/prebuilt/model/product_sentiment.joblib b/sadedegel/prebuilt/model/product_sentiment.joblib
diff --git a/sadedegel/prebuilt/product_sentiment.py b/sadedegel/prebuilt/product_sentiment.py
@@ -0,0 +1,101 @@
+from math import ceil
+from os.path import dirname
+from pathlib import Path
+
+import numpy as np
+from joblib import dump, load as jl_load
+from rich.console import Console
+from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
+from sklearn.metrics import f1_score
+from sklearn.model_selection import KFold
+from sklearn.utils import shuffle
+
+from ..dataset.product_sentiment import load_product_sentiment_train, CORPUS_SIZE, CLASS_VALUES
+from ..extension.sklearn import TfidfVectorizer, HashVectorizer, OnlinePipeline, Text2Doc
+
+from itertools import islice
+
+console = Console()
+
+
+def empty_model():
+    return OnlinePipeline(
+        [('text2doc', Text2Doc(tokenizer = 'icu')),
+         ('hash', HashVectorizer()),
+         # ('tfidf', TfidfVectorizer(tf_method='binary', idf_method='smooth', show_progress=True)),
+         ('pa', PassiveAggressiveClassifier(C = 0.000549495850174313, average = True))
+         ]
+    )
+
+def cv(k=3, max_instances=-1):
+    try:
+        import pandas as pd
+    except ImportError:
+        console.log(("pandas package is not a general sadedegel dependency."
+                     " But we do have a dependency on building our prebuilt models"))
+
+    if max_instances > 0:
+        raw = islice(load_product_sentiment_train(), max_instances)
+    else:
+        raw = load_product_sentiment_train()
+
+    df = pd.DataFrame.from_records(raw)
+    df = shuffle(df)
+
+    # BATCH_SIZE = CORPUS_SIZE
+
+    kf = KFold(n_splits=k)
+    console.log(f"Corpus Size: {CORPUS_SIZE}")
+
+    scores = []
+
+    for train_indx, test_index in kf.split(df):
+        train = df.iloc[train_indx]
+        test = df.iloc[test_index]
+
+        pipeline = empty_model()
+        pipeline.fit(train.text, train.sentiment_class)
+
+        y_pred = pipeline.predict(test.text)
+
+        scores.append(f1_score(test.sentiment_class, y_pred, average='macro'))
+
+        console.log(scores)
+
+
+def build(max_instances=-1, save=True):
+    try:
+        import pandas as pd
+    except ImportError:
+        console.log(("pandas package is not a general sadedegel dependency."
+                     " But we do have a dependency on building our prebuilt models"))
+
+    if max_instances > 0:
+        raw = islice(load_product_sentiment_train(), max_instances)
+    else:
+        raw = load_product_sentiment_train()
+
+    df = pd.DataFrame.from_records(raw)
+    df = shuffle(df)
+
+    pipeline = empty_model()
+    pipeline.fit(df.text, df.sentiment_class)
+
+    console.log("Model build [green]DONE[/green]")
+
+    if save:
+        model_dir = Path(dirname(__file__)) / 'model'
+
+        model_dir.mkdir(parents=True, exist_ok=True)
+
+        pipeline.steps[0][1].Doc = None
+
+        dump(pipeline, (model_dir / 'product_sentiment.joblib').absolute(), compress=('gzip', 9))
+
+
+def load(model_name="product_sentiment"):
+    return jl_load(Path(dirname(__file__)) / 'model' / f"{model_name}.joblib")
+
+
+# Running this module as a script trains on the whole corpus and saves the model
+# (build() defaults: max_instances=-1, save=True).
+if __name__ == '__main__':
+    build()
diff --git a/tests/prebuilt/context.py b/tests/prebuilt/context.py
@@ -10,5 +10,6 @@
 from sadedegel.prebuilt import tweet_sentiment , movie_reviews, customer_reviews_classification # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.tweet_sentiment import CLASS_VALUES as SENTIMENT_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.movie_sentiment import CLASS_VALUES as SENTIMENT_VALUES_M  # noqa # pylint: disable=unused-import, wrong-import-position
+from sadedegel.prebuilt import product_sentiment
 from sadedegel.dataset.telco_sentiment import CLASS_VALUES as SENTIMENT_VALUES_T  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.customer_review import CLASS_VALUES as CLASS_VALUES_CUST # noqa # pylint: disable=unused-import, wrong-import-position
diff --git a/tests/prebuilt/test_product_sentiment.py b/tests/prebuilt/test_product_sentiment.py
@@ -0,0 +1,8 @@
+from .context import product_sentiment
+from sklearn.linear_model import SGDClassifier
+
+def test_model_load():
+    model = product_sentiment.load()
+    pred = model.predict(['çok kötü bir kulaklık.'])
+
+    assert pred[0] == 0