From ab8760b48c28de5c3e3188c628b1a994d25256ea Mon Sep 17 00:00:00 2001
From: Juan Manuel Ciro Torres
Date: Wed, 29 Sep 2021 17:25:11 -0500
Subject: [PATCH] data deduplication in text for paper

---
 .../codelabs/create_dataset-process.ipynb    | 123 ++++++++++++++
 .../codelabs/data-deduplication-tuning.ipynb | 160 ++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 galvasr2/codelabs/create_dataset-process.ipynb
 create mode 100644 galvasr2/codelabs/data-deduplication-tuning.ipynb

diff --git a/galvasr2/codelabs/create_dataset-process.ipynb b/galvasr2/codelabs/create_dataset-process.ipynb
new file mode 100644
index 00000000..0f88773a
--- /dev/null
+++ b/galvasr2/codelabs/create_dataset-process.ipynb
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from datasketch import MinHash, MinHashLSH, MinHashLSHForest\n",
+    "from nltk import ngrams\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import itertools\n",
+    "import random\n",
+    "\n",
+    "pd.options.display.float_format = '{:20,.2f}'.format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('good.csv', error_bad_lines=False)\n",
+    "df['len_text'] = df['text'].str.split().str.len()\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "texts = list(df['text'].values[0:2000])\n",
+    "texts = ' '.join(texts).split('.')\n",
+    "texts = [sentence for sentence in texts if len(sentence.split())>10]\n",
+    "total_texts = list(df['text'].values)\n",
+    "def duplicate_function(text):\n",
+    "    number = random.uniform(0, 1)\n",
+    "    if number > 0.75:\n",
+    "        text = text.split('. ')\n",
+    "        for i in range(int(len(text) * 0.1)):\n",
+    "            text.insert(random.randint(0, len(text)), random.choice(texts))\n",
+    "        result = pd.Series(['. '.join(text), 1])\n",
+    "        return result\n",
+    "    else:\n",
+    "        result = pd.Series([random.choice(total_texts), 0])\n",
+    "        return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "df[['new_text', 'deduplicate']] = df['text'].apply(duplicate_function)\n",
+    "df['len_text'] = df['text'].str.split().str.len()\n",
+    "df['len_new_text'] = df['new_text'].str.split().str.len()\n",
+    "df['diff_similarity'] = np.abs(1 - (df['len_new_text'] / df['len_text'])) * 100\n",
+    "df = df[df['text'] != df['new_text']]\n",
+    "df_1 = df[['text', 'deduplicate', 'diff_similarity']]\n",
+    "df_1[['deduplicate', 'diff_similarity']] = 0\n",
+    "df_2 = df[df['deduplicate'] == 1][['new_text', 'deduplicate', 'diff_similarity']]\n",
+    "df_2.rename(columns={'new_text':'text'}, inplace=True)\n",
+    "df = pd.concat([df_1, df_2])\n",
+    "df = df.reset_index().rename(columns={'index':'original'})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv('Dataset_duplicate_test.csv', index=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/galvasr2/codelabs/data-deduplication-tuning.ipynb b/galvasr2/codelabs/data-deduplication-tuning.ipynb
new file mode 100644
index 00000000..9cdac689
--- /dev/null
+++ b/galvasr2/codelabs/data-deduplication-tuning.ipynb
@@ -0,0 +1,160 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "pd.options.display.float_format = '{:20,.2f}'.format\n",
+    "from datasketch import MinHash, MinHashLSH, MinHashLSHForest, MinHashLSHEnsemble\n",
+    "from nltk import ngrams\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import itertools\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score\n",
+    "import seaborn as sn\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('Dataset_duplicate_train.csv')\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Create an LSH index that accepts MinHash objects with 128 permutation functions\n",
+    "data = df['text'].values\n",
+    "num_perm = 128\n",
+    "lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)\n",
+    "\n",
+    "# Create MinHash objects\n",
+    "minhashes = {}\n",
+    "error = []\n",
+    "for c, i in enumerate(tqdm(data)):\n",
+    "    try:\n",
+    "        if c%5000 == 0:\n",
+    "            print(c)\n",
+    "        minhash = MinHash(num_perm=num_perm)\n",
+    "        for d in ngrams(i, 16):\n",
+    "            minhash.update(\"\".join(d).encode('utf-8'))\n",
+    "        lsh.insert(c, minhash)\n",
+    "        minhashes[c] = minhash\n",
+    "    except:\n",
+    "        error.append(c)\n",
+    "        pass\n",
+    "\n",
+    "duplicate = []\n",
+    "for i in range(len(minhashes.keys())):\n",
+    "    try:\n",
+    "        result = lsh.query(minhashes[i])\n",
+    "        if len(result) > 1:\n",
+    "            result.sort()\n",
+    "            duplicate.append(result)\n",
+    "    except:\n",
+    "        pass\n",
+    "duplicate.sort()\n",
+    "duplicate = list(duplicate for duplicate, _ in itertools.groupby(duplicate))\n",
+    "delete = []\n",
+    "for value in duplicate:\n",
+    "    delete.append(value[1:])\n",
+    "delete = list(itertools.chain(*delete))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['predict'] = np.where(df.index.isin(delete), 1, 0)\n",
+    "plt.figure(figsize = (10,7))\n",
+    "ax = plt.gca()\n",
+    "ax.get_xaxis().get_major_formatter().set_scientific(False)\n",
+    "labels = ['deduplicate', 'no-deduplicate']\n",
+    "print(roc_auc_score(df['deduplicate'], df['predict']))\n",
+    "sn.heatmap(confusion_matrix(df['deduplicate'], df['predict']), annot=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('Dataset_duplicate_test.csv')\n",
+    "df_result = deduplicate_model(df, num_perm=128, ngram=16, threshold=0.6)\n",
+    "roc_auc_score(df_result['deduplicate'], df_result['predict'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize = (10,7))\n",
+    "ax = plt.gca()\n",
+    "ax.get_xaxis().get_major_formatter().set_scientific(False)\n",
+    "labels = ['deduplicate', 'no-deduplicate']\n",
+    "sn.heatmap(confusion_matrix(df_result['deduplicate'], df_result['predict']), annot=True);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "print(classification_report(df_result['deduplicate'], df_result['predict']))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
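The evaluation cell in data-deduplication-tuning.ipynb calls deduplicate_model(df, num_perm=128, ngram=16, threshold=0.6), but no such helper is defined anywhere in this patch. A minimal sketch of what it could look like follows, assuming it simply wraps the MinHashLSH pipeline from the tuning cell above; the function name and keyword arguments are taken from the call site, and the body is illustrative rather than an existing implementation.

import itertools

import numpy as np
from datasketch import MinHash, MinHashLSH
from nltk import ngrams


def deduplicate_model(df, num_perm=128, ngram=16, threshold=0.8):
    """Hypothetical wrapper: flag near-duplicate rows of df['text'] in a 'predict' column."""
    # Build one MinHash per document from character n-grams and index it in LSH.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for idx, text in enumerate(df['text'].values):
        minhash = MinHash(num_perm=num_perm)
        for gram in ngrams(text, ngram):
            minhash.update(''.join(gram).encode('utf-8'))
        lsh.insert(idx, minhash)
        minhashes[idx] = minhash

    # Query every document; within each group of near-duplicates keep the first
    # member and mark the rest for deletion, as in the tuning cell above.
    groups = []
    for idx, minhash in minhashes.items():
        result = lsh.query(minhash)
        if len(result) > 1:
            result.sort()
            groups.append(result)
    groups.sort()
    groups = [g for g, _ in itertools.groupby(groups)]
    delete = list(itertools.chain(*[g[1:] for g in groups]))

    # Assumes the default RangeIndex from read_csv, so positions line up with df.index.
    out = df.copy()
    out['predict'] = np.where(out.index.isin(delete), 1, 0)
    return out

With a wrapper of this shape, df_result carries both the ground-truth 'deduplicate' label and the 'predict' flag, so the roc_auc_score, confusion_matrix, and classification_report cells that follow run on it unchanged.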