From ab8760b48c28de5c3e3188c628b1a994d25256ea Mon Sep 17 00:00:00 2001
From: Juan Manuel Ciro Torres
Date: Wed, 29 Sep 2021 17:25:11 -0500
Subject: [PATCH] data deduplication in text for paper

---
 .../codelabs/create_dataset-process.ipynb    | 123 ++++++++++++++
 .../codelabs/data-deduplication-tuning.ipynb | 160 ++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 galvasr2/codelabs/create_dataset-process.ipynb
 create mode 100644 galvasr2/codelabs/data-deduplication-tuning.ipynb

diff --git a/galvasr2/codelabs/create_dataset-process.ipynb b/galvasr2/codelabs/create_dataset-process.ipynb
new file mode 100644
index 00000000..0f88773a
--- /dev/null
+++ b/galvasr2/codelabs/create_dataset-process.ipynb
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from datasketch import MinHash, MinHashLSH, MinHashLSHForest\n",
+    "from nltk import ngrams\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import itertools\n",
+    "import random\n",
+    "\n",
+    "pd.options.display.float_format = '{:20,.2f}'.format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('good.csv', error_bad_lines=False)\n",
+    "df['len_text'] = df['text'].str.split().str.len()\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "texts = list(df['text'].values[0:2000])\n",
+    "texts = ' '.join(texts).split('.')\n",
+    "texts = [sentence for sentence in texts if len(sentence.split())>10]\n",
+    "total_texts = list(df['text'].values)\n",
+    "def duplicate_function(text):\n",
+    "    number = random.uniform(0, 1)\n",
+    "    if number > 0.75:\n",
+    "        text = text.split('. ')\n",
+    "        for i in range(int(len(text) * 0.1)):\n",
+    "            text.insert(random.randint(0, len(text)), random.choice(texts))\n",
+    "        result = pd.Series(['. '.join(text), 1])\n",
+    "        return result\n",
+    "    else:\n",
+    "        result = pd.Series([random.choice(total_texts), 0])\n",
+    "        return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "df[['new_text', 'deduplicate']] = df['text'].apply(duplicate_function)\n",
+    "df['len_text'] = df['text'].str.split().str.len()\n",
+    "df['len_new_text'] = df['new_text'].str.split().str.len()\n",
+    "df['diff_similarity'] = np.abs(1 - (df['len_new_text'] / df['len_text'])) * 100\n",
+    "df = df[df['text'] != df['new_text']]\n",
+    "df_1 = df[['text', 'deduplicate', 'diff_similarity']]\n",
+    "df_1[['deduplicate', 'diff_similarity']] = 0\n",
+    "df_2 = df[df['deduplicate'] == 1][['new_text', 'deduplicate', 'diff_similarity']]\n",
+    "df_2.rename(columns={'new_text':'text'}, inplace=True)\n",
+    "df = pd.concat([df_1, df_2])\n",
+    "df = df.reset_index().rename(columns={'index':'original'})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv('Dataset_duplicate_test.csv', index=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/galvasr2/codelabs/data-deduplication-tuning.ipynb b/galvasr2/codelabs/data-deduplication-tuning.ipynb
new file mode 100644
index 00000000..9cdac689
--- /dev/null
+++ b/galvasr2/codelabs/data-deduplication-tuning.ipynb
@@ -0,0 +1,160 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "pd.options.display.float_format = '{:20,.2f}'.format\n",
+    "from datasketch import MinHash, MinHashLSH, MinHashLSHForest, MinHashLSHEnsemble\n",
+    "from nltk import ngrams\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import itertools\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score\n",
+    "import seaborn as sn\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('Dataset_duplicate_train.csv')\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Create an LSH index that accepts MinHash objects with 128 permutation functions\n",
+    "data = df['text'].values\n",
+    "num_perm = 128\n",
+    "lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)\n",
+    "\n",
+    "# Create MinHash objects\n",
+    "minhashes = {}\n",
+    "error = []\n",
+    "for c, i in enumerate(tqdm(data)):\n",
+    "    try:\n",
+    "        if c%5000 == 0:\n",
+    "            print(c)\n",
+    "        minhash = MinHash(num_perm=num_perm)\n",
+    "        for d in ngrams(i, 16):\n",
+    "            minhash.update(\"\".join(d).encode('utf-8'))\n",
+    "        lsh.insert(c, minhash)\n",
+    "        minhashes[c] = minhash\n",
+    "    except:\n",
+    "        error.append(c)\n",
+    "        pass\n",
+    "\n",
+    "duplicate = []\n",
+    "for i in range(len(minhashes.keys())):\n",
+    "    try:\n",
+    "        result = lsh.query(minhashes[i])\n",
+    "        if len(result) > 1:\n",
+    "            result.sort()\n",
+    "            duplicate.append(result)\n",
+    "    except:\n",
+    "        pass\n",
+    "duplicate.sort()\n",
+    "duplicate = list(duplicate for duplicate, _ in itertools.groupby(duplicate))\n",
+    "delete = []\n",
+    "for value in duplicate:\n",
+    "    delete.append(value[1:])\n",
+    "delete = list(itertools.chain(*delete))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['predict'] = np.where(df.index.isin(delete), 1, 0)\n",
+    "plt.figure(figsize = (10,7))\n",
+    "ax = plt.gca()\n",
+    "ax.get_xaxis().get_major_formatter().set_scientific(False)\n",
+    "labels = ['deduplicate', 'no-deduplicate']\n",
+    "print(roc_auc_score(df['deduplicate'], df['predict']))\n",
+    "sn.heatmap(confusion_matrix(df['deduplicate'], df['predict']), annot=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('Dataset_duplicate_test.csv')\n",
+    "df_result = deduplicate_model(df, num_perm=128, ngram=16, threshold=0.6)\n",
+    "roc_auc_score(df_result['deduplicate'], df_result['predict'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize = (10,7))\n",
+    "ax = plt.gca()\n",
+    "ax.get_xaxis().get_major_formatter().set_scientific(False)\n",
+    "labels = ['deduplicate', 'no-deduplicate']\n",
+    "sn.heatmap(confusion_matrix(df_result['deduplicate'], df_result['predict']), annot=True);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "print(classification_report(df_result['deduplicate'], df_result['predict']))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
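The evaluation cell in data-deduplication-tuning.ipynb calls deduplicate_model(df, num_perm=128, ngram=16, threshold=0.6), but no such helper is defined anywhere in this patch. A minimal sketch of what it could look like follows, assuming it simply wraps the MinHashLSH pipeline from the tuning cell above; the function name and keyword arguments are taken from the call site, and the body is illustrative rather than an existing implementation.

import itertools

import numpy as np
from datasketch import MinHash, MinHashLSH
from nltk import ngrams


def deduplicate_model(df, num_perm=128, ngram=16, threshold=0.8):
    """Hypothetical wrapper: flag near-duplicate rows of df['text'] in a 'predict' column."""
    # Build one MinHash per document from character n-grams and index it in LSH.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for idx, text in enumerate(df['text'].values):
        minhash = MinHash(num_perm=num_perm)
        for gram in ngrams(text, ngram):
            minhash.update(''.join(gram).encode('utf-8'))
        lsh.insert(idx, minhash)
        minhashes[idx] = minhash

    # Query every document; within each group of near-duplicates keep the first
    # member and mark the rest for deletion, as in the tuning cell above.
    groups = []
    for idx, minhash in minhashes.items():
        result = lsh.query(minhash)
        if len(result) > 1:
            result.sort()
            groups.append(result)
    groups.sort()
    groups = [g for g, _ in itertools.groupby(groups)]
    delete = list(itertools.chain(*[g[1:] for g in groups]))

    # Assumes the default RangeIndex from read_csv, so positions line up with df.index.
    out = df.copy()
    out['predict'] = np.where(out.index.isin(delete), 1, 0)
    return out

With a wrapper of this shape, df_result carries both the ground-truth 'deduplicate' label and the 'predict' flag, so the roc_auc_score, confusion_matrix, and classification_report cells that follow run on it unchanged.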