From 4a7150e87ca9be7bc17002597b234b077b808395 Mon Sep 17 00:00:00 2001 From: Christopher <110344005+RobotPsychologist@users.noreply.github.com> Date: Sun, 17 Nov 2024 13:43:47 -0500 Subject: [PATCH] EAgglo Model (#155) (#156) Couldn't get the clustering working Co-authored-by: Muhammad Abdullah Shahid --- .../3.05-as-eagglo-change-points.ipynb | 241 ++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb diff --git a/0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb b/0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb new file mode 100644 index 0000000..6d77d7f --- /dev/null +++ b/0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Meal Identification using EAgglo Model\n", + "\n", + "This notebook focuses on implementing the EAgglo (Hierarchical Agglomerative Estimation of Multiple Change Points) \n", + "model from the sktime library to detect significant meals in CGM time-series data. \n", + "\n", + "The goal is to identify key change points in the glucose time series data that correspond to significant meals. \n", + "\n", + "EAgglo is a non-parametric clustering approach that preserves the time ordering of observations. It merges \n", + "neighboring segments sequentially to maximize a goodness-of-fit statistic, simultaneously identifying the number \n", + "and location of change points without assuming any specific data distribution. The parameters of the model allow \n", + "flexibility to control clustering behavior and penalization to avoid overfitting.\n", + "\n", + "References:\n", + "- sktime Documentation: https://www.sktime.net/en/v0.28.0/api_reference/auto_generated/sktime.annotation.eagglo.EAgglo.html#sktime.annotation.eagglo.EAgglo\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sktime.annotation.eagglo import EAgglo\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " date bgl trend\n", + "0 2024-07-01 00:02:32 115.0 FLAT\n", + "1 2024-07-01 00:05:33 112.0 FLAT\n", + "2 2024-07-01 00:08:33 116.0 FLAT\n", + "3 2024-07-01 00:10:34 121.0 FLAT\n", + "4 2024-07-01 00:13:36 122.0 FLAT\n", + "date datetime64[ns]\n", + "bgl float64\n", + "trend object\n", + "dtype: object\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "\n", + "data_path = \"../data/raw\"\n", + "relevant_cols = ['date', 'bgl', 'trend']\n", + "csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]\n", + "\n", + "dataframes = {\n", + " file: pd.read_csv(\n", + " os.path.join(data_path, file),\n", + " usecols=relevant_cols,\n", + " parse_dates=['date'],\n", + " )\n", + " for file in csv_files\n", + "}\n", + "\n", + "df = dataframes['679372_2024-07-01_2024-09-30.csv']\n", + "\n", + "df['date'] = pd.to_datetime(df['date'], format='ISO8601')\n", + "\n", + "df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S')\n", + "\n", + "df['date'] = pd.to_datetime(df['date'])\n", + "\n", + "df = df.dropna(subset=['bgl', 'date'])\n", + "df = df.drop_duplicates()\n", + "\n", + "print(df.head())\n", + "print(df.dtypes)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " trend trend_encoded\n", + "0 FLAT 0\n", + "1 FLAT 0\n", + "2 FLAT 0\n", + "3 FLAT 0\n", + "4 FLAT 0\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "\n", + "df['trend_encoded'] = le.fit_transform(df['trend'])\n", + "\n", + "print(df[['trend', 'trend_encoded']].head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " bgl trend trend_encoded cluster\n", + "date \n", + "2024-07-05 00:01:53 126.0 FLAT 0 NaN\n", + "2024-07-05 00:04:54 132.0 FLAT 0 NaN\n", + "2024-07-05 00:08:55 132.0 FLAT 0 NaN\n", + "2024-07-05 00:11:54 135.0 FLAT 0 NaN\n", + "2024-07-05 00:14:55 133.0 FLAT 0 NaN\n", + "... ... ... ... ...\n", + "2024-07-05 23:46:46 165.0 FORTYFIVE_DOWN 1 NaN\n", + "2024-07-05 23:49:45 156.0 FORTYFIVE_DOWN 1 NaN\n", + "2024-07-05 23:53:47 155.0 FORTYFIVE_DOWN 1 NaN\n", + "2024-07-05 23:56:46 145.0 FORTYFIVE_DOWN 1 NaN\n", + "2024-07-05 23:59:48 127.0 SINGLE_DOWN 4 NaN\n", + "\n", + "[474 rows x 4 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sktime.annotation.eagglo import EAgglo\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "df['date'] = pd.to_datetime(df['date'])\n", + "le = LabelEncoder()\n", + "df['trend_encoded'] = le.fit_transform(df['trend'])\n", + "\n", + "df_day = df[df['date'].dt.date == pd.to_datetime('2024-07-05').date()]\n", + "df_day = df_day.set_index('date')\n", + "\n", + "X = df_day[['bgl', 'trend_encoded']]\n", + "\n", + "model = EAgglo(alpha=1.0, penalty=\"mean_diff_penalty\")\n", + "clusters = model.fit_transform(X)\n", + "\n", + "df_day['cluster'] = clusters\n", + "\n", + "print(df_day)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "(474, 2)\n" + ] + } + ], + "source": [ + "print(type(X))\n", + "print(X.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 2.3315865 3.71527897]\n", + " [-0.54540029 2.99161615]\n", + " [ 1.62133597 2.27991444]\n", + " [ 4.26551159 5.10854853]\n", + " [ 4.00429143 4.82539979]\n", + " [ 4.43302619 6.20303737]\n", + " [ 3.03493433 6.02827408]]\n", + "[0 0 0 1 1 1 1]\n" + ] + } + ], + "source": [ + "from sktime.annotation.datagen import piecewise_normal_multivariate\n", + "X_example = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4], random_state=10)\n", + "model = EAgglo()\n", + "print(X_example)\n", + "clusters_example = model.fit_transform(X_example)\n", + "print(clusters_example)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}