EAgglo Model (#155) (#156)

Couldn't get the clustering working Co-authored-by: Muhammad Abdullah Shahid <[email protected]>
RobotPsychologist · Nov 17, 2024 · 4a7150e · 4a7150e
1 parent 1088e3a
commit 4a7150e
Showing 1 changed file with 241 additions and 0 deletions.
diff --git a/0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb b/0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "Meal Identification using EAgglo Model\n",
+    "\n",
+    "This notebook focuses on implementing the EAgglo (Hierarchical Agglomerative Estimation of Multiple Change Points) \n",
+    "model from the sktime library to detect significant meals in CGM time-series data. \n",
+    "\n",
+    "The goal is to identify key change points in the glucose time series data that correspond to significant meals. \n",
+    "\n",
+    "EAgglo is a non-parametric clustering approach that preserves the time ordering of observations. It merges \n",
+    "neighboring segments sequentially to maximize a goodness-of-fit statistic, simultaneously identifying the number \n",
+    "and location of change points without assuming any specific data distribution. The parameters of the model allow \n",
+    "flexibility to control clustering behavior and penalization to avoid overfitting.\n",
+    "\n",
+    "References:\n",
+    "- sktime Documentation: https://www.sktime.net/en/v0.28.0/api_reference/auto_generated/sktime.annotation.eagglo.EAgglo.html#sktime.annotation.eagglo.EAgglo\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sktime.annotation.eagglo import EAgglo\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                 date    bgl trend\n",
+      "0 2024-07-01 00:02:32  115.0  FLAT\n",
+      "1 2024-07-01 00:05:33  112.0  FLAT\n",
+      "2 2024-07-01 00:08:33  116.0  FLAT\n",
+      "3 2024-07-01 00:10:34  121.0  FLAT\n",
+      "4 2024-07-01 00:13:36  122.0  FLAT\n",
+      "date     datetime64[ns]\n",
+      "bgl             float64\n",
+      "trend            object\n",
+      "dtype: object\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "\n",
+    "data_path = \"../data/raw\"\n",
+    "relevant_cols = ['date', 'bgl', 'trend']\n",
+    "csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]\n",
+    "\n",
+    "dataframes = {\n",
+    "    file: pd.read_csv(\n",
+    "        os.path.join(data_path, file),\n",
+    "        usecols=relevant_cols,\n",
+    "        parse_dates=['date'],\n",
+    "    )\n",
+    "    for file in csv_files\n",
+    "}\n",
+    "\n",
+    "df = dataframes['679372_2024-07-01_2024-09-30.csv']\n",
+    "\n",
+    "df['date'] = pd.to_datetime(df['date'], format='ISO8601')\n",
+    "\n",
+    "df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S')\n",
+    "\n",
+    "df['date'] = pd.to_datetime(df['date'])\n",
+    "\n",
+    "df = df.dropna(subset=['bgl', 'date'])\n",
+    "df = df.drop_duplicates()\n",
+    "\n",
+    "print(df.head())\n",
+    "print(df.dtypes)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  trend  trend_encoded\n",
+      "0  FLAT              0\n",
+      "1  FLAT              0\n",
+      "2  FLAT              0\n",
+      "3  FLAT              0\n",
+      "4  FLAT              0\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "le = LabelEncoder()\n",
+    "\n",
+    "df['trend_encoded'] = le.fit_transform(df['trend'])\n",
+    "\n",
+    "print(df[['trend', 'trend_encoded']].head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                       bgl           trend  trend_encoded  cluster\n",
+      "date                                                              \n",
+      "2024-07-05 00:01:53  126.0            FLAT              0      NaN\n",
+      "2024-07-05 00:04:54  132.0            FLAT              0      NaN\n",
+      "2024-07-05 00:08:55  132.0            FLAT              0      NaN\n",
+      "2024-07-05 00:11:54  135.0            FLAT              0      NaN\n",
+      "2024-07-05 00:14:55  133.0            FLAT              0      NaN\n",
+      "...                    ...             ...            ...      ...\n",
+      "2024-07-05 23:46:46  165.0  FORTYFIVE_DOWN              1      NaN\n",
+      "2024-07-05 23:49:45  156.0  FORTYFIVE_DOWN              1      NaN\n",
+      "2024-07-05 23:53:47  155.0  FORTYFIVE_DOWN              1      NaN\n",
+      "2024-07-05 23:56:46  145.0  FORTYFIVE_DOWN              1      NaN\n",
+      "2024-07-05 23:59:48  127.0     SINGLE_DOWN              4      NaN\n",
+      "\n",
+      "[474 rows x 4 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sktime.annotation.eagglo import EAgglo\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "df['date'] = pd.to_datetime(df['date'])\n",
+    "le = LabelEncoder()\n",
+    "df['trend_encoded'] = le.fit_transform(df['trend'])\n",
+    "\n",
+    "df_day = df[df['date'].dt.date == pd.to_datetime('2024-07-05').date()]\n",
+    "df_day = df_day.set_index('date')\n",
+    "\n",
+    "X = df_day[['bgl', 'trend_encoded']]\n",
+    "\n",
+    "model = EAgglo(alpha=1.0, penalty=\"mean_diff_penalty\")\n",
+    "clusters = model.fit_transform(X)\n",
+    "\n",
+    "df_day['cluster'] = clusters\n",
+    "\n",
+    "print(df_day)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "(474, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(type(X))\n",
+    "print(X.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 2.3315865   3.71527897]\n",
+      " [-0.54540029  2.99161615]\n",
+      " [ 1.62133597  2.27991444]\n",
+      " [ 4.26551159  5.10854853]\n",
+      " [ 4.00429143  4.82539979]\n",
+      " [ 4.43302619  6.20303737]\n",
+      " [ 3.03493433  6.02827408]]\n",
+      "[0 0 0 1 1 1 1]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sktime.annotation.datagen import piecewise_normal_multivariate\n",
+    "X_example = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4], random_state=10)\n",
+    "model = EAgglo()\n",
+    "print(X_example)\n",
+    "clusters_example = model.fit_transform(X_example)\n",
+    "print(clusters_example)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}