-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Couldn't get the clustering working Co-authored-by: Muhammad Abdullah Shahid <[email protected]>
- Loading branch information
1 parent
1088e3a
commit 4a7150e
Showing
1 changed file
with
241 additions
and
0 deletions.
There are no files selected for viewing
241 changes: 241 additions & 0 deletions
241
0_meal_identification/meal_identification/notebooks/3.05-as-eagglo-change-points.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\"\"\"\n", | ||
"Meal Identification using EAgglo Model\n", | ||
"\n", | ||
"This notebook focuses on implementing the EAgglo (Hierarchical Agglomerative Estimation of Multiple Change Points) \n", | ||
"model from the sktime library to detect significant meals in CGM time-series data. \n", | ||
"\n", | ||
"The goal is to identify key change points in the glucose time series data that correspond to significant meals. \n", | ||
"\n", | ||
"EAgglo is a non-parametric clustering approach that preserves the time ordering of observations. It merges \n", | ||
"neighboring segments sequentially to maximize a goodness-of-fit statistic, simultaneously identifying the number \n", | ||
"and location of change points without assuming any specific data distribution. The parameters of the model allow \n", | ||
"flexibility to control clustering behavior and penalization to avoid overfitting.\n", | ||
"\n", | ||
"References:\n", | ||
"- sktime Documentation: https://www.sktime.net/en/v0.28.0/api_reference/auto_generated/sktime.annotation.eagglo.EAgglo.html#sktime.annotation.eagglo.EAgglo\n", | ||
"\"\"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"from sktime.annotation.eagglo import EAgglo\n", | ||
"import os" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 123, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" date bgl trend\n", | ||
"0 2024-07-01 00:02:32 115.0 FLAT\n", | ||
"1 2024-07-01 00:05:33 112.0 FLAT\n", | ||
"2 2024-07-01 00:08:33 116.0 FLAT\n", | ||
"3 2024-07-01 00:10:34 121.0 FLAT\n", | ||
"4 2024-07-01 00:13:36 122.0 FLAT\n", | ||
"date datetime64[ns]\n", | ||
"bgl float64\n", | ||
"trend object\n", | ||
"dtype: object\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import os\n", | ||
"\n", | ||
"data_path = \"../data/raw\"\n", | ||
"relevant_cols = ['date', 'bgl', 'trend']\n", | ||
"csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]\n", | ||
"\n", | ||
"dataframes = {\n", | ||
" file: pd.read_csv(\n", | ||
" os.path.join(data_path, file),\n", | ||
" usecols=relevant_cols,\n", | ||
" parse_dates=['date'],\n", | ||
" )\n", | ||
" for file in csv_files\n", | ||
"}\n", | ||
"\n", | ||
"df = dataframes['679372_2024-07-01_2024-09-30.csv']\n", | ||
"\n", | ||
"df['date'] = pd.to_datetime(df['date'], format='ISO8601')\n", | ||
"\n", | ||
"df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S')\n", | ||
"\n", | ||
"df['date'] = pd.to_datetime(df['date'])\n", | ||
"\n", | ||
"df = df.dropna(subset=['bgl', 'date'])\n", | ||
"df = df.drop_duplicates()\n", | ||
"\n", | ||
"print(df.head())\n", | ||
"print(df.dtypes)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 132, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" trend trend_encoded\n", | ||
"0 FLAT 0\n", | ||
"1 FLAT 0\n", | ||
"2 FLAT 0\n", | ||
"3 FLAT 0\n", | ||
"4 FLAT 0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from sklearn.preprocessing import LabelEncoder\n", | ||
"\n", | ||
"le = LabelEncoder()\n", | ||
"\n", | ||
"df['trend_encoded'] = le.fit_transform(df['trend'])\n", | ||
"\n", | ||
"print(df[['trend', 'trend_encoded']].head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" bgl trend trend_encoded cluster\n", | ||
"date \n", | ||
"2024-07-05 00:01:53 126.0 FLAT 0 NaN\n", | ||
"2024-07-05 00:04:54 132.0 FLAT 0 NaN\n", | ||
"2024-07-05 00:08:55 132.0 FLAT 0 NaN\n", | ||
"2024-07-05 00:11:54 135.0 FLAT 0 NaN\n", | ||
"2024-07-05 00:14:55 133.0 FLAT 0 NaN\n", | ||
"... ... ... ... ...\n", | ||
"2024-07-05 23:46:46 165.0 FORTYFIVE_DOWN 1 NaN\n", | ||
"2024-07-05 23:49:45 156.0 FORTYFIVE_DOWN 1 NaN\n", | ||
"2024-07-05 23:53:47 155.0 FORTYFIVE_DOWN 1 NaN\n", | ||
"2024-07-05 23:56:46 145.0 FORTYFIVE_DOWN 1 NaN\n", | ||
"2024-07-05 23:59:48 127.0 SINGLE_DOWN 4 NaN\n", | ||
"\n", | ||
"[474 rows x 4 columns]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from sktime.annotation.eagglo import EAgglo\n", | ||
"from sklearn.preprocessing import LabelEncoder\n", | ||
"\n", | ||
"df['date'] = pd.to_datetime(df['date'])\n", | ||
"le = LabelEncoder()\n", | ||
"df['trend_encoded'] = le.fit_transform(df['trend'])\n", | ||
"\n", | ||
"df_day = df[df['date'].dt.date == pd.to_datetime('2024-07-05').date()]\n", | ||
"df_day = df_day.set_index('date')\n", | ||
"\n", | ||
"X = df_day[['bgl', 'trend_encoded']]\n", | ||
"\n", | ||
"model = EAgglo(alpha=1.0, penalty=\"mean_diff_penalty\")\n", | ||
"clusters = model.fit_transform(X)\n", | ||
"\n", | ||
"df_day['cluster'] = clusters\n", | ||
"\n", | ||
"print(df_day)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"<class 'pandas.core.frame.DataFrame'>\n", | ||
"(474, 2)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(type(X))\n", | ||
"print(X.shape)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 119, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[[ 2.3315865 3.71527897]\n", | ||
" [-0.54540029 2.99161615]\n", | ||
" [ 1.62133597 2.27991444]\n", | ||
" [ 4.26551159 5.10854853]\n", | ||
" [ 4.00429143 4.82539979]\n", | ||
" [ 4.43302619 6.20303737]\n", | ||
" [ 3.03493433 6.02827408]]\n", | ||
"[0 0 0 1 1 1 1]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from sktime.annotation.datagen import piecewise_normal_multivariate\n", | ||
"X_example = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4], random_state=10)\n", | ||
"model = EAgglo()\n", | ||
"print(X_example)\n", | ||
"clusters_example = model.fit_transform(X_example)\n", | ||
"print(clusters_example)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |