Skip to content

Commit

Permalink
EAgglo Model (#155) (#156)
Browse files Browse the repository at this point in the history
Couldn't get the clustering working

Co-authored-by: Muhammad Abdullah Shahid <[email protected]>
  • Loading branch information
RobotPsychologist and abdshd authored Nov 17, 2024
1 parent 1088e3a commit 4a7150e
Showing 1 changed file with 241 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Meal Identification using EAgglo Model\n",
"\n",
"This notebook focuses on implementing the EAgglo (Hierarchical Agglomerative Estimation of Multiple Change Points) \n",
"model from the sktime library to detect significant meals in CGM time-series data. \n",
"\n",
"The goal is to identify key change points in the glucose time series data that correspond to significant meals. \n",
"\n",
"EAgglo is a non-parametric clustering approach that preserves the time ordering of observations. It merges \n",
"neighboring segments sequentially to maximize a goodness-of-fit statistic, simultaneously identifying the number \n",
"and location of change points without assuming any specific data distribution. The parameters of the model allow \n",
"flexibility to control clustering behavior and penalization to avoid overfitting.\n",
"\n",
"References:\n",
"- sktime Documentation: https://www.sktime.net/en/v0.28.0/api_reference/auto_generated/sktime.annotation.eagglo.EAgglo.html#sktime.annotation.eagglo.EAgglo\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sktime.annotation.eagglo import EAgglo\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" date bgl trend\n",
"0 2024-07-01 00:02:32 115.0 FLAT\n",
"1 2024-07-01 00:05:33 112.0 FLAT\n",
"2 2024-07-01 00:08:33 116.0 FLAT\n",
"3 2024-07-01 00:10:34 121.0 FLAT\n",
"4 2024-07-01 00:13:36 122.0 FLAT\n",
"date datetime64[ns]\n",
"bgl float64\n",
"trend object\n",
"dtype: object\n"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"data_path = \"../data/raw\"\n",
"relevant_cols = ['date', 'bgl', 'trend']\n",
"csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]\n",
"\n",
"dataframes = {\n",
" file: pd.read_csv(\n",
" os.path.join(data_path, file),\n",
" usecols=relevant_cols,\n",
" parse_dates=['date'],\n",
" )\n",
" for file in csv_files\n",
"}\n",
"\n",
"df = dataframes['679372_2024-07-01_2024-09-30.csv']\n",
"\n",
"df['date'] = pd.to_datetime(df['date'], format='ISO8601')\n",
"\n",
"df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S')\n",
"\n",
"df['date'] = pd.to_datetime(df['date'])\n",
"\n",
"df = df.dropna(subset=['bgl', 'date'])\n",
"df = df.drop_duplicates()\n",
"\n",
"print(df.head())\n",
"print(df.dtypes)\n"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" trend trend_encoded\n",
"0 FLAT 0\n",
"1 FLAT 0\n",
"2 FLAT 0\n",
"3 FLAT 0\n",
"4 FLAT 0\n"
]
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"le = LabelEncoder()\n",
"\n",
"df['trend_encoded'] = le.fit_transform(df['trend'])\n",
"\n",
"print(df[['trend', 'trend_encoded']].head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" bgl trend trend_encoded cluster\n",
"date \n",
"2024-07-05 00:01:53 126.0 FLAT 0 NaN\n",
"2024-07-05 00:04:54 132.0 FLAT 0 NaN\n",
"2024-07-05 00:08:55 132.0 FLAT 0 NaN\n",
"2024-07-05 00:11:54 135.0 FLAT 0 NaN\n",
"2024-07-05 00:14:55 133.0 FLAT 0 NaN\n",
"... ... ... ... ...\n",
"2024-07-05 23:46:46 165.0 FORTYFIVE_DOWN 1 NaN\n",
"2024-07-05 23:49:45 156.0 FORTYFIVE_DOWN 1 NaN\n",
"2024-07-05 23:53:47 155.0 FORTYFIVE_DOWN 1 NaN\n",
"2024-07-05 23:56:46 145.0 FORTYFIVE_DOWN 1 NaN\n",
"2024-07-05 23:59:48 127.0 SINGLE_DOWN 4 NaN\n",
"\n",
"[474 rows x 4 columns]\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sktime.annotation.eagglo import EAgglo\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"df['date'] = pd.to_datetime(df['date'])\n",
"le = LabelEncoder()\n",
"df['trend_encoded'] = le.fit_transform(df['trend'])\n",
"\n",
"df_day = df[df['date'].dt.date == pd.to_datetime('2024-07-05').date()]\n",
"df_day = df_day.set_index('date')\n",
"\n",
"X = df_day[['bgl', 'trend_encoded']]\n",
"\n",
"model = EAgglo(alpha=1.0, penalty=\"mean_diff_penalty\")\n",
"clusters = model.fit_transform(X)\n",
"\n",
"df_day['cluster'] = clusters\n",
"\n",
"print(df_day)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"(474, 2)\n"
]
}
],
"source": [
"print(type(X))\n",
"print(X.shape)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 2.3315865 3.71527897]\n",
" [-0.54540029 2.99161615]\n",
" [ 1.62133597 2.27991444]\n",
" [ 4.26551159 5.10854853]\n",
" [ 4.00429143 4.82539979]\n",
" [ 4.43302619 6.20303737]\n",
" [ 3.03493433 6.02827408]]\n",
"[0 0 0 1 1 1 1]\n"
]
}
],
"source": [
"from sktime.annotation.datagen import piecewise_normal_multivariate\n",
"X_example = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4], random_state=10)\n",
"model = EAgglo()\n",
"print(X_example)\n",
"clusters_example = model.fit_transform(X_example)\n",
"print(clusters_example)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 4a7150e

Please sign in to comment.