From 9bb9d469cabdc08396d91732f691c92410d433bf Mon Sep 17 00:00:00 2001 From: Jonathan Gong Date: Tue, 15 Oct 2024 16:54:28 -0400 Subject: [PATCH 01/51] added GMMHMM implementation --- .../1.01-jg-GMMHMM-implementation.ipynb | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb diff --git a/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb new file mode 100644 index 0000000..34428f3 --- /dev/null +++ b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: Jonathan Gong" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import sys\n", + "from sktime.annotation.hmm_learn import GMMHMM \n", + "import numpy as np" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data processing" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/895166927.py:4: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " csv_files.append(pd.read_csv(f'../data/raw/{file}'))\n" + ] + } + ], + "source": [ + "csv_files = []\n", + "for file in os.listdir('../data/raw'):\n", + " if file.endswith('.csv'):\n", + " csv_files.append(pd.read_csv(f'../data/raw/{file}'))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/872985159.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/872985159.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n" + ] + } + ], + "source": [ + "each_bgl =[]\n", + "for file in csv_files:\n", + " each_df = file[[\"date\", \"bgl\"]]\n", + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", + " each_bgl.append(each_df.dropna(subset=each_df.columns[:4], how='all'))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "days_patients = []\n", + "for bgl in each_bgl:\n", + " # (hour:minute:second)\n", + " bgl['time'] = bgl['date'].dt.time\n", + " # (year-month-day)\n", + " bgl['day'] = bgl['date'].dt.date\n", + " # unique days\n", + " unique_days = bgl['day'].unique()\n", + " day_data = []\n", + " for i, day in enumerate(unique_days):\n", + " bgl_day = bgl[bgl['day'] == day]\n", + " day_data.append(bgl_day[\"bgl\"])\n", + "\n", + " days_patients.append(day_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "sequences = days_patients[0]\n", + "sequences = np.concatenate(sequences)\n", + "sequences = sequences[~np.isnan(sequences)]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 98., 100., 98., ..., 89., 88., 88.])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequences\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Model is not converging. Current: -183579.8275463407 is not greater than -178807.85189297324. Delta is -4771.975653367466\n" + ] + } + ], + "source": [ + "model = GMMHMM(n_components=2, n_mix=2) \n", + "model = model.fit(sequences) " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1ee4e52d05cd8ffa4fa7ffde6ecc0c1c142e8584 Mon Sep 17 00:00:00 2001 From: Jonathan Gong Date: Thu, 17 Oct 2024 01:10:08 -0400 Subject: [PATCH 02/51] adjusted na filtering --- .../1.01-jg-GMMHMM-implementation.ipynb | 154 +++++++++++++++--- 1 file changed, 133 insertions(+), 21 deletions(-) diff --git a/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb index 34428f3..e6b9c9b 100644 --- a/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb +++ b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -31,14 +31,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/895166927.py:4: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "C:\\Users\\jonat\\AppData\\Local\\Temp\\ipykernel_15640\\895166927.py:4: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", " csv_files.append(pd.read_csv(f'../data/raw/{file}'))\n" ] } @@ -52,20 +52,60 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/872985159.py:4: SettingWithCopyWarning: \n", + "C:\\Users\\jonat\\AppData\\Local\\Temp\\ipykernel_15640\\1612716447.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_61427/872985159.py:4: SettingWithCopyWarning: \n", + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " date bgl\n", + "0 2024-07-01 05:02:39+00:00 98.0\n", + "1 2024-07-01 05:07:39+00:00 100.0\n", + "2 2024-07-01 05:12:39+00:00 98.0\n", + "3 2024-07-01 05:17:39+00:00 94.0\n", + "4 2024-07-01 05:22:40+00:00 94.0\n", + "... ... ...\n", + "39622 2024-10-01 04:47:37+00:00 88.0\n", + "39623 2024-10-01 04:49:38+00:00 90.0\n", + "39624 2024-10-01 04:53:37+00:00 89.0\n", + "39625 2024-10-01 04:55:38+00:00 88.0\n", + "39626 2024-10-01 04:57:37+00:00 88.0\n", + "\n", + "[39627 rows x 2 columns]\n", + " date bgl\n", + "0 2024-07-01 04:02:32+00:00 115.0\n", + "1 2024-07-01 04:05:33+00:00 112.0\n", + "2 2024-07-01 04:08:33+00:00 116.0\n", + "3 2024-07-01 04:10:34+00:00 121.0\n", + "4 2024-07-01 04:13:36+00:00 122.0\n", + "... ... ...\n", + "40101 2024-10-01 03:43:17+00:00 108.0\n", + "40102 2024-10-01 03:47:20+00:00 101.0\n", + "40103 2024-10-01 03:50:22+00:00 96.0\n", + "40104 2024-10-01 03:54:25+00:00 103.0\n", + "40105 2024-10-01 03:57:26+00:00 103.0\n", + "\n", + "[40106 rows x 2 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\jonat\\AppData\\Local\\Temp\\ipykernel_15640\\1612716447.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -79,14 +119,81 @@ "for file in csv_files:\n", " each_df = file[[\"date\", \"bgl\"]]\n", " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", - " each_bgl.append(each_df.dropna(subset=each_df.columns[:4], how='all'))" + " print(each_df)\n", + " each_bgl.append(each_df.dropna(subset=[\"bgl\"], how='all'))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[ date bgl\n", + " 0 2024-07-01 05:02:39+00:00 98.0\n", + " 1 2024-07-01 05:07:39+00:00 100.0\n", + " 2 2024-07-01 05:12:39+00:00 98.0\n", + " 3 2024-07-01 05:17:39+00:00 94.0\n", + " 4 2024-07-01 05:22:40+00:00 94.0\n", + " ... ... ...\n", + " 39622 2024-10-01 04:47:37+00:00 88.0\n", + " 39623 2024-10-01 04:49:38+00:00 90.0\n", + " 39624 2024-10-01 04:53:37+00:00 89.0\n", + " 39625 2024-10-01 04:55:38+00:00 88.0\n", + " 39626 2024-10-01 04:57:37+00:00 88.0\n", + " \n", + " [39518 rows x 2 columns],\n", + " date bgl\n", + " 0 2024-07-01 04:02:32+00:00 115.0\n", + " 1 2024-07-01 04:05:33+00:00 112.0\n", + " 2 2024-07-01 04:08:33+00:00 116.0\n", + " 3 2024-07-01 04:10:34+00:00 121.0\n", + " 4 2024-07-01 04:13:36+00:00 122.0\n", + " ... ... ...\n", + " 40101 2024-10-01 03:43:17+00:00 108.0\n", + " 40102 2024-10-01 03:47:20+00:00 101.0\n", + " 40103 2024-10-01 03:50:22+00:00 96.0\n", + " 40104 2024-10-01 03:54:25+00:00 103.0\n", + " 40105 2024-10-01 03:57:26+00:00 103.0\n", + " \n", + " [39923 rows x 2 columns]]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "each_bgl" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\jonat\\AppData\\Local\\Temp\\ipykernel_15640\\2013028078.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " bgl['time'] = bgl['date'].dt.time\n", + "C:\\Users\\jonat\\AppData\\Local\\Temp\\ipykernel_15640\\2013028078.py:6: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " bgl['day'] = bgl['date'].dt.date\n" + ] + } + ], "source": [ "days_patients = []\n", "for bgl in each_bgl:\n", @@ -106,18 +213,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "sequences = days_patients[0]\n", - "sequences = np.concatenate(sequences)\n", - "sequences = sequences[~np.isnan(sequences)]" + "sequences = np.concatenate(sequences)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -126,7 +232,7 @@ "array([ 98., 100., 98., ..., 89., 88., 88.])" ] }, - "execution_count": 30, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -137,14 +243,20 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Model is not converging. Current: -183579.8275463407 is not greater than -178807.85189297324. Delta is -4771.975653367466\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'hmmlearn'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[15], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m GMMHMM(n_components\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, n_mix\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m) \n\u001b[1;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43msequences\u001b[49m\u001b[43m)\u001b[49m \n", + "File \u001b[1;32mc:\\Users\\jonat\\Documents\\Code\\WAT.ai\\bg_control-1\\0_meal_identification\\meal_identification\\venv\\Lib\\site-packages\\sktime\\annotation\\base\\_base.py:153\u001b[0m, in \u001b[0;36mBaseSeriesAnnotator.fit\u001b[1;34m(self, X, Y)\u001b[0m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_Y \u001b[38;5;241m=\u001b[39m Y\n\u001b[0;32m 151\u001b[0m \u001b[38;5;66;03m# fkiraly: insert checks/conversions here, after PR #1012 I suggest\u001b[39;00m\n\u001b[1;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 155\u001b[0m \u001b[38;5;66;03m# this should happen last\u001b[39;00m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_fitted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\jonat\\Documents\\Code\\WAT.ai\\bg_control-1\\0_meal_identification\\meal_identification\\venv\\Lib\\site-packages\\sktime\\annotation\\hmm_learn\\gmm.py:155\u001b[0m, in \u001b[0;36mGMMHMM._fit\u001b[1;34m(self, X, Y)\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_fit\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, Y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 154\u001b[0m \u001b[38;5;66;03m# import inside _fit to avoid hard dependency.\u001b[39;00m\n\u001b[1;32m--> 155\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhmmlearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhmm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GMMHMM \u001b[38;5;28;01mas\u001b[39;00m _GMMHMM\n\u001b[0;32m 157\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_hmm_estimator \u001b[38;5;241m=\u001b[39m _GMMHMM(\n\u001b[0;32m 158\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_components,\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_mix,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 176\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimplementation,\n\u001b[0;32m 177\u001b[0m )\n\u001b[0;32m 178\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m_fit(X, Y)\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'hmmlearn'" ] } ], @@ -170,7 +282,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.5" }, "orig_nbformat": 4 }, From 7b15ebd643ffbb1aea92251f41bb76a197e9938a Mon Sep 17 00:00:00 2001 From: Jonathan Gong Date: Fri, 18 Oct 2024 17:16:53 -0400 Subject: [PATCH 03/51] tried diff data processing didnt do anything --- .../1.01-jg-GMMHMM-implementation.ipynb | 321 ++++++++---------- ...vr-time-series-gaussian-segmentation.ipynb | 9 +- 2 files changed, 139 insertions(+), 191 deletions(-) diff --git a/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb index 28eb81a..5714812 100644 --- a/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb +++ b/0_meal_identification/meal_identification/notebooks/1.01-jg-GMMHMM-implementation.ipynb @@ -41,14 +41,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 164, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_97377/895166927.py:4: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_13753/895166927.py:4: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", " csv_files.append(pd.read_csv(f'../data/raw/{file}'))\n" ] } @@ -62,25 +62,33 @@ }, { "cell_type": "code", - "execution_count": 502, + "execution_count": 165, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_97377/4223640982.py:4: SettingWithCopyWarning: \n", + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_13753/2409137033.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", - "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_97377/4223640982.py:4: SettingWithCopyWarning: \n", + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='%Y-%m-%d %H:%M:%S%z', errors='coerce', utc=True)\n", + "/var/folders/22/0fj82gz10j5cvdcxtqcw5yc80000gn/T/ipykernel_13753/2409137033.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n" + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='%Y-%m-%d %H:%M:%S%z', errors='coerce', utc=True)\n" ] } ], @@ -88,143 +96,60 @@ "each_bgl =[]\n", "for file in csv_files:\n", " each_df = file[[\"date\", \"bgl\"]]\n", - " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='mixed', utc=True)\n", - " each_bgl.append(each_df.dropna(subset=[\"bgl\"], how='all'))" - ] - }, - { - "cell_type": "code", - "execution_count": 503, - "metadata": {}, - "outputs": [], - "source": [ - "each_bgl[0].set_index('date', inplace=True)\n", - "df_resampled_0 = each_bgl[0].resample('5min').mean().interpolate(method='linear')" + " each_df[\"date\"] = pd.to_datetime(each_df[\"date\"], format='%Y-%m-%d %H:%M:%S%z', errors='coerce', utc=True)\n", + " each_df = each_df.dropna(subset=['date'])\n", + " each_df = each_df.dropna(subset=[\"bgl\"])\n", + " each_df = each_df.sort_values('date')\n", + " each_df = each_df.drop_duplicates(subset=['date'])\n", + " print(type(each_df))\n", + " each_df.set_index('date', inplace=True)\n", + " each_bgl.append(each_df)" ] }, { "cell_type": "code", - "execution_count": 504, - "metadata": {}, - "outputs": [], - "source": [ - "each_bgl[1].set_index('date', inplace=True)\n", - "df_resampled_1 = each_bgl[1].resample('5min').mean().interpolate(method='linear')" - ] - }, - { - "cell_type": "code", - "execution_count": 505, + "execution_count": 166, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bgl
date
2024-07-01 05:00:00+00:0098.0
2024-07-01 05:05:00+00:00100.0
2024-07-01 05:10:00+00:0098.0
2024-07-01 05:15:00+00:0094.0
2024-07-01 05:20:00+00:0094.0
......
2024-10-01 04:35:00+00:0092.5
2024-10-01 04:40:00+00:0087.0
2024-10-01 04:45:00+00:0088.0
2024-10-01 04:50:00+00:0089.0
2024-10-01 04:55:00+00:0088.0
\n", - "

26496 rows × 1 columns

\n", - "
" - ], "text/plain": [ - " bgl\n", - "date \n", - "2024-07-01 05:00:00+00:00 98.0\n", - "2024-07-01 05:05:00+00:00 100.0\n", - "2024-07-01 05:10:00+00:00 98.0\n", - "2024-07-01 05:15:00+00:00 94.0\n", - "2024-07-01 05:20:00+00:00 94.0\n", - "... ...\n", - "2024-10-01 04:35:00+00:00 92.5\n", - "2024-10-01 04:40:00+00:00 87.0\n", - "2024-10-01 04:45:00+00:00 88.0\n", - "2024-10-01 04:50:00+00:00 89.0\n", - "2024-10-01 04:55:00+00:00 88.0\n", - "\n", - "[26496 rows x 1 columns]" + "array([ 39., 42., 43., 44., 45., 46., 47., 48., 50., 51., 52.,\n", + " 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63.,\n", + " 64., 65., 66., 67., 68., 69., 70., 71., 72., 73., 74.,\n", + " 75., 76., 77., 78., 79., 80., 81., 82., 83., 84., 85.,\n", + " 86., 87., 88., 89., 90., 91., 92., 93., 94., 95., 96.,\n", + " 97., 98., 99., 100., 101., 102., 103., 104., 105., 106., 107.,\n", + " 108., 109., 110., 111., 112., 113., 114., 115., 116., 117., 118.,\n", + " 119., 120., 121., 122., 123., 124., 125., 126., 127., 128., 129.,\n", + " 130., 131., 132., 133., 134., 135., 136., 137., 138., 139., 140.,\n", + " 141., 142., 143., 144., 145., 146., 147., 148., 149., 150., 151.,\n", + " 152., 153., 154., 155., 156., 157., 158., 159., 160., 161., 162.,\n", + " 163., 164., 165., 166., 167., 168., 169., 170., 171., 172., 173.,\n", + " 174., 175., 176., 177., 178., 179., 180., 181., 182., 183., 184.,\n", + " 185., 186., 187., 188., 189., 190., 191., 192., 193., 194., 195.,\n", + " 196., 197., 198., 199., 200., 201., 202., 203., 204., 205., 206.,\n", + " 207., 208., 209., 210., 211., 212., 213., 214., 215., 216., 217.,\n", + " 218., 219., 220., 221., 222., 223., 224., 225., 226., 227., 228.,\n", + " 229., 230., 231., 232., 233., 234., 235., 236., 237., 238., 239.,\n", + " 240., 241., 242., 243., 244., 245., 246., 247., 248., 249., 250.,\n", + " 251., 252., 253., 254., 255., 256., 257., 258., 259., 260., 262.,\n", + " 263., 264., 265., 266., 267., 268., 269., 270., 271., 272., 273.,\n", + " 274., 277., 280., 282., 283., 284., 285., 288., 291., 298., 299.])" ] }, - "execution_count": 505, + "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_resampled_0" + "np.unique(each_bgl[0])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -246,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 600, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -256,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 507, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -265,7 +190,7 @@ "26496" ] }, - "execution_count": 507, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -276,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 601, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -285,23 +210,23 @@ }, { "cell_type": "code", - "execution_count": 602, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", - "scaled_data = scaler.fit_transform(sequences)" + "scaled_data = scaler.fit_transform(each_bgl[0])" ] }, { "cell_type": "code", - "execution_count": 624, + "execution_count": 201, "metadata": {}, "outputs": [], "source": [ "from sklearn.cluster import KMeans\n", - "data_used = scaled_data\n", + "data_used = each_bgl[0][3500:4000]\n", "n_components = 2 # Number of components in the GMMHMM\n", "kmeans = KMeans(n_clusters=n_components).fit(data_used)\n", "initial_means = kmeans.cluster_centers_" @@ -309,17 +234,17 @@ }, { "cell_type": "code", - "execution_count": 553, + "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[0.2734143 ],\n", - " [0.41113593]])" + "array([[113.27272727],\n", + " [ 96.47368421]])" ] }, - "execution_count": 553, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } @@ -338,23 +263,23 @@ }, { "cell_type": "code", - "execution_count": 625, + "execution_count": 202, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 1 -66294.38587971 +nan\n", - " 2 -25567.55553585 +40726.83034387\n", - " 3 -29680.95006715 -4113.39453131\n", - "Model is not converging. Current: -29680.950067150698 is not greater than -25567.555535845248. Delta is -4113.39453130545\n" + " 1 -2524.08505622 +nan\n", + " 2 -2468.64703189 +55.43802433\n", + " 3 -2475.35984899 -6.71281710\n", + "Model is not converging. Current: -2475.359848988479 is not greater than -2468.647031889409. Delta is -6.71281709906998\n" ] }, { "data": { "text/html": [ - "
GMMHMM(covariance_type='full', n_components=2, n_iter=100, n_mix=2,\n",
-       "       verbose=True)
Please rerun this cell to show the HTML repr or trust the notebook.