From 00e8992050aafc265ca604f0686953adba0bbb02 Mon Sep 17 00:00:00 2001 From: Bhargav Kowshik Date: Sun, 15 Sep 2024 17:32:38 +0530 Subject: [PATCH] Prepare clean datasets for Shiny app --- frontend/app.py | 29 +--- isl_2024/_modidx.py | 3 +- isl_2024/clean_datasets.py | 69 +++++++++ isl_2024/scrape_wallstream.py | 2 +- nbs/03_scrape_wallstream.ipynb | 2 +- nbs/04_clean_datasets.ipynb | 256 +++++++++++++++++++++++++++++++++ 6 files changed, 335 insertions(+), 26 deletions(-) create mode 100644 isl_2024/clean_datasets.py create mode 100644 nbs/04_clean_datasets.ipynb diff --git a/frontend/app.py b/frontend/app.py index 8a0e53a..245a29e 100644 --- a/frontend/app.py +++ b/frontend/app.py @@ -24,34 +24,17 @@ def value(): return f"{input.daterange()[0]} to {input.daterange()[1]}" script_dir = os.path.dirname(os.path.abspath(__file__)) - print(script_dir) parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir)) log_dir = os.path.join(parent_dir, 'logs') - data_dir = os.path.join(parent_dir, 'data') + data_dir = os.path.join(parent_dir, 'data/clean') # Shiny app will use cleaned datasets. 
@render.data_frame def matches_df(): - with open(os.path.join(data_dir, 'matches.txt'), encoding='utf-8') as f: - matches = json.loads(f.readlines()[-1])['matches'] - - df = [] - for match in matches: - df.append({ - 'start_at': match['start_date'], - 'end_at': match['end_date'], - 'home_team': match['participants'][0]['name'], - 'away_team': match['participants'][1]['name'], - 'score': match['winning_margin'], - }) - df = pd.DataFrame(df) - df['date'] = pd.to_datetime(df['start_at']).dt.strftime("%Y-%m-%d") - df['start_at'] = pd.to_datetime(df['start_at']) - df['end_at'] = pd.to_datetime(df['start_at']) - df['start_time'] = pd.to_datetime(df['start_at']).dt.strftime("%H:%M") - df['end_time'] = pd.to_datetime(df['end_at']).dt.strftime("%H:%M") - df['match_id'] = df.index + 1 - print(df.head()) + df = pd.read_csv(os.path.join(data_dir, 'matches.csv'), parse_dates=["start_at", "end_at"]) + df['date'] = df['start_at'].dt.strftime("%Y-%m-%d") + df['start_time'] = df['start_at'].dt.strftime("%I:%M %p") + df['end_time'] = df['end_at'].dt.strftime("%I:%M %p") df_render = df.sort_values(by="start_at", ascending=False) \ .loc[(df['date'] >= str(input.daterange()[0])) & (df['date'] <= str(input.daterange()[1])), @@ -70,4 +53,4 @@ def matches_df(): width="100%" ) -app = App(app_ui, server) \ No newline at end of file +app = App(app_ui, server) diff --git a/isl_2024/_modidx.py b/isl_2024/_modidx.py index d3422f3..c4bd0b1 100644 --- a/isl_2024/_modidx.py +++ b/isl_2024/_modidx.py @@ -5,7 +5,8 @@ 'doc_host': 'https://bkowshik.github.io', 'git_url': 'https://github.com/bkowshik/isl-2024', 'lib_path': 'isl_2024'}, - 'syms': { 'isl_2024.core': {'isl_2024.core.foo': ('core.html#foo', 'isl_2024/core.py')}, + 'syms': { 'isl_2024.clean_datasets': {}, + 'isl_2024.core': {'isl_2024.core.foo': ('core.html#foo', 'isl_2024/core.py')}, 'isl_2024.scrape_live_stats': { 'isl_2024.scrape_live_stats.fetch_live_stats': ( 'scrape_live_stats.html#fetch_live_stats', 'isl_2024/scrape_live_stats.py')}, 
'isl_2024.scrape_matches': {}, diff --git a/isl_2024/clean_datasets.py b/isl_2024/clean_datasets.py new file mode 100644 index 0000000..d63eb51 --- /dev/null +++ b/isl_2024/clean_datasets.py @@ -0,0 +1,69 @@ +"""Clean raw datasets for the Shiny app.""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_clean_datasets.ipynb. + +# %% auto 0 +__all__ = ['parent_dir', 'log_dir', 'data_dir', 'clean_data_dir', 'df'] + +# %% ../nbs/04_clean_datasets.ipynb 2 +import warnings +warnings.filterwarnings('ignore') + +import json +import logging +import os +import requests + +import pandas as pd + +# NOTE: Had to install the package with the following command for the import to work. +# python3 -m pip install -e '.[dev]' +from .utils import * + +# %% ../nbs/04_clean_datasets.ipynb 4 +try: + # This will work when running as a script + script_dir = os.path.dirname(os.path.abspath(__file__)) +except NameError: + # This will work when running in a Jupyter notebook + script_dir = os.getcwd() + +parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir)) +log_dir = os.path.join(parent_dir, 'logs') +data_dir = os.path.join(parent_dir, 'data') +clean_data_dir = os.path.join(parent_dir, 'data/clean') + +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +if not os.path.exists(data_dir): + os.makedirs(data_dir) + +if not os.path.exists(clean_data_dir): + os.makedirs(clean_data_dir) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename=os.path.join(log_dir, 'clean_datasets.log'), filemode='a') + +# %% ../nbs/04_clean_datasets.ipynb 7 +with open(os.path.join(data_dir, 'matches.txt'), encoding='utf-8') as f: + matches = json.loads(f.readlines()[-1])['matches'] + +df = [] +for match in matches: + df.append({ + 'start_at': match['start_date'], + 'end_at': match['end_date'], + 'home_team': match['participants'][0]['name'], + 'away_team': match['participants'][1]['name'], + 'score': match['winning_margin'], + }) +df = 
pd.DataFrame(df) +df['start_at'] = pd.to_datetime(df['start_at']) +df['end_at'] = pd.to_datetime(df['end_at']) +df['match_id'] = df.index + 1 +df = df[['match_id', 'start_at', 'end_at', 'home_team', 'away_team', 'score']] + +df.to_csv(os.path.join(clean_data_dir, 'matches.csv'), index=False) + +print(df.shape) +df.head() diff --git a/isl_2024/scrape_wallstream.py b/isl_2024/scrape_wallstream.py index 0d0432a..cf839fa 100644 --- a/isl_2024/scrape_wallstream.py +++ b/isl_2024/scrape_wallstream.py @@ -29,7 +29,7 @@ parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir)) log_dir = os.path.join(parent_dir, 'logs') data_dir = os.path.join(parent_dir, 'data/wallstream') - + if not os.path.exists(log_dir): os.makedirs(log_dir) diff --git a/nbs/03_scrape_wallstream.ipynb b/nbs/03_scrape_wallstream.ipynb index fc01e59..b28a582 100644 --- a/nbs/03_scrape_wallstream.ipynb +++ b/nbs/03_scrape_wallstream.ipynb @@ -67,7 +67,7 @@ "parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))\n", "log_dir = os.path.join(parent_dir, 'logs')\n", "data_dir = os.path.join(parent_dir, 'data/wallstream')\n", - " \n", + "\n", "if not os.path.exists(log_dir):\n", " os.makedirs(log_dir)\n", "\n", diff --git a/nbs/04_clean_datasets.ipynb b/nbs/04_clean_datasets.ipynb new file mode 100644 index 0000000..3369c67 --- /dev/null +++ b/nbs/04_clean_datasets.ipynb @@ -0,0 +1,256 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "#| default_exp clean_datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clean datasets\n", + "\n", + "> Clean raw datasets for the Shiny app." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import json\n", + "import logging\n", + "import os\n", + "import requests\n", + "\n", + "import pandas as pd\n", + "\n", + "# NOTE: Had to install the package with the following command for the import to work.\n", + "# python3 -m pip install -e '.[dev]'\n", + "from isl_2024.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "try:\n", + " # This will work when running as a script\n", + " script_dir = os.path.dirname(os.path.abspath(__file__))\n", + "except NameError:\n", + " # This will work when running in a Jupyter notebook\n", + " script_dir = os.getcwd()\n", + "\n", + "parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))\n", + "log_dir = os.path.join(parent_dir, 'logs')\n", + "data_dir = os.path.join(parent_dir, 'data')\n", + "clean_data_dir = os.path.join(parent_dir, 'data/clean')\n", + "\n", + "if not os.path.exists(log_dir):\n", + " os.makedirs(log_dir)\n", + "\n", + "if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", + "\n", + "if not os.path.exists(clean_data_dir):\n", + " os.makedirs(clean_data_dir)\n", + "\n", + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename=os.path.join(log_dir, 'clean_datasets.log'), filemode='a')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Matches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "(84, 6)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_idstart_atend_athome_teamaway_teamscore
012024-09-13 19:30:00+05:302024-09-13 19:30:00+05:30Mohun Bagan Super GiantMumbai City FC2 - 2
122024-09-14 17:00:00+05:302024-09-14 17:00:00+05:30Odisha FCChennaiyin FC2 - 3
232024-09-14 19:30:00+05:302024-09-14 19:30:00+05:30Bengaluru FCEast Bengal FC1 - 0
342024-09-15 19:30:00+05:302024-09-15 19:30:00+05:30Kerala Blasters FCPunjab FC
452024-09-16 19:30:00+05:302024-09-16 19:30:00+05:30Mohammedan SCNorthEast United FC
\n", + "
" + ], + "text/plain": [ + " match_id start_at end_at \\\n", + "0 1 2024-09-13 19:30:00+05:30 2024-09-13 19:30:00+05:30 \n", + "1 2 2024-09-14 17:00:00+05:30 2024-09-14 17:00:00+05:30 \n", + "2 3 2024-09-14 19:30:00+05:30 2024-09-14 19:30:00+05:30 \n", + "3 4 2024-09-15 19:30:00+05:30 2024-09-15 19:30:00+05:30 \n", + "4 5 2024-09-16 19:30:00+05:30 2024-09-16 19:30:00+05:30 \n", + "\n", + " home_team away_team score \n", + "0 Mohun Bagan Super Giant Mumbai City FC 2 - 2 \n", + "1 Odisha FC Chennaiyin FC 2 - 3 \n", + "2 Bengaluru FC East Bengal FC 1 - 0 \n", + "3 Kerala Blasters FC Punjab FC \n", + "4 Mohammedan SC NorthEast United FC " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| export\n", + "\n", + "with open(os.path.join(data_dir, 'matches.txt'), encoding='utf-8') as f:\n", + " matches = json.loads(f.readlines()[-1])['matches']\n", + "\n", + "df = []\n", + "for match in matches:\n", + " df.append({\n", + " 'start_at': match['start_date'],\n", + " 'end_at': match['end_date'],\n", + " 'home_team': match['participants'][0]['name'],\n", + " 'away_team': match['participants'][1]['name'],\n", + " 'score': match['winning_margin'],\n", + " })\n", + "df = pd.DataFrame(df)\n", + "df['start_at'] = pd.to_datetime(df['start_at'])\n", + "df['end_at'] = pd.to_datetime(df['end_at'])\n", + "df['match_id'] = df.index + 1\n", + "df = df[['match_id', 'start_at', 'end_at', 'home_team', 'away_team', 'score']]\n", + "\n", + "df.to_csv(os.path.join(clean_data_dir, 'matches.csv'), index=False)\n", + "\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}