From ea543b7f97f175c06650978602596a99d58ef288 Mon Sep 17 00:00:00 2001 From: Mike Rossetti Date: Sat, 9 Jan 2021 15:14:22 -0500 Subject: [PATCH] Author: MJ Rossetti (prof.mj.rossetti@gmail.com) | License: CC BY-NC-SA (https://creativecommons.org/licenses/by-nc-sa/4.0/) --- .../packages/Pandas_Package_Overview.ipynb | 1057 +++++++++++++++++ 1 file changed, 1057 insertions(+) create mode 100644 notes/python/packages/Pandas_Package_Overview.ipynb diff --git a/notes/python/packages/Pandas_Package_Overview.ipynb b/notes/python/packages/Pandas_Package_Overview.ipynb new file mode 100644 index 00000000..a0617487 --- /dev/null +++ b/notes/python/packages/Pandas_Package_Overview.ipynb @@ -0,0 +1,1057 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Pandas Package Overview", + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyOdk7hXHke/UIru2a0Rd9Qo", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8W53f6yM1n_1" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "MnMfMHnfT4LS", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "18a1a12d-eb57-408d-aad3-2722521b081b" + }, + "source": [ + "\n", + "#\n", + "# SETUP CELL (DOWNLOAD A CSV FILE TO COLAB)\n", + "#\n", + "# to see the filesystem in colab, click the file folder in the left navbar\n", + "\n", + "# this is the os module. very helpful for managing the filesystem\n", + "# reference: https://docs.python.org/3/library/os.html\n", + "import os\n", + "\n", + "csv_filepath = \"jeter_stats.csv\"\n", + "print(csv_filepath)\n", + "\n", + "if not os.path.isfile(csv_filepath):\n", + " print(\"DOWNLOADING DATA...\")\n", + " # FYI: this wget command is a terminal command, NOT python\n", + " # ... in colab, we can execute terminal commands by prefixing them with an exclamation point\n", + " # ... students are not responsible for knowing terminal commands like this\n", + " !wget -q https://raw.githubusercontent.com/prof-rossetti/intro-to-python/master/data/jeter_stats.csv \n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "jeter_stats.csv\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_wGQ8pr3QFdd" + }, + "source": [ + "# The `pandas` Package\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p3IzoCwOQIFZ" + }, + "source": [ + "\n", + "Reference:\n", + "\n", + " + [Pandas Website](http://pandas.pydata.org/)\n", + " + [Pandas Source](https://github.com/pandas-dev/pandas)\n", + " + [Pandas Docs](http://pandas.pydata.org/pandas-docs/stable/)\n", + " + [Pandas Docs - API Reference](https://pandas.pydata.org/pandas-docs/stable/reference/index.html)\n", + "\n", + "\n", + "The `pandas` package provides capabilities for working with structured data, including spreadsheet-like objects called \"DataFrames\".\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GnWv3RChlhbK" + }, + "source": [ + "# Pandas Data Frames\n", + "\n", + "The Pandas [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) datatype represents a table of data, like a spreadsheet.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-IGSd12Imm4W" + }, + "source": [ + "\n", + "\n", + "## Creating Data Frames\n", + "\n", + "We can create `DataFrame` objects from CSV files or create them ourselves from eligible data structures, including: a list of lists, a dictionary of lists, and a list of dictionaries.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8iPQn-hSzz6s" + }, + "source": [ + "### From CSV\n", + "\n", + "Use the pandas [`read_csv()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) function to import a CSV file to a new `DataFrame` object.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rDecK8aB2OYg", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2bba0037-40bf-4db8-bfb3-b4c0f24b1d3a" + }, + "source": [ + "\n", + "from pandas import read_csv\n", + "\n", + "# READING CSV FILES\n", + "\n", + "stats_df = read_csv(csv_filepath) \n", + "print(type(stats_df))\n", + "print(stats_df.head())" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + " year games at_bats runs hits walks\n", + "0 1995 15 48 5 12 3\n", + "1 1996 157 582 104 183 48\n", + "2 1997 159 654 116 190 74\n", + "3 1998 149 626 127 203 57\n", + "4 1999 158 627 134 219 91\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J3SNaU8nvKvW" + }, + "source": [ + "\n", + "### From List of Lists\n", + "\n", + "When creating a dataframe from a list of lists, column names will be numeric by default, unless you set them yourself (during or after initialization).\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IbyAPmfGp9tX", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2c8ed1a7-ead0-4f4f-e7bc-7231e16990d1" + }, + "source": [ + "from pandas import DataFrame\n", + "\n", + "my_list = [\n", + " [1, \"A\"],\n", + " [2, \"B\"],\n", + " [3, \"C\"]\n", + "]\n", + "\n", + "df = DataFrame(my_list)\n", + "print(df.columns)\n", + "print(df)\n", + "\n", + "df.columns = [\"id\", \"grade\"] # can set column names after initialization\n", + "print(df.columns)\n", + "print(df)\n", + "\n", + "df = DataFrame(my_list, columns=[\"id\", \"grade\"]) # can set column names during initialization\n", + "print(df.columns)\n", + "print(df)\n", + "\n", + "# alternative variation, if you want to treat a certain column as the index\n", + "# but usually leaving the default index is fine\n", + "#df.set_index(\"id\", inplace=True)\n", + "#print(df.columns)\n", + "#print(df)\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "RangeIndex(start=0, stop=2, step=1)\n", + " 0 1\n", + "0 1 A\n", + "1 2 B\n", + "2 3 C\n", + "Index(['id', 'grade'], dtype='object')\n", + " id grade\n", + "0 1 A\n", + "1 2 B\n", + "2 3 C\n", + "Index(['id', 'grade'], dtype='object')\n", + " id grade\n", + "0 1 A\n", + "1 2 B\n", + "2 3 C\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lKyuPDiepA3c" + }, + "source": [ + "\n", + "\n", + "### From Dictionary of Lists\n", + "\n", + "When creating a dataframe from a dictionary of lists, column names will be the dictionary keys. \n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "C1Cgqlxcw0IT", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "de862b2a-9e02-4d68-886b-320b341bd345" + }, + "source": [ + "# from pandas import DataFrame\n", + "\n", + "my_dict = {\n", + " \"id\": [1,2,3],\n", + " \"grade\": [\"A\", \"B\", \"C\"]\n", + "}\n", + "\n", + "df = DataFrame(my_dict)\n", + "print(df.columns)\n", + "print(df)\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Index(['id', 'grade'], dtype='object')\n", + " id grade\n", + "0 1 A\n", + "1 2 B\n", + "2 3 C\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "55Zuc4wmvqR5" + }, + "source": [ + "### From List of Dictionaries (Records)\n", + "\n", + "When creating a dataframe from a list of dictionaries, column names will be the dictionary keys. \n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JicP6m2xxUHy", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3662a598-7183-4bf4-e797-d16c0efdfdc1" + }, + "source": [ + "# from pandas import DataFrame\n", + "\n", + "my_records = [\n", + " {\"id\": 1, \"grade\":\"A\"},\n", + " {\"id\": 2, \"grade\":\"B\"},\n", + " {\"id\": 3, \"grade\":\"C\"}\n", + "]\n", + "\n", + "df = DataFrame(my_records)\n", + "print(df.columns)\n", + "print(df)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Index(['id', 'grade'], dtype='object')\n", + " id grade\n", + "0 1 A\n", + "1 2 B\n", + "2 3 C\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6kSx9H5uwpyM" + }, + "source": [ + "## Using Data Frames\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UAQe5FBFvt52" + }, + "source": [ + "\n", + "### Row Operations\n", + "\n", + "References:\n", + " + [`DataFrame.iloc[]`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html)\n", + " + [`DataFrame.iterrows()`](http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3kDLYJtP05Lq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "30863b3f-b021-46ce-847f-1bf310ff2bc9" + }, + "source": [ + "\n", + "print(\"--------------------------\")\n", + "print(\"INSPECT FIRST/LAST X ROWS:\")\n", + "print(stats_df.head(3))\n", + "print(stats_df.tail(3))\n", + "\n", + "print(\"--------------------------\")\n", + "print(\"COUNTING ROWS:\")\n", + "print(len(stats_df))\n", + "\n", + "print(\"--------------------------\")\n", + "print(\"LOOPING THROUGH ROWS:\")\n", + "for index, row in stats_df.iterrows():\n", + " print(row[\"year\"])\n", + "\n", + "print(\"--------------------------\")\n", + "print(\"REFERENCE A SPECIFIC ROW:\")\n", + "print(stats_df.iloc[0])\n", + "\n", + "print(\"--------------------------\")\n", + "print(\"CONVERT ROW TO DICTIONARY:\")\n", + "print(stats_df.iloc[0].to_dict())\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--------------------------\n", + "INSPECT FIRST/LAST X ROWS:\n", + " year games at_bats runs hits walks\n", + "0 1995 15 48 5 12 3\n", + "1 1996 157 582 104 183 48\n", + "2 1997 159 654 116 190 74\n", + " year games at_bats runs hits walks\n", + "17 2012 159 683 99 216 45\n", + "18 2013 17 63 8 12 8\n", + "19 2014 145 581 47 149 35\n", + "--------------------------\n", + "COUNTING ROWS:\n", + "20\n", + "--------------------------\n", + "LOOPING THROUGH ROWS:\n", + "1995\n", + "1996\n", + "1997\n", + "1998\n", + "1999\n", + "2000\n", + "2001\n", + "2002\n", + "2003\n", + "2004\n", + "2005\n", + "2006\n", + "2007\n", + "2008\n", + "2009\n", + "2010\n", + "2011\n", + "2012\n", + "2013\n", + "2014\n", + "--------------------------\n", + "REFERENCE A SPECIFIC ROW:\n", + "year 1995\n", + "games 15\n", + "at_bats 48\n", + "runs 5\n", + "hits 12\n", + "walks 3\n", + "Name: 0, dtype: int64\n", + "--------------------------\n", + "CONVERT ROW TO DICTIONARY:\n", + "{'year': 1995, 'games': 15, 'at_bats': 48, 'runs': 5, 'hits': 12, 'walks': 3}\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O90d9mJOlqFc" + }, + "source": [ + "### Column (`Series`) Operations \n", + "\n", + "A [`Series`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html) object represents a column in the dataframe. Each series has as many values as the dataframe has rows." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7ADQKDzFpgRp", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b63f36ce-7f5f-49eb-ba9e-342ea9b99620" + }, + "source": [ + "# selecting multiple columns yields a dataframe object\n", + "print(\"--------------------\")\n", + "print(\"SELECTING MANY COLUMNS:\")\n", + "print(type(stats_df[[\"games\", \"year\"]]))\n", + "print(stats_df[[\"games\", \"year\"]])\n", + "\n", + "# selecting one column yields a series object\n", + "print(\"--------------------\")\n", + "print(\"SELECTING ONE COLUMN:\")\n", + "print(type(stats_df[\"games\"]))\n", + "print(stats_df[\"games\"])" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--------------------\n", + "SELECTING MANY COLUMNS:\n", + "\n", + " games year\n", + "0 15 1995\n", + "1 157 1996\n", + "2 159 1997\n", + "3 149 1998\n", + "4 158 1999\n", + "5 148 2000\n", + "6 150 2001\n", + "7 157 2002\n", + "8 119 2003\n", + "9 154 2004\n", + "10 159 2005\n", + "11 154 2006\n", + "12 156 2007\n", + "13 150 2008\n", + "14 153 2009\n", + "15 157 2010\n", + "16 131 2011\n", + "17 159 2012\n", + "18 17 2013\n", + "19 145 2014\n", + "--------------------\n", + "SELECTING ONE COLUMN:\n", + "\n", + "0 15\n", + "1 157\n", + "2 159\n", + "3 149\n", + "4 158\n", + "5 148\n", + "6 150\n", + "7 157\n", + "8 119\n", + "9 154\n", + "10 159\n", + "11 154\n", + "12 156\n", + "13 150\n", + "14 153\n", + "15 157\n", + "16 131\n", + "17 159\n", + "18 17\n", + "19 145\n", + "Name: games, dtype: int64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0_TQE71jpgUG" + }, + "source": [ + "Getting values and [`value_counts()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html) for a given series:" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "moGEkUDFpe_C", + "outputId": "d2ca1c4c-0914-44ea-9d03-44e1138e7d68" + }, + "source": [ + "\n", + "print(\"--------------------\")\n", + "print(\"VALUES:\")\n", + "print(stats_df[\"games\"].values)\n", + "\n", + "print(\"--------------------\")\n", + "print(\"DISTINCT VALUES:\")\n", + "print(stats_df[\"games\"].unique())\n", + "\n", + "print(\"--------------------\")\n", + "print(\"VALUE COUNTS:\")\n", + "print(stats_df[\"games\"].value_counts())\n", + "\n", + "print(\"--------------------\")\n", + "print(\"VALUE COUNTS (NORMALIZED):\")\n", + "print(stats_df[\"games\"].value_counts(normalize=True))\n", + "\n", + "# FYI: can convert this result to its own df:\n", + "# counts_df = stats_df[\"games\"].value_counts().to_frame()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--------------------\n", + "VALUES:\n", + "[ 15 157 159 149 158 148 150 157 119 154 159 154 156 150 153 157 131 159\n", + " 17 145]\n", + "--------------------\n", + "DISTINCT VALUES:\n", + "[ 15 157 159 149 158 148 150 119 154 156 153 131 17 145]\n", + "--------------------\n", + "VALUE COUNTS:\n", + "159 3\n", + "157 3\n", + "154 2\n", + "150 2\n", + "158 1\n", + "156 1\n", + "153 1\n", + "119 1\n", + "149 1\n", + "148 1\n", + "17 1\n", + "15 1\n", + "145 1\n", + "131 1\n", + "Name: games, dtype: int64\n", + "--------------------\n", + "VALUE COUNTS (NORMALIZED):\n", + "159 0.15\n", + "157 0.15\n", + "154 0.10\n", + "150 0.10\n", + "158 0.05\n", + "156 0.05\n", + "153 0.05\n", + "119 0.05\n", + "149 0.05\n", + "148 0.05\n", + "17 0.05\n", + "15 0.05\n", + "145 0.05\n", + "131 0.05\n", + "Name: games, dtype: float64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GHeCDWI282O4" + }, + "source": [ + "We can aggregate values in a series:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qvHXtlnf7mnm", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6c0d8484-5699-4702-f59f-8dc1e7278d2a" + }, + "source": [ + "\n", + "print(\"----------------------------\")\n", + "print(\"SERIES AGGREGATIONS...\")\n", + "\n", + "print(\"SUM:\", stats_df[\"games\"].sum()) \n", + "print(\"COUNT DISTINCT:\", stats_df[\"games\"].nunique())\n", + "\n", + "print(\"MIN:\", stats_df[\"games\"].min()) \n", + "print(\"MAX:\", stats_df[\"games\"].max())\n", + "\n", + "print(\"MEAN:\", stats_df[\"games\"].mean()) \n", + "print(\"MEDIAN:\", stats_df[\"games\"].median()) \n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "----------------------------\n", + "SERIES AGGREGATIONS...\n", + "SUM: 2747\n", + "COUNT DISTINCT: 14\n", + "MIN: 15\n", + "MAX: 159\n", + "MEAN: 137.35\n", + "MEDIAN: 153.5\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uqT894A2UJfd", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7e1673a3-297b-4d9f-8e14-297c7050b17c" + }, + "source": [ + "stats_df[\"games\"].describe()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "count 20.000000\n", + "mean 137.350000\n", + "std 42.671666\n", + "min 15.000000\n", + "25% 147.250000\n", + "50% 153.500000\n", + "75% 157.000000\n", + "max 159.000000\n", + "Name: games, dtype: float64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 35 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LNh7aS1B-C-1" + }, + "source": [ + "We can copy columns, perform column-wise operations, and create new ad-hoc columns as desired:\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bitN4kp7-CCX", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "59a09c51-d2e0-4231-867c-d0b15a84fb9a" + }, + "source": [ + "print(\"-------------------\")\n", + "print(\"COPYING COLUMNS...\")\n", + "stats_df[\"year_copy\"] = stats_df[\"year\"]\n", + "print(stats_df.columns)\n", + "\n", + "print(\"-------------------\")\n", + "print(\"DROPPING COLUMNS...\")\n", + "stats_df.drop(columns=[\"year_copy\"], inplace=True)\n", + "print(stats_df.columns)\n", + "\n", + "print(\"-------------------\")\n", + "print(\"COLUMN-WISE OPERATIONS...\")\n", + "\n", + "stats_df[\"batting_avg\"] = stats_df[\"hits\"] / stats_df[\"at_bats\"]\n", + "stats_df[\"on_base_pct\"] = (stats_df[\"hits\"] + stats_df[\"walks\"]) / stats_df[\"at_bats\"]\n", + "print(stats_df.columns)\n", + "print(stats_df.head())\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "-------------------\n", + "COPYING COLUMNS...\n", + "Index(['year', 'games', 'at_bats', 'runs', 'hits', 'walks', 'year_copy'], dtype='object')\n", + "-------------------\n", + "DROPPING COLUMNS...\n", + "Index(['year', 'games', 'at_bats', 'runs', 'hits', 'walks'], dtype='object')\n", + "-------------------\n", + "COLUMN-WISE OPERATIONS...\n", + "Index(['year', 'games', 'at_bats', 'runs', 'hits', 'walks', 'batting_avg',\n", + " 'on_base_pct'],\n", + " dtype='object')\n", + " year games at_bats runs hits walks batting_avg on_base_pct\n", + "0 1995 15 48 5 12 3 0.250000 0.312500\n", + "1 1996 157 582 104 183 48 0.314433 0.396907\n", + "2 1997 159 654 116 190 74 0.290520 0.403670\n", + "3 1998 149 626 127 203 57 0.324281 0.415335\n", + "4 1999 158 627 134 219 91 0.349282 0.494418\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bzuSyU_U9iHC" + }, + "source": [ + "Transforming values in a series using [`pandas.Series.map()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html) or [`pandas.Series.apply()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html):\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cfyF427c_3WE", + "outputId": "6d0ac90b-b9fd-42b1-af44-28968458cb02" + }, + "source": [ + "\n", + "def fmt_pct(number):\n", + " \"\"\" Formats a decimal number like 0.7 as a percentage: \"70.0%\" \"\"\"\n", + " return f\"{(number * 100):.1f}%\"\n", + "\n", + "stats_df[\"batting_avg_label\"] = stats_df[\"batting_avg\"].map(fmt_pct)\n", + "stats_df[\"on_base_pct_label\"] = stats_df[\"on_base_pct\"].map(fmt_pct)\n", + "\n", + "print(stats_df.head())" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " year games at_bats ... on_base_pct batting_avg_label on_base_pct_label\n", + "0 1995 15 48 ... 0.312500 25.0% 31.2%\n", + "1 1996 157 582 ... 0.396907 31.4% 39.7%\n", + "2 1997 159 654 ... 0.403670 29.1% 40.4%\n", + "3 1998 149 626 ... 0.415335 32.4% 41.5%\n", + "4 1999 158 627 ... 0.494418 34.9% 49.4%\n", + "\n", + "[5 rows x 10 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mkRc7ngeAzlb" + }, + "source": [ + "### Filtering Rows\n", + "\n", + "We can filter the rows to return only those matching a given condition:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-pI3pJu9lpzG", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3e2b7717-f0e0-4b04-cc9b-2ce9a82fa76f" + }, + "source": [ + "\n", + "print(\"FILTERING BASED ON NUMERIC OPERATIONS...\")\n", + "print(stats_df[stats_df[\"games\"] > 150])\n", + "\n", + "print(\"FILTERING BASED ON STRING OPERATIONS...\")\n", + "df = DataFrame([\n", + " {\"id\":1, \"grade\": \"A+\"},\n", + " {\"id\":2, \"grade\": \"A\"},\n", + " {\"id\":3, \"grade\": \"A-\"},\n", + " {\"id\":4, \"grade\": \"B+\"},\n", + " {\"id\":5, \"grade\": \"B\"}\n", + "])\n", + "print(df[df['grade'].str.contains(\"A\")])\n", + "\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "FILTERING BASED ON NUMERIC OPERATIONS...\n", + " year games at_bats ... on_base_pct batting_avg_label on_base_pct_label\n", + "1 1996 157 582 ... 0.396907 31.4% 39.7%\n", + "2 1997 159 654 ... 0.403670 29.1% 40.4%\n", + "4 1999 158 627 ... 0.494418 34.9% 49.4%\n", + "7 2002 157 644 ... 0.409938 29.7% 41.0%\n", + "9 2004 154 643 ... 0.363919 29.2% 36.4%\n", + "10 2005 159 654 ... 0.426606 30.9% 42.7%\n", + "11 2006 154 623 ... 0.454254 34.3% 45.4%\n", + "12 2007 156 639 ... 0.410016 32.2% 41.0%\n", + "14 2009 153 634 ... 0.447950 33.4% 44.8%\n", + "15 2010 157 663 ... 0.365008 27.0% 36.5%\n", + "17 2012 159 683 ... 0.382138 31.6% 38.2%\n", + "\n", + "[11 rows x 10 columns]\n", + "FILTERING BASED ON STRING OPERATIONS...\n", + " id grade\n", + "0 1 A+\n", + "1 2 A\n", + "2 3 A-\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R1hJRnlNELpX" + }, + "source": [ + "### Grouping and Aggregating Rows\n", + "\n", + "It is possible to use a dataframe's `groupby` method to create a `GroupBy` object, which is like a pivot-table.\n", + "\n", + "Reference:\n", + "\n", + " + [`DataFrame.groupby`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html)\n", + " + [`GroupBy`](https://pandas.pydata.org/pandas-docs/stable/reference/groupby.html)\n", + " + [`GroupBy.aggregate`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html)\n", + " + [Naming columns returned by groupby aggregate functions](https://stackoverflow.com/questions/19078325/naming-returned-columns-in-pandas-aggregate-function) lesson: ravel after groupby to rename the columns\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aBnvui2pEL64", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "feb81914-7d23-478f-b26c-bf05558791a9" + }, + "source": [ + "teams_df = DataFrame([\n", + " {\"city\": \"New York\", \"name\": \"Yankees\"},\n", + " {\"city\": \"New York\", \"name\": \"Mets\"},\n", + " {\"city\": \"Boston\", \"name\": \"Red Sox\"},\n", + " {\"city\": \"New Haven\", \"name\": \"Ravens\"}\n", + "])\n", + "\n", + "print(type(teams_df.groupby([\"city\"]))) #> DataFrameGroupBy a.k.a. GroupBy\n", + "\n", + "# can use aggregate function on the GroupBy object\n", + "teams_pivot = teams_df.groupby([\"city\"]).agg({'name': ['nunique']})\n", + "print(type(teams_pivot))\n", + "\n", + "\n", + "\n", + "# TIP: reset the columns to be a single level, to make sorting easier later\n", + "# yeah, we're using the ravel function. just treat it as boilerplate\n", + "# so we can reference the aggregated column names more easily later\n", + "print(\"-----------\")\n", + "print(\"COLUMN NAMES AFTER GROUPBY:\")\n", + "print(teams_pivot.columns)\n", + "print(teams_pivot[('name', 'nunique')]) # this is how we would need to reference the col name if we don't ravel to rename\n", + "\n", + "teams_pivot.columns = [\"_\".join(col) for col in teams_pivot.columns.ravel()]\n", + "print(\"-----------\")\n", + "print(\"COLUMN NAMES AFTER RAVEL:\")\n", + "print(teams_pivot.columns)\n", + "print(teams_pivot[\"name_nunique\"]) # this is how we reference the col name after we ravel to rename\n", + "\n", + "# sorting the pivot table\n", + "teams_pivot.sort_values(by=\"name_nunique\", ascending=False, inplace=True)\n", + "print(teams_pivot[\"name_nunique\"])\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "pandas.core.groupby.generic.DataFrameGroupBy\n", + "\n", + "-----------\n", + "COLUMN NAMES AFTER GROUPBY:\n", + "MultiIndex([('name', 'nunique')],\n", + " )\n", + "city\n", + "Boston 1\n", + "New Haven 1\n", + "New York 2\n", + "Name: (name, nunique), dtype: int64\n", + "-----------\n", + "COLUMN NAMES AFTER RAVEL:\n", + "Index(['name_nunique'], dtype='object')\n", + "city\n", + "Boston 1\n", + "New Haven 1\n", + "New York 2\n", + "Name: name_nunique, dtype: int64\n", + "city\n", + "New York 2\n", + "Boston 1\n", + "New Haven 1\n", + "Name: name_nunique, dtype: int64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Iu-vfoM7ym-K" + }, + "source": [ + "## Exporting Data Frames" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "68xPjF9WylJM" + }, + "source": [ + "### To List of Dictionaries (Records)\n", + "\n", + "Convert a data frame to list of dictionaries:\n", + "\n", + "```python\n", + "df.to_dict(\"records\") # NOTE: \"records\" is a specific parameter of the to_dict() function, not a characteristic of the underlying data\n", + "```\n", + "\n", + "### To List of Lists \n", + "\n", + "Convert a dataframe to list of lists, each representing a row in the dataframe:\n", + "\n", + "```python\n", + "df.values.tolist()\n", + "```\n", + "\n", + "### To CSV File\n", + "\n", + "Write a dataframe to a CSV file:\n", + "\n", + "```python\n", + "df.to_csv(\"my_data_copy.csv\")\n", + "```" + ] + } + ] +} \ No newline at end of file