From c3bb3272634473edbb9b1b48d2eff080982084b7 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:00:33 -0700 Subject: [PATCH 01/30] patch value counts issue in merge lecture --- lectures/pandas/merge.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lectures/pandas/merge.md b/lectures/pandas/merge.md index 96a141c7..430b4dd6 100644 --- a/lectures/pandas/merge.md +++ b/lectures/pandas/merge.md @@ -477,11 +477,13 @@ a future lecture. ```{code-cell} python users_by_n = ( ratings["user_id"] - .value_counts() # Series. Index: user_id, value: n ratings by user - .value_counts() # Series. Index: n_ratings by user, value: N_users with this many ratings + .value_counts() # Series called "count". Index: user_id, value: n ratings by user + .rename("N_ratings") # Rename the Series to "N_ratings" + .value_counts() # Series called "count". Index: n_ratings by user, value: N_users with this many ratings .sort_index() # Sort our Series by the index (number of ratings) .reset_index() # Dataframe with columns `index` (from above) and `user_id` - .rename(columns={"index": "N_ratings", "user_id": "N_users"}) + .rename(columns={"count": "N_users"}) + .set_index("N_ratings") ) users_by_n.head(10) ``` From b8df94208e4d37c251b017ea71af48c3dde450a3 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:34:29 -0700 Subject: [PATCH 02/30] Update ci.yml hopefully this works --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 769691ba..27a03e20 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: shell: bash -l {0} run: pip list - name: Download "build" folder (cache) - uses: dawidd6/action-download-artifact@v2 + uses: dawidd6/action-download-artifact@v3 with: workflow: cache.yml branch: main From 56078e6a37fc2238e3ac97f992165a06efa60286 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:36:26 -0700 Subject: [PATCH 03/30] Update ci.yml idk --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 27a03e20..8aa9d69d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: shell: bash -l {0} run: pip list - name: Download "build" folder (cache) - uses: dawidd6/action-download-artifact@v3 + uses: dawidd6/action-download-artifact@v2 with: workflow: cache.yml branch: main @@ -33,7 +33,7 @@ jobs: run: | jb build lectures --path-output ./ -W --keep-going - name: Upload Execution Reports - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 if: failure() with: name: execution-reports From 5cef199b29ae43d8256b876d1fcfe3e74213583d Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 16:56:53 -0700 Subject: [PATCH 04/30] WIP: fix quandl deprecation --- _notebook_repo/environment.yml | 1 + environment.yml | 1 + lectures/pandas/timeseries.md | 16 ++++++++-------- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/_notebook_repo/environment.yml b/_notebook_repo/environment.yml index 92f25f3b..915450d3 100644 --- a/_notebook_repo/environment.yml +++ b/_notebook_repo/environment.yml @@ -31,5 +31,6 @@ dependencies: - pandas_datareader - plotly - lxml + 
- nasdaq-data-link - conda: - python-graphviz diff --git a/environment.yml b/environment.yml index 270024e8..2d7e90ab 100644 --- a/environment.yml +++ b/environment.yml @@ -51,5 +51,6 @@ dependencies: - numba == 0.56.4 - ipywidgets == 8.0.6 - scipy == 1.10 + - nasdaq-data-link - conda: - python-graphviz diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 4ba57163..6af356da 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -38,10 +38,10 @@ kernelspec: import os import pandas as pd import matplotlib.pyplot as plt -import quandl +import nasdaqdatalink as ndl # see section on API keys at end of lecture! -quandl.ApiConfig.api_key = os.environ.get("QUANDL_AUTH", "Dn6BtVoBhzuKTuyo6hbp") +os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" start_date = "2014-05-01" %matplotlib inline @@ -197,7 +197,7 @@ The flexibility of these features is best understood through example, so let's load up some data and take a look. ```{code-cell} python -btc_usd = quandl.get("BCHARTS/BITSTAMPUSD", start_date=start_date) +btc_usd = ndl.get_table("QDL/BCHAIN") btc_usd.info() btc_usd.head() ``` @@ -471,18 +471,18 @@ See exercise 8 in the {ref}`exercise list `. Recall above that we had the line of code: ```{code-block} python -quandl.ApiConfig.api_key = "Dn6BtVoBhzuKTuyo6hbp" +os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" ``` -This line told the `quandl` library that when obtaining making requests for data, it should use the *API key* `Dn6BtVoBhzuKTuyo6hbp`. +This line told the `nasdaqdatalink` library that when obtaining making requests for data, it should use the *API key* `jEKP58z7JaX6utPkkpEp`. -An API key is a sort of password that web services (like the Quandl API) require you to provide when you make requests. +An API key is a sort of password that web services (like the Nasdaq Data Link Tables API) require you to provide when you make requests. -Using this password, we were able to make a request to Quandl to obtain data directly from them. +Using this password, we were able to make a request to Nasdaq data link to obtain data directly from them. The API key used here is one that we requested on behalf of this course. -If you plan to use Quandl more extensively, you should obtain your own personal API key from [their website](https://docs.quandl.com/docs#section-authentication) and re-run the `quandl.ApiConfig.api_key...` line of code with your new API key on the right-hand side. +If you plan to use Nasdaq data more extensively, you should obtain your own personal API key from [their website](https://www.nasdaq.com/nasdaq-data-link) and re-run the `os.environ...` line of code with your new API key on the right-hand side. (pd-tim-ex)= ## Exercises From 92240db3cf9f2e71aacb24a2740af4d47aca98b6 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:38:02 -0700 Subject: [PATCH 05/30] Update timeseries.md --- lectures/pandas/timeseries.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 6af356da..c58674f7 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -36,12 +36,13 @@ kernelspec: ```{code-cell} python import os +# see section on API keys at end of lecture! 
+os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" + import pandas as pd import matplotlib.pyplot as plt import nasdaqdatalink as ndl -# see section on API keys at end of lecture! -os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" start_date = "2014-05-01" %matplotlib inline @@ -197,17 +198,28 @@ The flexibility of these features is best understood through example, so let's load up some data and take a look. ```{code-cell} python -btc_usd = ndl.get_table("QDL/BCHAIN") +btc_usd = ndl.get_table("QDL/BCHAIN", paginate=True) btc_usd.info() btc_usd.head() ``` Here, we have the Bitcoin (BTC) to US dollar (USD) exchange rate from -March 2014 until today. +2009 until today, as well as other variables relevant to the Bitcoin ecosystem, in long ("melted") form. + +```{code-cell} python +print(btc_usd.code.unique()) +btc_usd.dtypes +``` + +Notice that the type of `date` is `datetime`. We would like this to be the index, and we want to drop the long form. We'll also select only a couple of columns of interest. (The column descriptions can be found [here](https://data.nasdaq.com/databases/BCHAIN)). We'll choose Market Price (in USD) (`MKPRU`), Total Market Cap (`MKTCP`), and Estimated Transaction Volume in USD (`ETRVU`). -Notice that the type of index is `DateTimeIndex`. +```{code-cell} python +btc_usd = btc_usd.pivot_table(index='date', columns='code', values='value') +btc_usd = btc_usd[["MKPRU", "MKTCP", "ETRVU"]] +btc_usd.head() +``` -This is the key that enables things like... +Now that we have a datetime index, it enables things like... Extracting all data for the year 2015 by passing `"2015"` to `.loc`. @@ -289,11 +301,11 @@ btc_date_column.head() ``` ```{code-cell} python -btc_date_column["Date"].dt.year.head() +btc_date_column["date"].dt.year.head() ``` ```{code-cell} python -btc_date_column["Date"].dt.month.head() +btc_date_column["date"].dt.month.head() ``` ## Leads and Lags: `df.shift` @@ -379,8 +391,8 @@ window for the whole dataset. 
```{code-cell} python fig, ax = plt.subplots(figsize=(10, 4)) -btc_usd["Open"].plot(ax=ax, linestyle="--", alpha=0.8) -btc_usd.rolling("21d").max()["Open"].plot(ax=ax, alpha=0.8, linewidth=3) +btc_usd["MKPRU"].plot(ax=ax, linestyle="--", alpha=0.8) +btc_usd.rolling("21d").max()["MKPRU"].plot(ax=ax, alpha=0.8, linewidth=3) ax.legend(["Original", "21 day max"]) ``` From 54348d7cb032e1a79e59c90c776a1c1cfb98aa15 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:50:37 -0700 Subject: [PATCH 06/30] Update environment.yml upgrade pandas --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 2d7e90ab..a0cf2405 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ dependencies: - pip - pip: # Build Requirements - - pandas == 1.5.3 + - pandas - matplotlib <= 3.8.4 - pandas-datareader == 0.10.0 - numpy == 1.23.5 From fb0c1ecccfc6b7e132d055a1041ad69d2a68fc56 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:56:12 -0700 Subject: [PATCH 07/30] Update environment.yml --- _notebook_repo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_notebook_repo/environment.yml b/_notebook_repo/environment.yml index 915450d3..c9e6b6b0 100644 --- a/_notebook_repo/environment.yml +++ b/_notebook_repo/environment.yml @@ -28,7 +28,7 @@ dependencies: - statsmodels - quantecon - openpyxl - - pandas_datareader + - pandas-datareader - plotly - lxml - nasdaq-data-link From 591850303f5dc6db16a865df7179677645cd1a84 Mon Sep 17 00:00:00 2001 From: Phil <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 23:33:35 -0700 Subject: [PATCH 08/30] Update groupby.md --- lectures/pandas/groupby.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/groupby.md b/lectures/pandas/groupby.md index 6da3a245..fb3f249c 100644 --- a/lectures/pandas/groupby.md +++ b/lectures/pandas/groupby.md @@ -249,7 +249,7 @@ index and a `Date` column added. ```{code-cell} python df2 = df.copy() df2["Date"] = pd.date_range( - start=pd.datetime.today().strftime("%m/%d/%Y"), + start=pd.Timestamp.today().strftime("%m/%d/%Y"), freq="BQ", periods=df.shape[0] ) From a3194f07c4eb53452ddfb1c576f7be9f7c146c64 Mon Sep 17 00:00:00 2001 From: Phil <15682144+doctor-phil@users.noreply.github.com> Date: Wed, 30 Oct 2024 23:36:11 -0700 Subject: [PATCH 09/30] applymap -> map --- lectures/pandas/basics.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lectures/pandas/basics.md b/lectures/pandas/basics.md index 0249abeb..1b4b9173 100644 --- a/lectures/pandas/basics.md +++ b/lectures/pandas/basics.md @@ -23,7 +23,7 @@ kernelspec: - Use built-in Series transformation functions and be able to create your own and apply them using `apply` - Use built-in scalar transformation functions and be able to create your - own and apply them using `applymap` + own and apply them using `map` - Be able to select subsets of the DataFrame using boolean selection - Know what the "want operator" is and how to apply it @@ -335,7 +335,7 @@ pandas data. To do this, we use the following pattern: 1. Define a Python function that takes in a scalar and produces a scalar. -1. Pass this function as an argument to the `applymap` Series or DataFrame method. +1. Pass this function as an argument to the `map` Series or DataFrame method. 
Complete the exercise below to practice writing and using your own scalar transforms. @@ -593,7 +593,7 @@ medium (4.5 < x <= 6.5), or low (<= 4.5) for each state and each month. 1. Write a Python function that takes a single number as an input and outputs a single string noting if that number is high, medium, or low. -1. Pass your function to `applymap` (quiz: why `applymap` and not +1. Pass your function to `map` (quiz: why `map` and not `agg` or `apply`?) and save the result in a new DataFrame called `unemp_bins`. 1. (Challenging) This exercise has multiple parts: @@ -617,8 +617,8 @@ medium (4.5 < x <= 6.5), or low (<= 4.5) for each state and each month. ``` ```{code-cell} python -# Part 2: Pass your function from part 1 to applymap -unemp_bins = unemp.applymap#replace this comment with your code!! +# Part 2: Pass your function from part 1 to map +unemp_bins = unemp.map#replace this comment with your code!! ``` ```{code-cell} python From 768598be849e3bd5c1d4cd796514109ae188f41b Mon Sep 17 00:00:00 2001 From: Phil <15682144+doctor-phil@users.noreply.github.com> Date: Thu, 31 Oct 2024 20:18:42 -0700 Subject: [PATCH 10/30] Update the_index.md --- lectures/pandas/the_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/the_index.md b/lectures/pandas/the_index.md index 776a2b01..8d9e989b 100644 --- a/lectures/pandas/the_index.md +++ b/lectures/pandas/the_index.md @@ -186,7 +186,7 @@ This would be helpful, for example, if we wanted to compute the difference in the average of all our variables from one year to the next. ```{code-cell} python -df_year.loc[2009].mean() - df_year.loc[2008].mean() +df_year.loc[2009].mean(numeric_only=True) - df_year.loc[2008].mean(numeric_only=True) ``` Notice that pandas did a few things for us. From 6dc9f76c29af7372e109c99fafb9530fa68d0c05 Mon Sep 17 00:00:00 2001 From: Phil <15682144+doctor-phil@users.noreply.github.com> Date: Thu, 31 Oct 2024 23:41:18 -0700 Subject: [PATCH 11/30] Update timeseries.md --- lectures/pandas/timeseries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index c58674f7..44dc6b57 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -492,7 +492,7 @@ An API key is a sort of password that web services (like the Nasdaq Data Link Ta Using this password, we were able to make a request to Nasdaq data link to obtain data directly from them. -The API key used here is one that we requested on behalf of this course. +The API key used here is one that we requested on behalf of this course. Note that **for the environment variable `NASDAQ_DATA_LINK_API_KEY` to work properly, you must run the line above before importing the `nasdaqdatalink` library.** This is because the library reads the environment variable when it is imported to set its key automatically. Using an environment variable like this is a common way to store sensitive information like API keys, since you can set the environment variable in a secure way that is not stored in your code. How to set environment variables varies by operating system, but you can find instructions for doing so on the web. If you plan to use Nasdaq data more extensively, you should obtain your own personal API key from [their website](https://www.nasdaq.com/nasdaq-data-link) and re-run the `os.environ...` line of code with your new API key on the right-hand side. 
From 17bbbaad55f97e4bb4f9d7297f526c041db4f786 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 1 Nov 2024 11:40:55 -0700 Subject: [PATCH 12/30] fix matplotlib --- lectures/pandas/timeseries.md | 2 +- lectures/tools/matplotlib.md | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 44dc6b57..7041db09 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -488,7 +488,7 @@ os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" This line told the `nasdaqdatalink` library that when obtaining making requests for data, it should use the *API key* `jEKP58z7JaX6utPkkpEp`. -An API key is a sort of password that web services (like the Nasdaq Data Link Tables API) require you to provide when you make requests. +An API key is a sort of password that web services (like the Nasdaq Data Link API) require you to provide when you make requests. Using this password, we were able to make a request to Nasdaq data link to obtain data directly from them. diff --git a/lectures/tools/matplotlib.md b/lectures/tools/matplotlib.md index dd13b6ce..cdc9b704 100644 --- a/lectures/tools/matplotlib.md +++ b/lectures/tools/matplotlib.md @@ -35,15 +35,14 @@ kernelspec: ```{code-cell} python import os +os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" + import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib import matplotlib.transforms as transforms -import quandl - -quandl.ApiConfig.api_key = os.environ.get("QUANDL_AUTH", "Dn6BtVoBhzuKTuyo6hbp") - +import nasdaqdatalink as ndl %matplotlib inline ``` @@ -122,7 +121,8 @@ Then, let's grab Apple's stock price data from quandl, starting a few weeks before the first announcement. ```{code-cell} python -aapl = quandl.get("WIKI/AAPL", start_date="2006-12-25") +aapl = ndl.get_table('WIKI/PRICES', ticker = ['AAPL'], date = { 'gte': '2006-12-25', 'lte': '2018-01-01' }) +aapl = aapl.set_index("date") aapl.head() ``` @@ -158,7 +158,7 @@ Let's see some examples. ```{code-cell} python # plot the Adjusted open to account for stock split -ax = aapl["Adj. Open"].plot() +ax = aapl["adj_open"].plot() # get the figure so we can re-display the plot after making changes fig = ax.get_figure() @@ -202,8 +202,8 @@ We can plot from our DataFrame directly on our Axes objects by setting the `ax` argument when calling `.plot`. ```{code-cell} python -aapl[["Adj. Low", "Adj. High"]].plot(ax=axs2[0]) -aapl[["Low", "High"]].plot(ax=axs2[1]) +aapl[["adj_low", "adj_high"]].plot(ax=axs2[0]) +aapl[["low", "high"]].plot(ax=axs2[1]) fig2 ``` @@ -302,7 +302,7 @@ def scale_by_middle(df): # Divide by middle row and scale to 100 # Note: N // 2 is modulus division meaning that it is # rounded to nearest whole number) - out = (df["Open"] / df.iloc[N // 2]["Open"]) * 100 + out = (df["open"] / df.iloc[N // 2]["open"]) * 100 # We don't want to keep actual dates, but rather the number # of days before or after the announcment. 
Let's set that From f25fbce9920c8da3b8d34b6984e9135be85d33bb Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 1 Nov 2024 11:57:53 -0700 Subject: [PATCH 13/30] simplify btc time series data --- lectures/pandas/timeseries.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 7041db09..149f60d2 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -198,24 +198,23 @@ The flexibility of these features is best understood through example, so let's load up some data and take a look. ```{code-cell} python -btc_usd = ndl.get_table("QDL/BCHAIN", paginate=True) -btc_usd.info() -btc_usd.head() +btc_usd = ndl.get_table("QDL/BCHAIN", date = { 'gte': '2009-12-25', 'lte': '2019-01-01' }, code = ["MKPRU", "MKTCP", "ETRVU"]) +btc_usd_long.info() +btc_usd_long.head() ``` Here, we have the Bitcoin (BTC) to US dollar (USD) exchange rate from 2009 until today, as well as other variables relevant to the Bitcoin ecosystem, in long ("melted") form. ```{code-cell} python -print(btc_usd.code.unique()) +print(btc_usd_long.code.unique()) btc_usd.dtypes ``` -Notice that the type of `date` is `datetime`. We would like this to be the index, and we want to drop the long form. We'll also select only a couple of columns of interest. (The column descriptions can be found [here](https://data.nasdaq.com/databases/BCHAIN)). We'll choose Market Price (in USD) (`MKPRU`), Total Market Cap (`MKTCP`), and Estimated Transaction Volume in USD (`ETRVU`). +Notice that the type of `date` is `datetime`. We would like this to be the index, and we want to drop the long form. We also selected only a couple of columns of interest, but the dataset has a lot more options. (The column descriptions can be found [here](https://data.nasdaq.com/databases/BCHAIN)). We chose Market Price (in USD) (`MKPRU`), Total Market Cap (`MKTCP`), and Estimated Transaction Volume in USD (`ETRVU`). ```{code-cell} python -btc_usd = btc_usd.pivot_table(index='date', columns='code', values='value') -btc_usd = btc_usd[["MKPRU", "MKTCP", "ETRVU"]] +btc_usd = btc_usd_long.pivot_table(index='date', columns='code', values='value') btc_usd.head() ``` From e2a9d2fa8e048dbd80694d2fd363c8ec544121a0 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:16:51 -0700 Subject: [PATCH 14/30] Update timeseries.md --- lectures/pandas/timeseries.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 149f60d2..df876cb4 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -37,12 +37,12 @@ kernelspec: ```{code-cell} python import os # see section on API keys at end of lecture! -os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" - import pandas as pd import matplotlib.pyplot as plt import nasdaqdatalink as ndl +ndl.ApiConfig.api_key = os.environ.get("NASDAQ_DATA_LINK_API_KEY", "jEKP58z7JaX6utPkkpEp") + start_date = "2014-05-01" %matplotlib inline @@ -482,7 +482,7 @@ See exercise 8 in the {ref}`exercise list `. 
Recall above that we had the line of code: ```{code-block} python -os.environ["NASDAQ_DATA_LINK_API_KEY"] = "jEKP58z7JaX6utPkkpEp" +ndl.ApiConfig.api_key = os.environ.get("NASDAQ_DATA_LINK_API_KEY", "jEKP58z7JaX6utPkkpEp") ``` This line told the `nasdaqdatalink` library that when making requests for data, it should use the *API key* `jEKP58z7JaX6utPkkpEp`. @@ -491,7 +491,7 @@ An API key is a sort of password that web services (like the Nasdaq Data Link AP Using this password, we were able to make a request to Nasdaq Data Link to obtain data directly from them. -The API key used here is one that we requested on behalf of this course. Note that **for the environment variable `NASDAQ_DATA_LINK_API_KEY` to work properly, you must run the line above before importing the `nasdaqdatalink` library.** This is because the library reads the environment variable when it is imported to set its key automatically. Using an environment variable like this is a common way to store sensitive information like API keys, since you can set the environment variable in a secure way that is not stored in your code. How to set environment variables varies by operating system, but you can find instructions for doing so on the web. +The API key used here is one that we requested on behalf of this course. If you create your own API key, you should store it in the `NASDAQ_DATA_LINK_API_KEY` environment variable locally on your computer. Using an environment variable like this is a common way to store sensitive information like API keys, since you can set the environment variable in a secure way that is not stored in your code. How to set environment variables varies by operating system, but you can find instructions for doing so on the web. If you plan to use Nasdaq data more extensively, you should obtain your own personal API key from [their website](https://www.nasdaq.com/nasdaq-data-link) and re-run the `os.environ...` line of code with your new API key on the right-hand side. From 5b8eddcfaa561e72d2e5c03488492fd61479732c Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:40:41 -0700 Subject: [PATCH 15/30] Update timeseries.md this should fix time series --- lectures/pandas/timeseries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index df876cb4..35eabfaf 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -198,7 +198,7 @@ The flexibility of these features is best understood through example, so let's load up some data and take a look.
```{code-cell} python -btc_usd = ndl.get_table("QDL/BCHAIN", date = { 'gte': '2009-12-25', 'lte': '2019-01-01' }, code = ["MKPRU", "MKTCP", "ETRVU"]) +btc_usd_long = ndl.get_table("QDL/BCHAIN", date = { 'gte': '2009-12-25', 'lte': '2019-01-01' }, code = ["MKPRU", "MKTCP", "ETRVU"]) btc_usd_long.info() btc_usd_long.head() ``` From 3814918ba0cb53148ba15fc652873e30c6513ce8 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:41:56 -0700 Subject: [PATCH 16/30] one more --- lectures/pandas/timeseries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 35eabfaf..fc30b5e9 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -208,7 +208,7 @@ Here, we have the Bitcoin (BTC) to US dollar (USD) exchange rate from ```{code-cell} python print(btc_usd_long.code.unique()) -btc_usd.dtypes +btc_usd_long.dtypes ``` Notice that the type of `date` is `datetime`. We would like this to be the index, and we want to drop the long form. We also selected only a couple of columns of interest, but the dataset has a lot more options. (The column descriptions can be found [here](https://data.nasdaq.com/databases/BCHAIN)). We chose Market Price (in USD) (`MKPRU`), Total Market Cap (`MKTCP`), and Estimated Transaction Volume in USD (`ETRVU`). From 558705c972d0a148d2d507154f6f25abd04fadbf Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 13:30:44 +1100 Subject: [PATCH 17/30] TMP: disable build cache --- .github/workflows/ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 734c07d4..40e4e673 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,13 +21,13 @@ jobs: - name: Display Pip Versions shell: bash -l {0} run: pip list - - name: Download "build" folder (cache) - uses: dawidd6/action-download-artifact@v6 - with: - workflow: cache.yml - branch: main - name: build-cache - path: _build + # - name: Download "build" folder (cache) + # uses: dawidd6/action-download-artifact@v6 + # with: + # workflow: cache.yml + # branch: main + # name: build-cache + # path: _build - name: Build HTML shell: bash -l {0} run: | From c0954339106ad1e8551ea2b4a3f94d2e49685227 Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 13:33:07 +1100 Subject: [PATCH 18/30] MAINT: maintenance of cloud infrastructure --- .github/workflows/cache.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cache.yml b/.github/workflows/cache.yml index ca5700df..57d5049b 100644 --- a/.github/workflows/cache.yml +++ b/.github/workflows/cache.yml @@ -24,7 +24,7 @@ jobs: - name: Build HTML shell: bash -l {0} run: | - jb build lectures --path-output ./ + jb build lectures --path-output ./ -W --keep-going - name: Upload "_build" folder (cache) uses: actions/upload-artifact@v4 with: From 6e0a997ad7b0f744bd30830df37ded43a6c6428b Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 13:40:36 +1100 Subject: [PATCH 19/30] TST: upgrade anaconda and software stack --- environment.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/environment.yml b/environment.yml index 270024e8..5787c78b 100644 --- a/environment.yml +++ b/environment.yml @@ -2,15 +2,11 @@ name: lecture-datascience channels: - default dependencies: - - python=3.9 - - anaconda=2022.10 + - python=3.12 + - anaconda=2024.06 - pip - pip: # Build 
Requirements - - pandas == 1.5.3 - - matplotlib <= 3.8.4 - - pandas-datareader == 0.10.0 - - numpy == 1.23.5 - jupyter-book==0.15.1 - docutils==0.17.1 - quantecon-book-theme==0.4.1 From d4dfe0ba112fba43a407ca6065fac7925e5e8f60 Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 13:44:31 +1100 Subject: [PATCH 20/30] update to python=3.12 in ci workflow --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 734c07d4..a9e4da63 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.9 + python-version: 3.12 environment-file: environment.yml activate-environment: lecture-datascience - name: Display Conda Environment Versions From 8f290b518589c70c0321ec6a07445c318ce76780 Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 13:47:54 +1100 Subject: [PATCH 21/30] simply build and work through missing dependencies --- environment.yml | 53 +++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/environment.yml b/environment.yml index 5787c78b..2ea360ec 100644 --- a/environment.yml +++ b/environment.yml @@ -19,33 +19,30 @@ dependencies: - sphinx-togglebutton==0.3.1 - arviz==0.13.0 # Datascience Requirements - - joblib == 1.2.0 - - interpolation == 2.2.4 - - networkx == 3.0 - - fiona == 1.9.2 - - geopandas == 0.12.2 - - pyLDAvis == 3.4.0 - - gensim == 4.3.1 - - folium == 0.14.0 - - descartes == 1.1.0 - - pyarrow == 11.0.0 - - xgboost == 1.7.5 - - graphviz == 0.20.1 - - bokeh == 3.1.0 - - sphinxcontrib-bibtex == 2.5.0 - - nltk == 3.8.1 - - seaborn == 0.12.2 - - patsy == 0.5.3 - - quandl == 3.7.0 - - statsmodels == 0.13.5 - - quantecon == 0.6.0 - - openpyxl == 3.1.2 - - pandas_datareader == 0.10.0 - - plotly == 5.14.0 - - lxml == 4.9.2 - - scikit-learn == 1.2.2 - - numba == 0.56.4 - - ipywidgets == 8.0.6 - - scipy == 1.10 + # - joblib + # - interpolation + # - networkx + # - fiona + # - geopandas + # - pyLDAvis + # - gensim + # - folium + # - descartes + # - pyarrow + # - xgboost + # - graphviz + # - bokeh + # - nltk + # - seaborn + # - patsy + # - statsmodels + # - quantecon + # - openpyxl + # - plotly + # - lxml + # - scikit-learn + # - numba + # - ipywidgets + # - scipy - conda: - python-graphviz From 77369d237630e819b20c53764ecd7d00a45758b6 Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 14:18:41 +1100 Subject: [PATCH 22/30] import some dependencies --- environment.yml | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/environment.yml b/environment.yml index 2ea360ec..22f41f8b 100644 --- a/environment.yml +++ b/environment.yml @@ -20,29 +20,30 @@ dependencies: - arviz==0.13.0 # Datascience Requirements # - joblib - # - interpolation + - interpolation # - networkx - # - fiona - # - geopandas - # - pyLDAvis - # - gensim - # - folium - # - descartes + - fiona + - geopandas + - pyLDAvis + - gensim + - folium + - descartes # - pyarrow - # - xgboost - # - graphviz + - xgboost + - graphviz # - bokeh # - nltk - # - seaborn - # - patsy - # - statsmodels - # - quantecon - # - openpyxl - # - plotly - # - lxml - # - scikit-learn + - seaborn + - patsy + - statsmodels + - quantecon + - quandl + - openpyxl + - plotly + - lxml + - scikit-learn # - numba - # - ipywidgets + - ipywidgets # - scipy - conda: - python-graphviz From 
6ab7e2525d6888e407f4ff938c6a6df5c666c08b Mon Sep 17 00:00:00 2001 From: mmcky Date: Sat, 2 Nov 2024 14:41:50 +1100 Subject: [PATCH 23/30] enable more dependencies --- environment.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 22f41f8b..a29c7b4f 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: # Datascience Requirements # - joblib - interpolation - # - networkx + - networkx - fiona - geopandas - pyLDAvis @@ -33,8 +33,10 @@ dependencies: - graphviz # - bokeh # - nltk + - pandas-datareader - seaborn - patsy + - pyarrow - statsmodels - quantecon - quandl From 099d421d562ea54ef3d178668accd32f8d0b3375 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:00:32 -0800 Subject: [PATCH 24/30] Update networks.md --- lectures/applications/networks.md | 69 ++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/lectures/applications/networks.md b/lectures/applications/networks.md index 15bf92dd..0e5d8edb 100644 --- a/lectures/applications/networks.md +++ b/lectures/applications/networks.md @@ -41,13 +41,16 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt import networkx as nx + +%matplotlib inline ``` ```{code-cell} ipython3 karate = nx.karate_club_graph() #import the Zachary's karate club network data from NetworkX karate_layout = nx.spring_layout(karate,seed=2) #fix a random layout so we can get a consistent look at the network -nx.draw(karate,karate_layout) #plot the network +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(karate,karate_layout, ax) #plot the network ``` This is an example of a **social network**. (Specifically, it's called "Zachary's Karate Club Network", and it represents some data collected by Wayne Zachary in 1977.) In this network, we have a set of dots representing people (**nodes** or **vertices**) who are connected by a line (**link** or **edge**) if they are friends with each other. @@ -105,7 +108,9 @@ edgelist = [(1,2), (11,12)] network.add_edges_from(edgelist) #add a set of links or edges to form a network positions = nx.spring_layout(network,seed=10) #fix the position again -nx.draw(network,positions,node_color="lightblue",with_labels=True) #plot the network graph + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",with_labels=True) #plot the network graph ``` ### (Un)directedness @@ -192,14 +197,18 @@ Maybe it would be important to stop by all of the other businesses on the way. O ```{code-cell} ipython3 color_map = ["black","red","red","red","black","red","black","red","red","red","black","red","black","red","red","red","black"] -nx.draw(network,positions,node_color="lightblue",edge_color=color_map,with_labels=True) # highlight our long path + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",edge_color=color_map,with_labels=True) # highlight our long path ``` However, this might not be the most efficient path since it takes a lot of driving. Another, faster route might be to skip 2,4,9, and 11, and head down the path through 3,5,6,7,8, and 10. 
```{code-cell} ipython3 color_map = ["black","red","black","black","red","black","black","red","red","red","black","black","red","black","black","red","black"] -nx.draw(network,positions,node_color="lightblue",edge_color=color_map,with_labels=True) # highlight a shorter path + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",edge_color=color_map,with_labels=True) # highlight a shorter path ``` Clearly, this path will be a lot more efficient, since it gets to the final destination while traveling down 4 fewer roads than the previous path. Which one is "best" depends on your objective. Optimal transport and routing problems are a large part of operations research. In general, finding the shortest path from one node to another is very easy to do using a very famous heuristic called **dijkstra's algorithm**. On the other hand, finding the shortest path that visits every single node in a graph is called the **travelling salesman problem**, and is notoriously difficult to solve (specifically, it is NP-hard.) @@ -208,7 +217,9 @@ Lastly, let's imagine that a storm comes, and the road between 6 and 7 floods an ```{code-cell} ipython3 network.remove_edge(6,7) # delete the edge connecting node 6 to node 7 -nx.draw(network,positions,node_color="lightblue",with_labels=True) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",with_labels=True) ``` We can see that there is no longer any possible path that could connect node 1 to node 12. For any node from 1 to 6, there is still a path; just like on the right hand side, there is a path between any two nodes from 7 to 12. But there is no path that can connect any node in one of these sets to a node in the other. We would refer to these two sets as **connected components**. @@ -292,7 +303,9 @@ Degree centrality, however, often does not tell the whole story. For example, le ```{code-cell} ipython3 degrees = network.degree() # retrieve the degree sequence degree_colors = [degrees[i] for i in range(1,13)] # turn it into a vector -nx.draw(network,positions,node_color=degree_colors,with_labels=True) # plot the network with colors according to degree + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color=degree_colors,with_labels=True) # plot the network with colors according to degree ``` In this network, almost every node has the same degree. So if we wanted to know which nodes were the most important, number of connections alone would not really give us much useful information. @@ -326,7 +339,9 @@ The first thing we might notice here is that it's no longer the same situation a ```{code-cell} ipython3 cent_colors = [centrality[i] for i in range(1,13)] # build a list of eigenvector centralities -nx.draw(network,positions,node_color=cent_colors,with_labels=True) # plot the graph with colors according to this list + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color=cent_colors,with_labels=True) # plot the graph with colors according to this list ``` As we suspected, considering influence as being the result of connections with other influential friends gives us a centrality measure that looks way more informative. Nodes that appear to be in "central" positions are indeed considered more "central", while nodes that are further from the center have lower centrality. 
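A quick way to see the fixed-point idea behind eigenvector centrality is power iteration: repeatedly multiply a positive vector by the adjacency matrix and renormalize. The sketch below is illustrative only, reusing the karate club graph from the start of the lecture; for a connected, non-bipartite graph like this one, the iteration converges to the same ranking that networkx computes.

```python
import numpy as np
import networkx as nx

G = nx.karate_club_graph()   # the same graph used at the start of the lecture
A = nx.to_numpy_array(G)     # adjacency matrix

# Power iteration: a node's score is proportional to the sum of its
# neighbors' scores, so repeatedly apply A and renormalize.
x = np.ones(A.shape[0])
for _ in range(1000):
    x = A @ x
    x = x / np.linalg.norm(x)

# Cross-check against networkx: both pick out the same most-central node.
cent = nx.eigenvector_centrality_numpy(G)
print(int(np.argmax(x)), max(cent, key=cent.get))
```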
@@ -356,7 +371,9 @@ Now, let's try removing a link from our network, to see how that will change its ```{code-cell} ipython3 network.remove_edge(1,2) # remove the edge connecting node 1 to node 2, and draw the network -nx.draw(network,positions,node_color="lightblue",with_labels=True) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",with_labels=True) ``` ```{code-cell} ipython3 @@ -372,7 +389,9 @@ We can see that not much has changed (although a few eigenvalues are lower than ```{code-cell} ipython3 network.add_edge(1,2) # return the graph to normal network.remove_edge(6,7) # delete the link from 6 to 7, and draw the network -nx.draw(network,positions,node_color="lightblue",with_labels=True) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",with_labels=True) ``` ```{code-cell} ipython3 @@ -387,7 +406,9 @@ Now, removing this edge had a very different impact on the spectrum of our lapla ```{code-cell} ipython3 network.remove_edges_from([(3,5),(2,4),(8,10),(9,11)]) # remove a set of links, plot the result -nx.draw(network,positions,node_color="lightblue",with_labels=True) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color="lightblue",with_labels=True) ``` ```{code-cell} ipython3 @@ -429,8 +450,9 @@ colors = [ "lightcoral" for i in range(1,13) ] # assign a nice reddish color to for i in range(0,12): # for any nodes that have a negative entry, replace this with a nice purplish color if (v[i,1] < 0): colors[i] = "mediumpurple" - -nx.draw(network,positions,node_color=colors,with_labels=True) # draw the result + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(network,positions,ax,node_color=colors,with_labels=True) # draw the result ``` Coloring nodes by their sign in this vector sorts them into two groups, on either side of the $(6,7)$ link! @@ -451,7 +473,9 @@ As a final exercise, let's look at how these concepts can be applied to economic ```{code-cell} ipython3 eigen_cent = nx.eigenvector_centrality(karate) eigen_colors = [eigen_cent[i] for i in range(0,34)] -nx.draw(karate,karate_layout,node_color=eigen_colors) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(karate,ax,karate_layout,node_color=eigen_colors) ``` We see that there are two really highly central (yellow) nodes, on opposite sides of the network. This might be an indication that there is some homophily in the network. To verify this, let's take a look at the natural partitioning of this network by plotting the eigenvalues of its laplacian matrix. @@ -471,8 +495,9 @@ colors = [ "lightcoral" for i in range(0,34) ] # assign colors to the nodes base for i in range(0,34): if (v_sorted[i,1] < 0): colors[i] = "mediumpurple" - -nx.draw(karate,karate_layout,node_color=colors) # draw the result + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(karate,karate_layout,ax,node_color=colors) # draw the result ``` Again, this spectral homophily partitions the network in a way that seems very natural; into two dense clusters with sparse connections between them, and with each one containing its own highly central hub. 
@@ -499,7 +524,7 @@ One idea would be to look at the correlations between the returns of these stock ```{code-cell} ipython3 returns = df[1:] # remove the dates -corr = returns.corr() # calculate the correlations between the returns of each pair of stocks +corr = returns.corr(numeric_only=True) # calculate the correlations between the returns of each pair of stocks corr # display the correlation matrix ``` @@ -529,7 +554,9 @@ Let's take a look at it. stocknet = nx.from_numpy_array(adj.to_numpy()) # initialize a new graph from our adjacency matrix stocknet = nx.relabel_nodes(stocknet, dict(enumerate(adj.columns))) # keep the stock tickers as the names of the nodes (instead of integers) stock_layout = nx.spring_layout(stocknet,seed=10) # fix our layout -nx.draw(stocknet,stock_layout) # plot the graph, without any labels (for now) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(stocknet,stock_layout,ax) # plot the graph, without any labels (for now) ``` It's hard to get much information about this graph, just by looking at it. Let's take a look at its underlying structure by examining its spectrum. @@ -543,8 +570,9 @@ colors = [ "lightcoral" for i in range(0,10) ] for i in range(0,10): if (v_sorted[i,1] < 0): colors[i] = "mediumpurple" - -nx.draw(stocknet,stock_layout,node_color=colors) + +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(stocknet,stock_layout,ax,node_color=colors) ``` Ok, so our spectral homophily identifies two distinct groups in the data. But what do they represent? @@ -552,7 +580,8 @@ Ok, so our spectral homophily identifies two distinct groups in the data. But wh To understand, let's add the labels back in. ```{code-cell} ipython3 -nx.draw(stocknet,stock_layout,node_color=colors,with_labels=True) +fig,ax = plt.subplots() # create a figure and axis object +nx.draw(stocknet,stock_layout,ax,node_color=colors,with_labels=True) ``` Forming a network based on this simple method, and looking at its spectrum, was enough to cleanly identify both of the sectors in our dataset. From 7d81f50f7f7cacdb0e46dd73837b43bce46d9484 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:27:19 -0800 Subject: [PATCH 25/30] oops --- lectures/applications/networks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/applications/networks.md b/lectures/applications/networks.md index 0e5d8edb..fee601cf 100644 --- a/lectures/applications/networks.md +++ b/lectures/applications/networks.md @@ -475,7 +475,7 @@ eigen_cent = nx.eigenvector_centrality(karate) eigen_colors = [eigen_cent[i] for i in range(0,34)] fig,ax = plt.subplots() # create a figure and axis object -nx.draw(karate,ax,karate_layout,node_color=eigen_colors) +nx.draw(karate,karate_layout,ax,node_color=eigen_colors) ``` We see that there are two really highly central (yellow) nodes, on opposite sides of the network. This might be an indication that there is some homophily in the network. To verify this, let's take a look at the natural partitioning of this network by plotting the eigenvalues of its laplacian matrix. 
From f2b75047df8ecc239e406065853d3dc501a38a48 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:40:01 -0800 Subject: [PATCH 26/30] fix deprecated map dataset, problem set issue --- lectures/pandas/timeseries.md | 3 +++ lectures/problem_sets/problem_set_8.md | 2 +- lectures/tools/maps.md | 29 +++++++++++++------------- lectures/tools/matplotlib.md | 3 +++ 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index fc30b5e9..9a434a68 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -11,6 +11,9 @@ kernelspec: # Time series +**Co-author** +> - [Philip Solimine, *UBC*](https://www.psolimine.net) + **Prerequisites** - {doc}`Python functions <../python_fundamentals/functions>` diff --git a/lectures/problem_sets/problem_set_8.md b/lectures/problem_sets/problem_set_8.md index aadad046..6307caa6 100644 --- a/lectures/problem_sets/problem_set_8.md +++ b/lectures/problem_sets/problem_set_8.md @@ -46,7 +46,7 @@ ahs.info() ```{code-cell} python # dataframe of variable descriptions ahs_doc = pd.read_csv("https://datascience.quantecon.org/assets/data/ahs-doc.csv", encoding="latin1") -with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'max_colwidth', -1): +with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'max_colwidth', None): display(ahs_doc[["Variable","Question","Description","Associated.Response.Codes"]]) ``` diff --git a/lectures/tools/maps.md b/lectures/tools/maps.md index fd354028..f1ce56a7 100644 --- a/lectures/tools/maps.md +++ b/lectures/tools/maps.md @@ -13,6 +13,7 @@ kernelspec: **Co-author** > - [Kim Ruhl *University of Wisconsin*](http://kimjruhl.com) +> - [Philip Solimine *UBC*](https://www.psolimine.net) **Prerequisites** @@ -125,24 +126,24 @@ that we use here. The file provides the outlines of countries, over which we'll plot the city locations from our GeoDataFrame. -Luckily, `geopandas` already comes bundled with this data, so we don't -have to hunt it down! +Luckily, Natural Earth has already done the hard work of creating these files, and we can pull them directly from their website. ```{code-cell} python -# Grab low resolution world file -world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) -world = world.set_index("iso_a3") - +# Grab low resolution world file from NACIS +url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip" +world = gpd.read_file(url)[['SOV_A3', 'POP_EST', 'CONTINENT', 'NAME', 'GDP_MD', 'geometry']] +world = world.set_index("SOV_A3") +print(world.columns) world.head() ``` `world` is a GeoDataFrame with the following columns: -* `pop_est`: Contains a population estimate for the country -* `continent`: The country's continent -* `name`: The country's name -* `iso_a3`: The country's 3 letter abbreviation (we made this the index) -* `gdp_md_est`: An estimate of country's GDP +* `POP_EST`: Contains a population estimate for the country +* `CONTINENT`: The country's continent +* `NAME`: The country's name +* `SOV_A3`: The country's 3 letter abbreviation (we made this the index) +* `GDP_MD`: The most recent estimate of country's GDP * `geometry`: A `POLYGON` for each country (we will learn more about these soon) ```{code-cell} python @@ -201,7 +202,7 @@ This is a more complex shape than Albania and thus required more points. 
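The `POLYGON` entries in the `geometry` column are shapely geometries: ordered sequences of (longitude, latitude) points, with the ring closed automatically. A toy example of the idea (the triangle here is made up; real country borders need many more points):

```python
from shapely.geometry import Point, Polygon

# Three corners are enough for a polygon; shapely closes the ring for us.
triangle = Polygon([(0.0, 0.0), (2.0, 0.0), (1.0, 2.0)])

print(triangle.area)                    # 2.0, in squared coordinate units
print(triangle.contains(Point(1, 1)))   # True: the point is inside
print(triangle.contains(Point(5, 5)))   # False: the point is outside
```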
fig, gax = plt.subplots(figsize=(10,10)) # By only plotting rows in which the continent is 'South America' we only plot SA. -world.query("continent == 'South America'").plot(ax=gax, edgecolor='black',color='white') +world.query("CONTINENT == 'South America'").plot(ax=gax, edgecolor='black',color='white') # By the way, if you haven't read the book 'Longitude' by Dava Sobel, you should... gax.set_xlabel('longitude') @@ -234,7 +235,7 @@ fig, gax = plt.subplots(figsize=(10,10)) # By only plotting rows in which the continent is 'South America' we only plot, well, # South America. -world.query("continent == 'South America'").plot(ax = gax, edgecolor='black', color='white') +world.query("CONTINENT == 'South America'").plot(ax = gax, edgecolor='black', color='white') # This plots the cities. It's the same syntax, but we are plotting from a different GeoDataFrame. # I want the cities as pale red dots. @@ -260,7 +261,7 @@ Finally, we might want to consider annotating the cities so we know which cities fig, gax = plt.subplots(figsize=(10,10)) # By only plotting rows in which the continent is 'South America' we only plot, well, South America. -world.query("continent == 'South America'").plot(ax = gax, edgecolor='black', color='white') +world.query("CONTINENT == 'South America'").plot(ax = gax, edgecolor='black', color='white') # This plots the cities. It's the same syntax, but we are plotting from a different GeoDataFrame. I want the # cities as pale red dots. diff --git a/lectures/tools/matplotlib.md b/lectures/tools/matplotlib.md index cdc9b704..c429c016 100644 --- a/lectures/tools/matplotlib.md +++ b/lectures/tools/matplotlib.md @@ -11,6 +11,9 @@ kernelspec: # Intermediate Plotting +**Co-author** +> - [Philip Solimine, *UBC*](https://www.psolimine.net) + **Prerequisites** - {doc}`Introduction <../pandas/intro>` From 1cddd3d4574d489a8e7d4fe9c6b2bd11786d3c3c Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:02:03 -0800 Subject: [PATCH 27/30] need to install bokeh for maps lecture --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 1694a4a8..ec31f08c 100644 --- a/environment.yml +++ b/environment.yml @@ -31,7 +31,7 @@ dependencies: # - pyarrow - xgboost - graphviz # - bokeh + - bokeh # - nltk - pandas-datareader - seaborn From c19caedeb2c97d656b607297f87375dbca35c6c4 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:04:38 -0800 Subject: [PATCH 28/30] Update timeseries.md --- lectures/pandas/timeseries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/pandas/timeseries.md b/lectures/pandas/timeseries.md index 9a434a68..097cfc0a 100644 --- a/lectures/pandas/timeseries.md +++ b/lectures/pandas/timeseries.md @@ -442,7 +442,7 @@ Below are some examples.
```{code-cell} python # business quarter -btc_usd.resample("BQ").mean() +btc_usd.resample("BQE").mean() ``` Note that unlike with `rolling`, a single number is returned for From cca9326fd82054876f49fad08ad345c9223a4d3f Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:07:42 -0800 Subject: [PATCH 29/30] Update data_clean.md --- lectures/pandas/data_clean.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lectures/pandas/data_clean.md b/lectures/pandas/data_clean.md index c7452768..328c4c26 100644 --- a/lectures/pandas/data_clean.md +++ b/lectures/pandas/data_clean.md @@ -254,12 +254,12 @@ df.fillna(value=100) ```{code-cell} python # use the _next_ valid observation to fill the missing data -df.fillna(method="bfill") +df.bfill() # in new versions of pandas, bfill will directly fill missing data ``` ```{code-cell} python # use the _previous_ valid observation to fill missing data -df.fillna(method="ffill") +df.ffill() ``` We will see more examples of dealing with missing data in future From 0f23a51569f27e481e209ce3ed57d002663e26e1 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:12:10 -0800 Subject: [PATCH 30/30] Update groupby.md --- lectures/pandas/groupby.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lectures/pandas/groupby.md b/lectures/pandas/groupby.md index fb3f249c..0b81b9e9 100644 --- a/lectures/pandas/groupby.md +++ b/lectures/pandas/groupby.md @@ -213,7 +213,7 @@ def smallest_by_b(df): ``` ```{code-cell} python -gbA.apply(smallest_by_b) +gbA.apply(smallest_by_b, include_groups=False) ``` Notice that the return value from applying our series transform to `gbA` @@ -250,7 +250,7 @@ index and a `Date` column added. df2 = df.copy() df2["Date"] = pd.date_range( start=pd.Timestamp.today().strftime("%m/%d/%Y"), - freq="BQ", + freq="BQE", periods=df.shape[0] ) df2 = df2.set_index("A") @@ -260,7 +260,7 @@ df2 We can group by year. ```{code-cell} python -df2.groupby(pd.Grouper(key="Date", freq="A")).count() +df2.groupby(pd.Grouper(key="Date", freq="YE")).count() ``` We can group by the `A` level of the index. @@ -272,14 +272,14 @@ df2.groupby(pd.Grouper(level="A")).count() We can combine these to group by both. ```{code-cell} python -df2.groupby([pd.Grouper(key="Date", freq="A"), pd.Grouper(level="A")]).count() +df2.groupby([pd.Grouper(key="Date", freq="YE"), pd.Grouper(level="A")]).count() ``` And we can combine `pd.Grouper` with a string, where the string denotes a column name ```{code-cell} python -df2.groupby([pd.Grouper(key="Date", freq="A"), "B"]).count() +df2.groupby([pd.Grouper(key="Date", freq="YE"), "B"]).count() ``` ## Case Study: Airline Delays
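Taken together, the last few patches track pandas 2.x deprecations: `fillna(method=...)` gives way to the dedicated `ffill`/`bfill` methods, the `"BQ"` and `"A"` frequency aliases become `"BQE"` and `"YE"`, and `groupby(...).apply` now wants `include_groups=False`. A small sketch of the new idioms side by side (the toy data here is illustrative only):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan],
              index=pd.date_range("2024-01-01", periods=4, freq="D"))

# fillna(method=...) is deprecated in favor of dedicated methods.
print(s.ffill().tolist())   # forward fill: [1.0, 1.0, 3.0, 3.0]
print(s.bfill().tolist())   # backward fill: [1.0, 3.0, 3.0, nan]

# Frequency aliases: "BQ" becomes "BQE" (business quarter end),
# and "A" becomes "YE" (year end).
print(s.resample("BQE").mean())
print(s.resample("YE").mean())

# groupby.apply no longer passes the grouping columns to the function
# once include_groups=False is set (pandas >= 2.2), which also
# silences the deprecation warning.
df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
print(df.groupby("g").apply(lambda d: d["x"].sum(), include_groups=False))
```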