
Commit

update patches branch (#263)
* WIP: patch pandas issues and upgrades (#251)

* patch value counts issue in merge lecture

* Update ci.yml

hopefully this works

* Update ci.yml

idk

* WIP: fix quandl deprecation

* Update timeseries.md

* Update environment.yml

upgrade pandas

* Update environment.yml

* Update groupby.md

* applymap -> map (see the pandas sketch after the commit message)

* Update the_index.md

* Update timeseries.md

* fix matplotlib

* simplify btc time series data

* Update timeseries.md

* Update timeseries.md

this should fix time series

* one more

* TMP: disable build cache

* MAINT: maintenance of cloud infrastructure

* TST: upgrade anaconda and software stack

* update to python=3.12 in ci workflow

* simplify build and work through missing dependencies

* import some dependencies

* enable more dependencies

* Update networks.md

* oops

* fix deprecated map dataset, problem set issue

* need to install bokeh for maps lecture

* fix ml in econ?

* Update recidivism.md

* working with text

* Update working_with_text.md

* working with text

* Update working_with_text.md

* Update environment.yml

* Update working_with_text.md

* Update working_with_text.md

* try limiting api call delay

* try something else

* Update working_with_text.md

---------

Co-authored-by: Matt McKay <[email protected]>
Co-authored-by: mmcky <[email protected]>

* fix working with text caching

---------

Co-authored-by: Matt McKay <[email protected]>
Co-authored-by: mmcky <[email protected]>
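
For the `applymap -> map` item above: pandas 2.1 renamed `DataFrame.applymap` to `DataFrame.map`. A minimal sketch with made-up data, not code from the lectures:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

# DataFrame.applymap is deprecated since pandas 2.1; DataFrame.map applies a
# function elementwise, exactly as applymap did
formatted = df.map(lambda x: f"{x:.1f}")
print(formatted)
```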
3 people authored Nov 5, 2024
1 parent 0f23a51 commit ee8fb22
Showing 6 changed files with 54 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -16,7 +16,7 @@ jobs:
       auto-update-conda: true
       auto-activate-base: true
       miniconda-version: 'latest'
-      python-version: 3.9
+      python-version: 3.12
       environment-file: environment.yml
       activate-environment: lecture-datascience
   - name: Display Conda Environment Versions
2 changes: 1 addition & 1 deletion environment.yml
@@ -32,7 +32,7 @@ dependencies:
   - xgboost
   - graphviz
   - bokeh
-  # - nltk
+  - nltk
   - pandas-datareader
   - seaborn
   - patsy
Binary file modified lectures/_data/avalanche_forecasts.zip
Binary file not shown.
15 changes: 6 additions & 9 deletions lectures/applications/ml_in_economics.md
@@ -13,6 +13,7 @@ kernelspec:

 **Author**
 > - [Paul Schrimpf *UBC*](https://economics.ubc.ca/faculty-and-staff/paul-schrimpf/)
+> - [Philip Solimine *UBC*](https://www.psolimine.net/)
 **Prerequisites**

@@ -259,11 +260,11 @@ tags: [hide-output]
 ---
 cps["female"] = (cps.sex==2)
 cps["log_earn"] = np.log(cps.earnwke)
-cps["log_earn"][np.isinf(cps.log_earn)] = np.nan
+cps.loc[np.isinf(cps.log_earn),"log_earn"] = np.nan
 cps["log_uhours"] = np.log(cps.uhourse)
-cps["log_uhours"][np.isinf(cps.log_uhours)] = np.nan
+cps.loc[np.isinf(cps.log_uhours),"log_uhours"] = np.nan
 cps["log_hourslw"] = np.log(cps.hourslw)
-cps["log_hourslw"][np.isinf(cps.log_hourslw)] = np.nan
+cps.loc[np.isinf(cps.log_hourslw),"log_hourslw"] = np.nan
 cps["log_wageu"] = cps.log_earn - cps.log_uhours
 cps["log_wagelw"] = cps.log_earn - cps.log_hourslw
@@ -394,12 +395,8 @@ def plotpredictions(pl) :
     plt.title("Prediction Errors")
     plt.figure()
-    sns.distplot(pl[2][female==0], hist = True, kde = False,
-                 kde_kws = {'shade': True, 'linewidth': 3},
-                 label = "Male")
-    sns.distplot(pl[2][female==1], hist = True, kde = False,
-                 kde_kws = {'shade': True, 'linewidth': 3},
-                 label = "Female")
+    sns.histplot(pl[2][female == 0], bins=30, label="Male", kde=False)
+    sns.histplot(pl[2][female == 1], bins=30, label="Female", kde=False)
     plt.title('P(female|x)')
 plotpredictions(pl_lasso)
 ```
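
For context on the change above: `sns.distplot` was deprecated in seaborn 0.11 and has since been removed, with `sns.histplot` as the histogram replacement. A self-contained sketch on synthetic stand-ins for the lecture's prediction array and group indicator:

```python
import numpy as np
import seaborn as sns

rng = np.random.default_rng(0)
errors = rng.normal(size=500)          # stand-in for pl[2]
female = rng.integers(0, 2, size=500)  # stand-in for the group indicator

# histplot draws the histogram directly; kde=False matches the old calls
sns.histplot(errors[female == 0], bins=30, kde=False, label="Male")
sns.histplot(errors[female == 1], bins=30, kde=False, label="Female")
```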
32 changes: 24 additions & 8 deletions lectures/applications/recidivism.md
@@ -789,10 +789,10 @@ def balance_hist_plot(pred, y, df, bins=20):
         _ax = ax[np.unravel_index(g, ax.shape)]
         y_sub = y[subset]
         pred_sub = pred[subset]
-        sns.distplot(pred_sub[y_sub==0], hist=True, bins=bins, kde=False, ax=_ax,
-                     label="No recidivate", norm_hist=True, axlabel="Predicted Probability")
-        sns.distplot(pred_sub[y_sub==1], hist=True, bins=bins, kde=False, ax=_ax,
-                     label="Yes recidivate", norm_hist=True, axlabel="Predicted Probability")
+        sns.histplot(pred_sub[y_sub==0], bins=bins, kde=False, ax=_ax,
+                     label="No recidivate")
+        sns.histplot(pred_sub[y_sub==1], bins=bins, kde=False, ax=_ax,
+                     label="Yes recidivate")
         _ax.set_title(group)
     plt.legend()
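
One caveat on this hunk: the old `distplot` calls passed `norm_hist=True` and `axlabel="Predicted Probability"`, which the new `histplot` calls drop. If the normalized view matters, `histplot`'s equivalent is `stat="density"`, with the label set on the Axes. A sketch on synthetic data, not the lecture's COMPAS variables:

```python
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
pred = rng.uniform(size=300)      # stand-in for predicted probabilities
y = rng.integers(0, 2, size=300)  # stand-in for observed outcomes

fig, ax = plt.subplots()
# stat="density" reproduces distplot's norm_hist=True normalization
sns.histplot(pred[y == 0], bins=20, stat="density", ax=ax, label="No recidivate")
sns.histplot(pred[y == 1], bins=20, stat="density", ax=ax, label="Yes recidivate")
ax.set_xlabel("Predicted Probability")  # replaces the removed axlabel argument
ax.legend()
```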
@@ -1059,14 +1059,30 @@
 Unfortunately, this makes all the predictions identical, so these predictions
 are not so useful.

 ```{code-cell} python
-output, given_outcome, given_pred =cm_tables(
-    balance_mod.best_estimator_.predict(X_test),
-    y_test,
-    df_test
-)
-display(output)
-display(given_pred)
-display(given_outcome)
+try:
+    output, given_outcome, given_pred = cm_tables(
+        balance_mod.best_estimator_.predict(X_test),
+        y_test,
+        df_test
+    )
+    # Ensure that the outputs are valid and check for division related issues in cm_tables
+    if output is not None:
+        display(output)
+        display(given_pred)
+    else:
+        print("Predicted values are None or invalid.")
+    if given_outcome is not None:
+        display(given_outcome)
+    else:
+        print("Outcome values are None or invalid.")
+except ZeroDivisionError:
+    print("Caught a division by zero error in cm_tables. Please check inputs or calculations.")
+except Exception as e:
+    print(f"An unexpected error occurred: {e}")
 ```

 What if we change our CV scoring function to care about both
39 changes: 22 additions & 17 deletions lectures/applications/working_with_text.md
@@ -13,6 +13,7 @@ kernelspec:

 **Author**
 > - [Paul Schrimpf *UBC*](https://economics.ubc.ca/faculty-and-staff/paul-schrimpf/)
+> - [Phil Solimine *UBC*](https://www.psolimine.net/)
 **Prerequisites**

@@ -126,17 +127,18 @@ def get_incident_details(id):
     return(result)

-incidentsfile = "https://datascience.quantecon.org/assets/data/avalanche_incidents.csv"
+incidentsfile = "http://datascience.quantecon.org/assets/data/avalanche_incidents.csv"

 # To avoid loading the avalanche Canada servers, we save the incident details locally.
-if (not os.path.isfile(incidentsfile)):
+# to update the data locally, change the incidentsfile to some other file name
+try:
+    incidents = pd.read_csv(incidentsfile)
+except Exception as e:
     incident_detail_list = incidents_brief.id.apply(get_incident_details).to_list()
     incidents = pd.DataFrame.from_dict(incident_detail_list, orient="columns")
     incidents.to_csv(incidentsfile)
-else:
-    incidents = pd.read_csv(incidentsfile)
-incidents
+incidents.head()
 ```

 Many incidents include coordinates, but others do not. Most
@@ -317,10 +319,9 @@ You may have to uncomment the second line below if folium is not installed.
 import folium
 import matplotlib

-cmap = matplotlib.cm.get_cmap('Set1')
+cmap = matplotlib.colormaps["Set1"]
 fmap = folium.Map(location=[60, -98],
-                  zoom_start=3,
-                  tiles='Stamen Terrain')
+                  zoom_start=3)
 with urllib.request.urlopen(req) as response:
     regions_tmp = json.loads(response.read().decode('utf-8'))
 folium.GeoJson(regions_tmp,
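
Two API notes on the hunk above, as I read it: `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9, so the registry lookup is the current form, and the `Stamen Terrain` tile set was retired, so the map falls back to folium's default tiles. A minimal sketch of the colormap lookup:

```python
import matplotlib

# the colormaps registry replaces the removed matplotlib.cm.get_cmap
cmap = matplotlib.colormaps["Set1"]
print(cmap(0))  # first palette entry as an RGBA tuple
```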
@@ -411,6 +412,7 @@ def download_cached_forecasts():
                 warnings.warn(f"'File $f exists and is larger than version in cache. Not replacing.")
             else :
                 z.extract(f)
+                print("Downloaded and extracted", f)

 download_cached_forecasts()
 ```
@@ -443,7 +445,7 @@ def get_forecasts(start, end, region):
         #print("working on {}, {}".format(region,day))
         forecasts = forecasts + [get_forecast(day, region)]
         #print("sleeping")
-        time.sleep(0.1) # to avoid too much load on Avalanche Canada servers
+        time.sleep(0.01) # to avoid too much load on Avalanche Canada servers
         day = day + pd.Timedelta(1,"D")
     return(forecasts)
@@ -456,11 +458,13 @@ def get_season(year, region):
         os.mkdir("avalanche_forecasts")
     seasonfile = "avalanche_forecasts/{}_{}-{}.json".format(region, year, year+1)
     if (not os.path.isfile(seasonfile)):
-        startdate = pd.to_datetime("{}-{}-{} 12:00".format(year, start_month, start_day))
-        lastdate = pd.to_datetime("{}-{}-{} 12:00".format(year+1, last_month, last_day))
-        season = get_forecasts(startdate,lastdate,region)
-        with open(seasonfile, 'w') as outfile:
-            json.dump(season, outfile, ensure_ascii=False)
+        print(f"Season file {seasonfile} not found. Uncomment code here to update cached data")
+        season = []
+        #startdate = pd.to_datetime("{}-{}-{} 12:00".format(year, start_month, start_day))
+        #lastdate = pd.to_datetime("{}-{}-{} 12:00".format(year+1, last_month, last_day))
+        #season = get_forecasts(startdate,lastdate,region)
+        #with open(seasonfile, 'w') as outfile:
+        #    json.dump(season, outfile, ensure_ascii=False)
     else:
         with open(seasonfile, "rb") as json_data:
             season = json.load(json_data)
@@ -481,7 +485,7 @@ for year in range(2011,2019):
 forecasts = pd.DataFrame.from_dict([f for f in forecastlist if not f==None],orient="columns")
 forecasts["danger_date"] = forecasts.dangerRatings.apply(lambda r: r[0]["date"])
-forecasts["danger_date"] = pd.to_datetime(forecasts.danger_date, utc=True).dt.date
+forecasts["danger_date"] = pd.to_datetime(forecasts.danger_date, format='ISO8601').dt.date
 forecasts["danger_alpine"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["alp"])
 forecasts["danger_treeline"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["tln"])
 forecasts["danger_belowtree"]=forecasts.dangerRatings.apply(lambda r: r[0]["dangerRating"]["btl"])
@@ -532,6 +536,7 @@ import nltk
 import string

 nltk.download('omw-1.4')
 nltk.download('punkt')
+nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('wordnet')

 # Remove stopwords (the, a, is, etc)
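
The added `punkt_tab` download reflects newer NLTK releases (3.8.2 and later, as far as I know) loading tokenizer tables from the `punkt_tab` resource, so `word_tokenize` fails on a fresh environment without it. A quick check:

```python
import nltk

nltk.download("punkt")      # legacy pickled models
nltk.download("punkt_tab")  # table-based models used by recent NLTK
print(nltk.word_tokenize("Avalanche conditions improve as the snowpack settles."))
```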
@@ -783,7 +788,7 @@
 dimensional space or that the t-SNE algorithm parameters were
 chosen poorly.

 ```{code-cell} python
-cmap = matplotlib.cm.get_cmap('Paired')
+cmap = matplotlib.colormaps["Paired"]
 fig, ax = plt.subplots(1,2,figsize=(16,6))
 n_topics=len(svd_model.components_)
 lsa_keys = np.argmax(lsa_topic_sample, axis=1)

1 comment on commit ee8fb22

@github-actions