From 022003e8b9d35be69affd5f795689d961ff95106 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Sun, 2 Jul 2023 20:32:54 -0700 Subject: [PATCH 01/11] Update _toc.yml --- lectures/_toc.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lectures/_toc.yml b/lectures/_toc.yml index 026f6371..16cc87cf 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -36,16 +36,18 @@ parts: - file: pandas/merge - file: pandas/groupby - file: pandas/timeseries +- caption: Data Science Tools + chapters: - file: pandas/matplotlib + - file: applications/maps + - file: applications/visualization_rules + - file: applications/regression + - file: applications/classification - caption: Applications chapters: - file: applications/index - - file: applications/visualization_rules - - file: applications/regression + - file: applications/ml_in_economics - file: applications/recidivism - - file: applications/maps - - file: applications/classification - file: applications/working_with_text - - file: applications/ml_in_economics - file: applications/heterogeneity - file: applications/networks From 11b659da3742c3dd2beb22736fd0a7771339e042 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Sun, 2 Jul 2023 20:59:38 -0700 Subject: [PATCH 02/11] rearrange chapters --- lectures/_toc.yml | 10 +++++----- lectures/{applications => tools}/classification.md | 0 lectures/{applications => tools}/maps.md | 0 lectures/{pandas => tools}/matplotlib.md | 0 lectures/{applications => tools}/regression.md | 0 .../{applications => tools}/visualization_rules.md | 0 6 files changed, 5 insertions(+), 5 deletions(-) rename lectures/{applications => tools}/classification.md (100%) rename lectures/{applications => tools}/maps.md (100%) rename lectures/{pandas => tools}/matplotlib.md (100%) rename lectures/{applications => tools}/regression.md (100%) rename lectures/{applications => 
tools}/visualization_rules.md (100%) diff --git a/lectures/_toc.yml b/lectures/_toc.yml index 16cc87cf..dc3f8994 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -38,11 +38,11 @@ parts: - file: pandas/timeseries - caption: Data Science Tools chapters: - - file: pandas/matplotlib - - file: applications/maps - - file: applications/visualization_rules - - file: applications/regression - - file: applications/classification + - file: tools/matplotlib + - file: tools/maps + - file: tools/visualization_rules + - file: tools/regression + - file: tools/classification - caption: Applications chapters: - file: applications/index diff --git a/lectures/applications/classification.md b/lectures/tools/classification.md similarity index 100% rename from lectures/applications/classification.md rename to lectures/tools/classification.md diff --git a/lectures/applications/maps.md b/lectures/tools/maps.md similarity index 100% rename from lectures/applications/maps.md rename to lectures/tools/maps.md diff --git a/lectures/pandas/matplotlib.md b/lectures/tools/matplotlib.md similarity index 100% rename from lectures/pandas/matplotlib.md rename to lectures/tools/matplotlib.md diff --git a/lectures/applications/regression.md b/lectures/tools/regression.md similarity index 100% rename from lectures/applications/regression.md rename to lectures/tools/regression.md diff --git a/lectures/applications/visualization_rules.md b/lectures/tools/visualization_rules.md similarity index 100% rename from lectures/applications/visualization_rules.md rename to lectures/tools/visualization_rules.md From c310186bdcaef8afb3902c764e9375426eb88819 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Sun, 2 Jul 2023 21:26:55 -0700 Subject: [PATCH 03/11] reorganizing chapters --- lectures/_toc.yml | 4 ++-- lectures/applications/index.md | 18 +++++------------- lectures/pandas/index.md | 6 ++---- lectures/tools/index.md | 24 ++++++++++++++++++++++++ 4 
files changed, 33 insertions(+), 19 deletions(-) create mode 100644 lectures/tools/index.md diff --git a/lectures/_toc.yml b/lectures/_toc.yml index dc3f8994..a3b110c7 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -47,7 +47,7 @@ parts: chapters: - file: applications/index - file: applications/ml_in_economics + - file: applications/networks - file: applications/recidivism - file: applications/working_with_text - - file: applications/heterogeneity - - file: applications/networks + - file: applications/heterogeneity \ No newline at end of file diff --git a/lectures/applications/index.md b/lectures/applications/index.md index 3d6adbed..620f33ab 100644 --- a/lectures/applications/index.md +++ b/lectures/applications/index.md @@ -15,20 +15,12 @@ In this part of the course, we will begin to apply the skills that you have lear includes using familiar tools in new applications and learning new tools that can be used for special types of analysis. -## [Data Visualization: Rules and Guidelines](../applications/visualization_rules.md) - -## [Regression](../applications/regression.md) - -## [Case Study: Recidivism](../applications/recidivism.md) - -## [Mapping in Python](../applications/maps.md) - -## [Classification](../applications/classification.md) +## [Machine Learning in Economics](../applications/ml_in_economics.md) -## [Working with Text](../applications/working_with_text.md) +## [Social and Economic Networks](../applications/networks.md) -## [Machine Learning in Economics](../applications/ml_in_economics.md) +## [Case Study: Recidivism](../applications/recidivism.md) -## [Heterogeneous Effects](../applications/heterogeneity.md) +## [Working with Text](../applications/working_with_text.md) -## [Social and Economic Networks](../applications/networks.md) +## [Heterogeneous Effects](../applications/heterogeneity.md) diff --git a/lectures/pandas/index.md b/lectures/pandas/index.md index 18862d16..180c6c1f 100644 --- a/lectures/pandas/index.md +++ 
b/lectures/pandas/index.md @@ -9,7 +9,7 @@ kernelspec: name: python3 --- -# pandas +# DataFrames and Series in Pandas This section of the workshop covers data ingestion, cleaning, manipulation, analysis, and visualization in Python. @@ -56,6 +56,4 @@ your programs will fall dramatically. ## [GroupBy](../pandas/groupby.md) -## [Time series](../pandas/timeseries.md) - -## [Intermediate Plotting](../pandas/matplotlib.md) \ No newline at end of file +## [Time Series](../pandas/timeseries.md) \ No newline at end of file diff --git a/lectures/tools/index.md b/lectures/tools/index.md new file mode 100644 index 00000000..68b9a606 --- /dev/null +++ b/lectures/tools/index.md @@ -0,0 +1,24 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Data Science Tools + +In this part of the course we will piece together the skills that we have learned so far, to build a set of tools that we can use to analyze and understand data. 
+ +## [Intermediate Plotting](../tools/matplotlib.md) + +## [Mapping in Python](../tools/maps.md) + +## [Data Visualization: Rules and Guidelines](../tools/visualization_rules.md) + +## [Regression](../tools/regression.md) + +## [Classification](../tools/classification.md) \ No newline at end of file From d4c1e8b8beaa276150023ad6bc4e5ecb5f6f2e0e Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Sun, 2 Jul 2023 22:06:28 -0700 Subject: [PATCH 04/11] bib fix --- lectures/_config.yml | 1 + lectures/tools/applications.bib | 218 ++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 lectures/tools/applications.bib diff --git a/lectures/_config.yml b/lectures/_config.yml index 9ad55646..5ddc7c59 100644 --- a/lectures/_config.yml +++ b/lectures/_config.yml @@ -12,6 +12,7 @@ html: bibtex_bibfiles: - applications/applications.bib + - tools/applications.bib sphinx: extra_extensions: [sphinx_multitoc_numbering, sphinx_exercise, sphinx_tojupyter] diff --git a/lectures/tools/applications.bib b/lectures/tools/applications.bib new file mode 100644 index 00000000..0d66628b --- /dev/null +++ b/lectures/tools/applications.bib @@ -0,0 +1,218 @@ +@article{kleinberg2017, + author = {Kleinberg, Jon and Lakkaraju, Himabindu and Leskovec, Jure and Ludwig, Jens and Mullainathan, Sendhil}, + title = {{Human Decisions and Machine Predictions*}}, + journal = {The Quarterly Journal of Economics}, + volume = {133}, + number = {1}, + pages = {237-293}, + year = {2017}, + month = {08}, + abstract = {{Can machine learning improve human decision making? Bail decisions provide a good test case. Millions of times each year, judges make jail-or-release decisions that hinge on a prediction of what a defendant would do if released. The concreteness of the prediction task combined with the volume of data available makes this a promising machine-learning application. Yet comparing the algorithm to judges proves complicated. 
First, the available data are generated by prior judge decisions. We only observe crime outcomes for released defendants, not for those judges detained. This makes it hard to evaluate counterfactual decision rules based on algorithmic predictions. Second, judges may have a broader set of preferences than the variable the algorithm predicts; for instance, judges may care specifically about violent crimes or about racial inequities. We deal with these problems using different econometric strategies, such as quasi-random assignment of cases to judges. Even accounting for these concerns, our results suggest potentially large welfare gains: one policy simulation shows crime reductions up to 24.7\% with no change in jailing rates, or jailing rate reductions up to 41.9\% with no increase in crime rates. Moreover, all categories of crime, including violent crimes, show reductions; these gains can be achieved while simultaneously reducing racial disparities. These results suggest that while machine learning can be valuable, realizing this value requires integrating these tools into an economic framework: being clear about the link between predictions and decisions; specifying the scope of payoff functions; and constructing unbiased decision counterfactuals. 
JEL Codes: C10, C55, K40.}}, + issn = {0033-5533}, + doi = {10.1093/qje/qjx032}, + url = {https://dx.doi.org/10.1093/qje/qjx032}, + eprint = {http://oup.prod.sis.lan/qje/article-pdf/133/1/237/24246094/qjx032.pdf} +} + + +@article{kleinberg2015, + author = {Kleinberg, Jon and Ludwig, Jens and Mullainathan, Sendhil and Obermeyer, Ziad}, + title = {Prediction Policy Problems}, + journal = {American Economic Review}, + volume = {105}, + number = {5}, + year = {2015}, + month = {May}, + pages = {491-95}, + doi = {10.1257/aer.p20151023}, + url = {http://www.aeaweb.org/articles?id=10.1257/aer.p20151023} +} + +@article{chernozhukov2018, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, + title = {Double/debiased machine learning for treatment and structural parameters}, + journal = {The Econometrics Journal}, + volume = {21}, + number = {1}, + year = {2018}, + pages = {C1-C68}, + doi = {10.1111/ectj.12097}, + url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/ectj.12097}, + eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/ectj.12097}, + abstract = {Summary We revisit the classic semi-parametric problem of inference on a low-dimensional parameter θ0 in the presence of high-dimensional nuisance parameters η0. We depart from the classical setting by allowing for η0 to be so high-dimensional that the traditional assumptions (e.g. Donsker properties) that limit complexity of the parameter space for this object break down. To estimate η0, we consider the use of statistical or machine learning (ML) methods, which are particularly well suited to estimation in modern, very high-dimensional cases. ML methods perform well by employing regularization to reduce variance and trading off regularization bias with overfitting in practice. 
However, both regularization bias and overfitting in estimating η0 cause a heavy bias in estimators of θ0 that are obtained by naively plugging ML estimators of η0 into estimating equations for θ0. This bias results in the naive estimator failing to be $N^{-1/2}$ consistent, where N is the sample size. We show that the impact of regularization bias and overfitting on estimation of the parameter of interest θ0 can be removed by using two simple, yet critical, ingredients: (1) using Neyman-orthogonal moments/scores that have reduced sensitivity with respect to nuisance parameters to estimate θ0; (2) making use of cross-fitting, which provides an efficient form of data-splitting. We call the resulting set of methods double or debiased ML (DML). We verify that DML delivers point estimators that concentrate in an $N^{-1/2}$-neighbourhood of the true parameter values and are approximately unbiased and normally distributed, which allows construction of valid confidence statements. The generic statistical theory of DML is elementary and simultaneously relies on only weak theoretical requirements, which will admit the use of a broad array of modern ML methods for estimating the nuisance parameters, such as random forests, lasso, ridge, deep neural nets, boosted trees, and various hybrids and ensembles of these methods. We illustrate the general theory by applying it to provide theoretical properties of the following: DML applied to learn the main regression parameter in a partially linear regression model; DML applied to learn the coefficient on an endogenous variable in a partially linear instrumental variables model; DML applied to learn the average treatment effect and the average treatment effect on the treated under unconfoundedness; DML applied to learn the local average treatment effect in an instrumental variables setting. 
In addition to these theoretical applications, we also illustrate the use of DML in three empirical examples.} +} + +@article{chernozhukov2017, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney}, + title = {Double/Debiased/Neyman Machine Learning of Treatment Effects}, + journal = {American Economic Review}, + volume = {107}, + number = {5}, + year = {2017}, + month = {May}, + pages = {261-65}, + doi = {10.1257/aer.p20171038}, + url = {http://www.aeaweb.org/articles?id=10.1257/aer.p20171038} +} + +@techreport{cddf2018, + title = {Generic Machine Learning Inference on Heterogenous Treatment Effects in Randomized Experiments}, + author = {Chernozhukov, Victor and Demirer, Mert and Duflo, Esther and Fernández-Val, Iván}, + institution = {National Bureau of Economic Research}, + type = {Working Paper}, + series = {Working Paper Series}, + number = {24678}, + year = {2018}, + month = {June}, + doi = {10.3386/w24678}, + url = {http://www.nber.org/papers/w24678}, + abstract = {We propose strategies to estimate and make inference on key features of heterogeneous effects in randomized experiments. These key features include best linear predictors of the effects using machine learning proxies, average effects sorted by impact groups, and average characteristics of most and least impacted units. The approach is valid in high dimensional settings, where the effects are proxied by machine learning methods. We post-process these proxies into the estimates of the key features. Our approach is generic, it can be used in conjunction with penalized methods, deep and shallow neural networks, canonical and new random forests, boosted trees, and ensemble methods. It does not rely on strong assumptions. In particular, we don’t require conditions for consistency of the machine learning methods. Estimation and inference relies on repeated data splitting to avoid overfitting and achieve validity. 
For inference, we take medians of p-values and medians of confidence intervals, resulting from many different data splits, and then adjust their nominal level to guarantee uniform validity. This variational inference method is shown to be uniformly valid and quantifies the uncertainty coming from both parameter estimation and data splitting. An empirical application to the impact of micro-credit on economic development illustrates the use of the approach in randomized experiments.} +} + +@article{wager2018, + author = {Stefan Wager and Susan Athey}, + title = {Estimation and Inference of Heterogeneous Treatment Effects using Random Forests}, + journal = {Journal of the American Statistical Association}, + volume = {0}, + number = {0}, + pages = {1-15}, + year = {2018}, + publisher = {Taylor & Francis}, + doi = {10.1080/01621459.2017.1319839}, + url = { + https://doi.org/10.1080/01621459.2017.1319839 + +}, + eprint = { + https://doi.org/10.1080/01621459.2017.1319839 + +} +} + +@article{athey2016b, + author = {Athey, Susan and Imbens, Guido}, + title = {Recursive partitioning for heterogeneous causal effects}, + volume = {113}, + number = {27}, + pages = {7353--7360}, + year = {2016}, + doi = {10.1073/pnas.1510489113}, + publisher = {National Academy of Sciences}, + abstract = {In this paper we propose methods for estimating heterogeneity in causal effects in experimental and observational studies and for conducting hypothesis tests about the magnitude of differences in treatment effects across subsets of the population. We provide a data-driven approach to partition the data into subpopulations that differ in the magnitude of their treatment effects. The approach enables the construction of valid confidence intervals for treatment effects, even with many covariates relative to the sample size, and without {\textquotedblleft}sparsity{\textquotedblright} assumptions. 
We propose an {\textquotedblleft}honest{\textquotedblright} approach to estimation, whereby one sample is used to construct the partition and another to estimate treatment effects for each subpopulation. Our approach builds on regression tree methods, modified to optimize for goodness of fit in treatment effects and to account for honest estimation. Our model selection criterion anticipates that bias will be eliminated by honest estimation and also accounts for the effect of making additional splits on the variance of treatment effect estimates within each subpopulation. We address the challenge that the {\textquotedblleft}ground truth{\textquotedblright} for a causal effect is not observed for any individual unit, so that standard approaches to cross-validation must be modified. Through a simulation study, we show that for our preferred method honest estimation results in nominal coverage for 90\% confidence intervals, whereas coverage ranges between 74\% and 84\% for nonhonest approaches. 
Honest estimation requires estimating the model with a smaller sample size; the cost in terms of mean squared error of treatment effects for our preferred method ranges between 7{\textendash}22\%.}, + issn = {0027-8424}, + url = {http://www.pnas.org/content/113/27/7353}, + eprint = {http://www.pnas.org/content/113/27/7353.full.pdf}, + journal = {Proceedings of the National Academy of Sciences} +} + +@techreport{alatas2011, + title = {Program Keluarga Harapan : impact evaluation of Indonesia's Pilot Household Conditional Cash Transfer Program}, + author = {Alatas, Vivi and Cahyadi, Nur and Ekasari, Elisabeth and Harmoun, Sarah and Hidayat, Budi and Janz, Edgar and Jellema, Jon and Tuhiman, H and Wai-Poi, M}, + institution = {World Bank}, + url = {http://documents.worldbank.org/curated/en/589171468266179965/Program-Keluarga-Harapan-impact-evaluation-of-Indonesias-Pilot-Household-Conditional-Cash-Transfer-Program}, + year = {2011} +} + +@article{triyana2016, + author = {Triyana, Margaret}, + title = {Do Health Care Providers Respond to Demand-Side Incentives? 
Evidence from Indonesia}, + journal = {American Economic Journal: Economic Policy}, + volume = {8}, + number = {4}, + year = {2016}, + month = {November}, + pages = {255-88}, + doi = {10.1257/pol.20140048}, + url = {http://www.aeaweb.org/articles?id=10.1257/pol.20140048} +} + + @article{hdm, + title = {{hdm}: High-Dimensional Metrics}, + author = {Victor Chernozhukov and Chris Hansen and Martin Spindler}, + journal = {R Journal}, + year = {2016}, + volume = {8}, + number = {2}, + pages = {185-199}, + url = {https://journal.r-project.org/archive/2016/RJ-2016-040/index.html} +} + +@article{dieterich2016, + title = {COMPAS risk scales: Demonstrating accuracy equity and predictive parity}, + author = {Dieterich, William and Mendoza, Christina and Brennan, Tim}, + journal = {Northpoint Inc}, + year = {2016} +} + +@inbook{belloni2011, + author = {Belloni, Alexandre +and Chernozhukov, Victor}, + editor = {Alquier, Pierre +and Gautier, Eric +and Stoltz, Gilles}, + title = {High Dimensional Sparse Econometric Models: An Introduction}, + booktitle = {Inverse Problems and High-Dimensional Estimation: Stats in the Ch{\^a}teau Summer School, August 31 - September 4, 2009}, + year = {2011}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, + pages = {121--156}, + abstract = {In this chapter we discuss conceptually high dimensional sparse econometric models as well as estimation of these models using l 1-penalization and post- l 1-penalization methods. Focusing on linear and nonparametric regression frameworks, we discuss various econometric examples, present basic theoretical results, and illustrate the concepts and methods with Monte Carlo simulations and an empirical application. 
In the application, we examine and confirm the empirical validity of the Solow-Swan model for international economic growth.}, + isbn = {978-3-642-19989-9}, + doi = {10.1007/978-3-642-19989-9_3}, + url = {https://doi.org/10.1007/978-3-642-19989-9_3} +} + +@misc{athey2018, + title = {Machine learning and econometrics}, + author = {Athey, Susan and Imbens, Guido}, + year = {2018}, + publisher = {AEA Continuing Education }, + url = {https://www.aeaweb.org/conference/cont-ed/2018-webcasts} +} + +@article{athey2017, + author = {Athey, Susan and Imbens, Guido W.}, + title = {The State of Applied Econometrics: Causality and Policy Evaluation}, + journal = {Journal of Economic Perspectives}, + volume = {31}, + number = {2}, + year = {2017}, + month = {May}, + pages = {3-32}, + doi = {10.1257/jep.31.2.3}, + url = {http://www.aeaweb.org/articles?id=10.1257/jep.31.2.3} +} + + +@book{friedman2008, + title = {The elements of statistical learning}, + author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert}, + year = {2009}, + publisher = {Springer series in statistics}, + url = {https://web.stanford.edu/~hastie/ElemStatLearn/} +} + + +@book{efron2016, + title = {Computer age statistical inference}, + author = {Efron, Bradley and Hastie, Trevor}, + volume = {5}, + year = {2016}, + url = {https://web.stanford.edu/~hastie/CASI/}, + publisher = {Cambridge University Press} +} + + +@article{hornik1989, + title = {Multilayer feedforward networks are universal approximators}, + journal = {Neural Networks}, + volume = {2}, + number = {5}, + pages = {359 - 366}, + year = {1989}, + issn = {0893-6080}, + doi = {10.1016/0893-6080(89)90020-8}, + url = {http://www.sciencedirect.com/science/article/pii/0893608089900208}, + author = {Kurt Hornik and Maxwell Stinchcombe and Halbert White}, + keywords = {Feedforward networks, Universal approximation, Mapping networks, Network representation capability, Stone-Weierstrass Theorem, Squashing functions, Sigma-Pi networks, 
Back-propagation networks} +} From ae00364d33d965ea1333f60bfdb2ddc4d02a5c99 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Sun, 2 Jul 2023 22:52:09 -0700 Subject: [PATCH 05/11] Update _config.yml --- lectures/_config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/lectures/_config.yml b/lectures/_config.yml index 5ddc7c59..9ad55646 100644 --- a/lectures/_config.yml +++ b/lectures/_config.yml @@ -12,7 +12,6 @@ html: bibtex_bibfiles: - applications/applications.bib - - tools/applications.bib sphinx: extra_extensions: [sphinx_multitoc_numbering, sphinx_exercise, sphinx_tojupyter] From 295cafa600b551130b5bd07473ced4a25110f2db Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 3 Jul 2023 15:10:01 -0700 Subject: [PATCH 06/11] fix broken cross-section refs and hopefully bibtex errors in regression --- lectures/applications/heterogeneity.md | 2 +- lectures/applications/ml_in_economics.md | 2 +- lectures/applications/recidivism.md | 6 +++--- lectures/problem_sets/problem_set_7.md | 4 ++-- lectures/tools/classification.md | 6 +++--- lectures/tools/maps.md | 2 +- lectures/tools/matplotlib.md | 2 +- lectures/tools/regression.md | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lectures/applications/heterogeneity.md b/lectures/applications/heterogeneity.md index 84bb4cfa..7eabd248 100644 --- a/lectures/applications/heterogeneity.md +++ b/lectures/applications/heterogeneity.md @@ -16,7 +16,7 @@ kernelspec: **Prerequisites** -- {doc}`Regression ` +- {doc}`Regression <../tools/regression>` - {doc}`Machine Learning in Economics ` **Outcomes** diff --git a/lectures/applications/ml_in_economics.md b/lectures/applications/ml_in_economics.md index 06648c91..42c7e3d0 100644 --- a/lectures/applications/ml_in_economics.md +++ b/lectures/applications/ml_in_economics.md @@ -16,7 +16,7 @@ kernelspec: **Prerequisites** -- {doc}`Regression ` +- {doc}`Regression 
<../tools/regression>` **Outcomes** diff --git a/lectures/applications/recidivism.md b/lectures/applications/recidivism.md index 5ec6157e..b9926818 100644 --- a/lectures/applications/recidivism.md +++ b/lectures/applications/recidivism.md @@ -17,9 +17,9 @@ kernelspec: **Prerequisites** -- {doc}`matplotlib Introduction <../pandas/matplotlib>` -- {doc}`Visualization Rules ` -- {doc}`Regression ` +- {doc}`matplotlib Introduction <../tools/matplotlib>` +- {doc}`Visualization Rules <../tools/visualization_rules>` +- {doc}`Regression <../tools/regression>` **Outcomes** diff --git a/lectures/problem_sets/problem_set_7.md b/lectures/problem_sets/problem_set_7.md index f8a92f27..7a645839 100644 --- a/lectures/problem_sets/problem_set_7.md +++ b/lectures/problem_sets/problem_set_7.md @@ -21,7 +21,7 @@ import pandas as pd ## Question 1 -From {doc}`Data Visualization: Rules and Guidelines <../applications/visualization_rules>` +From {doc}`Data Visualization: Rules and Guidelines <../tools/visualization_rules>` Create a bar chart of the below data on Canadian GDP growth. Use a non-red color for the years 2000 to 2008, red for @@ -41,7 +41,7 @@ for side in ["right", "top", "left", "bottom"]: ## Question 2 -From {doc}`Data Visualization: Rules and Guidelines <../applications/visualization_rules>` +From {doc}`Data Visualization: Rules and Guidelines <../tools/visualization_rules>` Draft another way to organize time and education by modifying the code below. That is, have two subplots (one for each diff --git a/lectures/tools/classification.md b/lectures/tools/classification.md index 25b2e49c..31dd6293 100644 --- a/lectures/tools/classification.md +++ b/lectures/tools/classification.md @@ -19,7 +19,7 @@ kernelspec: **Prerequisites** -- {doc}`Regression ` +- {doc}`Regression ` **Outcomes** @@ -93,7 +93,7 @@ import matplotlib.pyplot as plt We have actually already encountered a classification algorithm. 
-In the {doc}`recidivism ` example, we attempted to predict whether +In the {doc}`recidivism <../applications/recidivism>` example, we attempted to predict whether or not an individual would commit another crime by using a combination of the assigned COMPAS score and the individual's gender or race. @@ -778,7 +778,7 @@ Your task here is to use the `model_selection.cross_val_score` method to select optimal level for the regularization parameter `C`. The `scoring` argument should be set to `roc_auc`. -Refer to the example in the {doc}`recidivism lecture ` for how +Refer to the example in the {doc}`recidivism lecture <../applications/recidivism>` for how to use `model_selection.cross_val_score`. ```{code-cell} python diff --git a/lectures/tools/maps.md b/lectures/tools/maps.md index 48d3d881..fd354028 100644 --- a/lectures/tools/maps.md +++ b/lectures/tools/maps.md @@ -16,7 +16,7 @@ kernelspec: **Prerequisites** -- {doc}`matplotlib Introduction <../pandas/matplotlib>` +- {doc}`matplotlib Introduction ` - {doc}`Visualization Rules ` **Outcomes** diff --git a/lectures/tools/matplotlib.md b/lectures/tools/matplotlib.md index 3569c08a..dd13b6ce 100644 --- a/lectures/tools/matplotlib.md +++ b/lectures/tools/matplotlib.md @@ -13,7 +13,7 @@ kernelspec: **Prerequisites** -- {doc}`Introduction ` +- {doc}`Introduction <../pandas/intro>` **Outcomes** diff --git a/lectures/tools/regression.md b/lectures/tools/regression.md index 36febe80..081d7f13 100644 --- a/lectures/tools/regression.md +++ b/lectures/tools/regression.md @@ -1075,7 +1075,7 @@ See exercise 11 in the {ref}`exercise list `. 
Two good text books covering the above regression methods are {cite}`reg-friedman2008` and {cite}`reg-efron2016` -```{bibliography} applications.bib +```{bibliography} ../applications/applications.bib :keyprefix: reg- :labelprefix: reg :cited: From a91ce1bce590c3b2bbebb255695d0d80b919c6dc Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 3 Jul 2023 16:05:12 -0700 Subject: [PATCH 07/11] add tools/index to toctree --- lectures/_toc.yml | 1 + lectures/applications/working_with_text.md | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lectures/_toc.yml b/lectures/_toc.yml index a3b110c7..a26708c5 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -38,6 +38,7 @@ parts: - file: pandas/timeseries - caption: Data Science Tools chapters: + - file: tools/index - file: tools/matplotlib - file: tools/maps - file: tools/visualization_rules diff --git a/lectures/applications/working_with_text.md b/lectures/applications/working_with_text.md index e4cab0a4..84a7cbb2 100644 --- a/lectures/applications/working_with_text.md +++ b/lectures/applications/working_with_text.md @@ -16,10 +16,10 @@ kernelspec: **Prerequisites** -- {doc}`Visualization Rules ` -- {doc}`Regression ` -- {doc}`Classification ` -- {doc}`Maps ` +- {doc}`Visualization Rules <../tools/visualization_rules>` +- {doc}`Regression <../tools/regression>` +- {doc}`Classification <../tools/classification>` +- {doc}`Maps <../tools/maps>` **Outcomes** From 236b9d71c9757e356d19362bee7187aa5c4c7982 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 3 Jul 2023 17:45:27 -0700 Subject: [PATCH 08/11] adjusting fa icons --- lectures/applications/index.md | 10 +++++----- lectures/index.md | 11 +++++++---- lectures/tools/index.md | 10 +++++----- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lectures/applications/index.md b/lectures/applications/index.md index 620f33ab..e7734c98 
100644 --- a/lectures/applications/index.md +++ b/lectures/applications/index.md @@ -15,12 +15,12 @@ In this part of the course, we will begin to apply the skills that you have lear includes using familiar tools in new applications and learning new tools that can be used for special types of analysis. -## [Machine Learning in Economics](../applications/ml_in_economics.md) +## [Machine Learning in Economics](../applications/ml_in_economics.md) -## [Social and Economic Networks](../applications/networks.md) +## [Social and Economic Networks](../applications/networks.md) -## [Case Study: Recidivism](../applications/recidivism.md) +## [Case Study: Recidivism](../applications/recidivism.md) -## [Working with Text](../applications/working_with_text.md) +## [Working with Text](../applications/working_with_text.md) -## [Heterogeneous Effects](../applications/heterogeneity.md) +## [Heterogeneous Effects](../applications/heterogeneity.md) diff --git a/lectures/index.md b/lectures/index.md index 03e48bfe..16bb673a 100644 --- a/lectures/index.md +++ b/lectures/index.md @@ -34,11 +34,14 @@ Course description, software installation ## [Python Fundamentals](../python_fundamentals/index.md) Basic Python programming -## [Scientific Computing](../scientific/index.md) +## [Scientific Computing with Numpy](../scientific/index.md) Numerical and scientific methods -## [Pandas](../pandas/index.md) +## [Working With Data in Pandas](../pandas/index.md) The "data" in data science -## [Applications](../applications/index.md) -Applications and statistical tools +## [Data Science Tools](../tools/index.md) +Putting everything together + +## [Applications](../applications/index.md) +Applying our skills to real economic data diff --git a/lectures/tools/index.md b/lectures/tools/index.md index 68b9a606..fab8ad78 100644 --- a/lectures/tools/index.md +++ b/lectures/tools/index.md @@ -13,12 +13,12 @@ kernelspec: In this part of the course we will piece together the skills that we have learned so far, 
to build a set of tools that we can use to analyze and understand data. -## [Intermediate Plotting](../tools/matplotlib.md) +## [Intermediate Plotting](../tools/matplotlib.md) -## [Mapping in Python](../tools/maps.md) +## [Mapping in Python](../tools/maps.md) -## [Data Visualization: Rules and Guidelines](../tools/visualization_rules.md) +## [Data Visualization: Rules and Guidelines](../tools/visualization_rules.md) -## [Regression](../tools/regression.md) +## [Regression](../tools/regression.md) -## [Classification](../tools/classification.md) \ No newline at end of file +## [Classification](../tools/classification.md) \ No newline at end of file From 414f1baf7d169ec08e637f2b8b995c6364cfd173 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 3 Jul 2023 18:36:27 -0700 Subject: [PATCH 09/11] testing fa icon adjustments --- lectures/applications/index.md | 10 +++++----- lectures/pandas/index.md | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lectures/applications/index.md b/lectures/applications/index.md index e7734c98..3a991908 100644 --- a/lectures/applications/index.md +++ b/lectures/applications/index.md @@ -15,12 +15,12 @@ In this part of the course, we will begin to apply the skills that you have lear includes using familiar tools in new applications and learning new tools that can be used for special types of analysis. 
-## [Machine Learning in Economics](../applications/ml_in_economics.md) +## [Machine Learning in Economics](../applications/ml_in_economics.md) -## [Social and Economic Networks](../applications/networks.md) +## [Social and Economic Networks](../applications/networks.md) -## [Case Study: Recidivism](../applications/recidivism.md) +## [Case Study: Recidivism](../applications/recidivism.md) -## [Working with Text](../applications/working_with_text.md) +## [Working with Text](../applications/working_with_text.md) -## [Heterogeneous Effects](../applications/heterogeneity.md) +## [Heterogeneous Effects](../applications/heterogeneity.md) \ No newline at end of file diff --git a/lectures/pandas/index.md b/lectures/pandas/index.md index 180c6c1f..00cb58d5 100644 --- a/lectures/pandas/index.md +++ b/lectures/pandas/index.md @@ -40,20 +40,20 @@ The expression "practice makes perfect" is especially true here. As you work with these tools, both the time to write and the time to run your programs will fall dramatically. 
-## [Introduction](../pandas/intro.md) +## [Introduction](../pandas/intro.md) -## [Basic Functionality](../pandas/basics.md) +## [Basic Functionality](../pandas/basics.md) -## [The Index](../pandas/the_index.md) +## [The Index](../pandas/the_index.md) -## [Storage Formats](../pandas/storage_formats.md) +## [Storage Formats](../pandas/storage_formats.md) -## [Cleaning Data](../pandas/data_clean.md) +## [Cleaning Data](../pandas/data_clean.md) -## [Reshape](../pandas/reshape.md) +## [Reshape](../pandas/reshape.md) -## [Merge](../pandas/merge.md) +## [Merge](../pandas/merge.md) -## [GroupBy](../pandas/groupby.md) +## [GroupBy](../pandas/groupby.md) -## [Time Series](../pandas/timeseries.md) \ No newline at end of file +## [Time Series](../pandas/timeseries.md) \ No newline at end of file From 5552e217ef9848404ce3da89476d3c894b56ad63 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Mon, 3 Jul 2023 19:29:17 -0700 Subject: [PATCH 10/11] fix fa icons --- lectures/applications/index.md | 10 +++++----- lectures/index.md | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lectures/applications/index.md b/lectures/applications/index.md index 3a991908..0d20b97d 100644 --- a/lectures/applications/index.md +++ b/lectures/applications/index.md @@ -15,12 +15,12 @@ In this part of the course, we will begin to apply the skills that you have lear includes using familiar tools in new applications and learning new tools that can be used for special types of analysis. 
-## [Machine Learning in Economics](../applications/ml_in_economics.md) +## [Machine Learning in Economics](../applications/ml_in_economics.md) -## [Social and Economic Networks](../applications/networks.md) +## [Social and Economic Networks](../applications/networks.md) -## [Case Study: Recidivism](../applications/recidivism.md) +## [Case Study: Recidivism](../applications/recidivism.md) -## [Working with Text](../applications/working_with_text.md) +## [Working with Text](../applications/working_with_text.md) -## [Heterogeneous Effects](../applications/heterogeneity.md) \ No newline at end of file +## [Heterogeneous Effects](../applications/heterogeneity.md) \ No newline at end of file diff --git a/lectures/index.md b/lectures/index.md index 16bb673a..a2f3f971 100644 --- a/lectures/index.md +++ b/lectures/index.md @@ -34,14 +34,14 @@ Course description, software installation ## [Python Fundamentals](../python_fundamentals/index.md) Basic Python programming -## [Scientific Computing with Numpy](../scientific/index.md) +## [Scientific Computing](../scientific/index.md) Numerical and scientific methods -## [Working With Data in Pandas](../pandas/index.md) +## [Working With Data](../pandas/index.md) The "data" in data science ## [Data Science Tools](../tools/index.md) Putting everything together -## [Applications](../applications/index.md) +## [Applications](../applications/index.md) Applying our skills to real economic data From db13ee0e9d365b73573bf97a93f9c16b8bf0e5f3 Mon Sep 17 00:00:00 2001 From: Phil Solimine <15682144+doctor-phil@users.noreply.github.com> Date: Tue, 4 Jul 2023 18:33:42 -0700 Subject: [PATCH 11/11] Update _config.yml fix mime type warnings --- lectures/_config.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lectures/_config.yml b/lectures/_config.yml index 9ad55646..5baf88ca 100644 --- a/lectures/_config.yml +++ b/lectures/_config.yml @@ -18,6 +18,7 @@ sphinx: config: suppress_warnings: ["mystnb.unknown_mime_type"] 
nb_mime_priority_overrides: [ + # HTML ['html', 'application/vnd.jupyter.widget-view+json', 10], ['html', 'application/javascript', 20], ['html', 'text/html', 30], @@ -27,6 +28,25 @@ sphinx: ['html', 'image/jpeg', 70], ['html', 'text/markdown', 80], ['html', 'text/plain', 90], + # Jupyter Notebooks + ['jupyter', 'application/vnd.jupyter.widget-view+json', 10], + ['jupyter', 'application/javascript', 20], + ['jupyter', 'text/html', 30], + ['jupyter', 'text/latex', 40], + ['jupyter', 'image/svg+xml', 50], + ['jupyter', 'image/png', 60], + ['jupyter', 'image/jpeg', 70], + ['jupyter', 'text/markdown', 80], + ['jupyter', 'text/plain', 90], + # LaTeX + ['latex', 'text/latex', 10], + ['latex', 'application/pdf', 20], + ['latex', 'image/png', 30], + ['latex', 'image/jpeg', 40], + ['latex', 'text/markdown', 50], + ['latex', 'text/plain', 60], + # Link Checker + ['linkcheck', 'text/plain', 10], ] html_favicon: _static/lectures-favicon.ico html_static_path: ['_static']