-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2 - Foundry Intro.py
162 lines (143 loc) · 4.62 KB
/
2 - Foundry Intro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.14.4
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
# # Introduction
#
# In this notebook, we give a comprehensive introduction to the functionality of `foundry`. Intended as a supplement to `scikit-learn`, this is a demonstration of how the current packages in foundry — (1) `foundry.preprocessing`, (2) `foundry.glm`, and (3) `foundry.evaluation` — support work in machine learning, predictive analytics, and explainable models.
# +
import numpy as np
import pandas as pd
from plotnine import *
from sklearn.compose import make_column_selector
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn import svm
from foundry.glm import Glm
from foundry.preprocessing import (
ColumnDropper,
DataFrameTransformer,
InteractionFeatures,
as_transformer,
identity,
)
from data.uci import get_online_news_dataset, get_census_dataset
# +
X, y = get_census_dataset()
categorical_features = ("workclass", "education", "married", "occupation", "relationship", "race", "sex", "country")
# -
# Cleaning
# "Holand-Netherlands" has only a single example, so recode it as the
# missing-value marker and let the row-dropping step below remove it.
X.loc[X["country"] == "Holand-Netherlands", "country"] = '?'
# The census data encodes missing values as "?": drop every row that
# contains one in any column, keeping X and y aligned.
has_missing = (X == "?").any(axis=1)
X = X.loc[~has_missing, :]
y = y.loc[~has_missing]
# Education is ordinal, so encode it as an ordered categorical (lowest first).
education_levels = [
    'Preschool',
    '1st-4th',
    '5th-6th',
    '7th-8th',
    '9th',
    '10th',
    '11th',
    '12th',
    'HS-grad',
    'Prof-school',
    'Assoc-voc',
    'Assoc-acdm',
    'Some-college',
    'Bachelors',
    'Masters',
    'Doctorate',
]
X["education"] = pd.Categorical(X["education"], categories=education_levels, ordered=True)
# Remaining categorical features are nominal; the target is categorical too.
X = X.astype({feature: "category" for feature in categorical_features})
y = y.astype("category")
# # Train a model with feature engineering
# On newer versions of sklearn (or if np.ndarray output is acceptable in
# place of pd.DataFrame), sklearn.compose.ColumnTransformer could be used
# instead of DataFrameTransformer.
feature_engineering = DataFrameTransformer(
    transformers=[
        # Nominal features -> indicator columns.
        ("onehot", OneHotEncoder(), categorical_features),
        # Heavy-tailed capital columns: log1p, then standardize.
        (
            "log1p",
            make_pipeline(as_transformer(np.log1p), StandardScaler()),
            make_column_selector("capital"),
        ),
        # Remaining numeric features: standardize only.
        (
            "scale",
            StandardScaler(),
            ("age", "education_years", "capitalgain", "capitalloss", "hoursperweek"),
        ),
    ]
)
# Cross every scaled numeric feature with every one-hot indicator column.
interactions = InteractionFeatures(
    [(make_column_selector("scale"), make_column_selector("onehot"))]
)
classifier: Pipeline = make_pipeline(
    feature_engineering,
    interactions,
    Glm("categorical"),
)
# +
# Optional: 10-fold cross-validated accuracy for the pipeline above.
# Left commented out in the source; uncomment to reproduce.
# (Fixed: this previously referenced `final_pipeline`, a name that does not
# exist in this file — the pipeline is named `classifier`.)
# results = cross_validate(
#     classifier,
#     X, y, cv=StratifiedKFold(shuffle=True, random_state=100, n_splits=10),
#     scoring='accuracy',
#     fit_params={"glm__verbose": False, "glm__estimate_laplace_coefs": False},
# )
# print(results)
# print(f"average accuracy: {results['test_score'].mean()}")
# -
# Fit on the full (cleaned) dataset; laplace-coefficient estimation is
# skipped here, presumably for speed — TODO confirm.
trained_model = classifier.fit(X, y, glm__estimate_laplace_coefs=False)
# ## Evaluation - Marginal Effects
from foundry.evaluation import MarginalEffects

# Binary target shared by every marginal-effects plot below:
# 1.0 if income is ">50K", else 0.0.
y_binary = y.map(lambda item: 1 if item == ">50K" else 0).astype(float)


def fit_marginal_effects(vary_features):
    """Return a MarginalEffects evaluator for ``trained_model``, fitted on the
    census data with ``vary_features`` varied and results grouped by occupation.

    Extracted because the same fit boilerplate was repeated for each plot.
    """
    me = MarginalEffects(pipeline=trained_model)
    me(
        X,
        y_binary,
        vary_features=vary_features,
        groupby_features=["occupation"],
        marginalize_aggfun=None,
        y_aggfun='mean',
    )
    return me


me = fit_marginal_effects(["sex"])
me.plot() + theme(axis_text_x=element_text(rotation=90, hjust=1))
me = fit_marginal_effects(["education"])
me.plot(include_actuals=False) + theme(axis_text_x=element_text(rotation=90, hjust=1)) + ylab("P(Income > 50k)")
me = fit_marginal_effects(["education_years"])
me.plot(include_actuals=True) + theme(axis_text_x=element_text(rotation=90, hjust=1))