forked from YouSayData/DSR_Intermediate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheda.R
433 lines (316 loc) · 15.9 KB
/
eda.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
# Introduction to EDA -----------------------------------------------------
# Exploratory Data Analysis (EDA) is an iterative cycle:
# 1. Generate questions about your data.
# 2. Search for answers by visualising, transforming, and modelling your data.
# 3. Use what you learn to refine your questions and/or generate new questions.
# It's not a formal process though.
# It's a creative process.
# “There are no routine statistical questions, only questionable statistical routines.” — Sir David Cox
# Although two questions are very often useful:
# 1. What type of variation occurs within my variables?
# 2. What type of covariation occurs between my variables?
# Variation is the tendency of the values of a variable to change from measurement to measurement.
# If variation describes the behavior within a variable, covariation describes the behavior between variables.
# Covariation is the tendency for the values of two or more variables to vary together in a related way.
# Some defs
# A variable is a quantity, quality, or property that you can measure.
# A value is the state of a variable when you measure it. The value of a variable may change from measurement to measurement.
# An observation is a set of measurements made under similar conditions (you usually make all of the measurements in an observation at the same time and on the same object).
# Tabular data is a set of values, each associated with a variable and an observation.
# Libraries Used ----------------------------------------------------------
library(tidyverse)
# install.packages("viridis")
library(viridis) # for accessibility
library(ggthemes)
library(hexbin)
library(modelr)
# Variation of 1 variable -------------------------------------------------
# Discrete variables: Distribution ------------------------------------
?diamonds
# Observe via plot
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
# Observe via table
diamonds %>%
count(cut) %>%
rename(count = n) %>%
mutate(perc = count / sum(count))
# Sidetrack: Pie Chart ----------------------------------------------------
# If you must...
diamond_percent <- diamonds %>%
count(cut) %>%
rename(count = n) %>%
mutate(perc = count / sum(count))
(pie <- ggplot(diamond_percent, aes(x="", y=perc, fill=cut))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start=0))
pie + scale_fill_grey() + theme_minimal()
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
pie +
scale_fill_viridis(discrete=TRUE) +
blank_theme +
theme(axis.text.x=element_blank()) +
geom_text(aes(label = paste0(round(perc, 1), "%")),
position = position_stack(vjust = 0.6),
color = "white", fontface='bold')
# back to class
# Continious variables: Distribution --------------------------------------
# via plot
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat), binwidth = 0.5)
# via table
diamonds %>%
count(cut_width(carat, 0.5))
# explore different bins
smaller <- diamonds %>%
filter(carat < 3)
bigger <- diamonds %>%
filter(carat >= 3)
ggplot(data = smaller, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.1)
# Sidetrack: ggthemes provides nice themes --------------------------------
ggplot(data = smaller) +
theme_tufte() +
geom_histogram(mapping = aes(x = carat), binwidth = 0.1) +
theme(axis.title=element_blank()) +
annotate("text", x = 2.3, y = 7500, adj=1, family="serif",
label = c("Count of diamonds per\ncarat (binwidth = 0.1)."))
# Return to distribution of continious variables --------------------------
smaller %>%
count(cut_width(carat, 0.1))
# by type
ggplot(data = smaller,
mapping = aes(x = carat, colour = cut)) +
geom_freqpoly(binwidth = 0.1)
# Follow up questions, one should ask ------------------------------------
# Which values are the most common? Why?
# Which values are rare? Why? Does that match your expectations?
# Can you see any unusual patterns? What might explain them?
# Let's have a look
ggplot(data = smaller, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.01)
# Why are there more diamonds at whole carats and very common fractions of carats?
# Why are there more diamonds slightly to the right of each peak than there are slightly to the left of each peak?
# Why are there no diamonds bigger than 3 carats?
ggplot(data = bigger, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.01)
# Clusters of similar values suggest that subgroups exist in your data.
# How are the observations within each cluster similar to each other?
# How are the observations in separate clusters different from each other?
# How can you explain or describe the clusters?
# Why might the appearance of clusters be misleading?
# Let's have a look at another example
?faithful
ggplot(data = faithful, mapping = aes(x = eruptions)) +
geom_histogram(binwidth = 0.25) +
xlab("Duration of Eruptions")
# maybe the duration of the eruption is affected by another variable?
# Outliers ----------------------------------------------------------------
# what is suspicious here?
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5)
# maybe we can change the viewpoint
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
unusual <- diamonds %>%
filter(y < 3 | y > 20) %>%
select(price, x, y, z) %>%
arrange(y)
unusual
# Repeat later experiments with and without outliers
# If outliers have minimal effect on the results, and you can’t figure out why they’re there, it’s reasonable to replace them with missing values, and move on.
# If they have a substantial effect on your results, you shouldn’t drop them without justification. You’ll need to figure out what caused them (e.g. a data entry error) and disclose that you removed them in your write-up.
# Exercises I -------------------------------------------------------------
# 1. Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.
# 2. Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)
# 3. How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?
# 4. Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?
# Dealing with outliers ---------------------------------------------------
# I decided to drop the unusual values, how can I do this?
# 1. Really drop them.
diamonds2 <- diamonds %>%
filter(between(y, 3, 20))
# 2. NA them
diamonds2 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point()
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point(na.rm = TRUE)
# Reason for NAs ----------------------------------------------------------
# often there is a reason for na, which we can use
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(mapping = aes(sched_dep_time)) +
geom_density(mapping = aes(fill = cancelled), alpha = .3)
cancelledFlights <- nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time) * 1,
not_cancelled = !is.na(dep_time) * 1,
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60,
quarterlies = cut(sched_dep_time,
breaks = seq(from = 0, to = 24, .5),
labels = seq(from = .5, to = 24, .5))) %>%
group_by(quarterlies) %>%
summarise(cancelled = sum(cancelled),
not_cancelled = sum(not_cancelled)) %>%
mutate(count = cancelled + not_cancelled,
per_cancelled = cancelled / count) %>%
filter(per_cancelled != 1)
cancelledFlights %>%
ggplot(aes(x = quarterlies)) +
geom_col(aes(y = count)) +
geom_line(aes(y = per_cancelled * max(count), group = 1)) +
scale_y_continuous(
sec.axis = sec_axis(~./max(cancelledFlights$count))
)
# Exercise II -------------------------------------------------------------
# 1. What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?
# 2. What does na.rm = TRUE do in mean() and sum()?
# Covariation -------------------------------------------------------------
# Categorical and continuous variable
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
# Too much difference in the counts of the groups
ggplot(diamonds) +
geom_bar(mapping = aes(x = cut))
# If there is too much difference in the distribution geom_density might be better
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_density(mapping = aes(colour = cut))
# Alternative boxplot
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()
# Alternative violin
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_violin()
# Alternative random-scatter
ggplot(data = diamonds %>% slice(sample(1:nrow(diamonds), 10000)), mapping = aes(x = cut, y = price)) +
geom_jitter(alpha = .2)
#install.packages("ggbeeswarm")
library(ggbeeswarm)
ggplot(data = diamonds, mapping = aes(x = cut, y = price, col = cut)) +
geom_quasirandom()
ggplot(data = diamonds %>% filter(cut %in% c("Premium", "Ideal")) %>% slice(sample(1:nrow(diamonds), 250)), mapping = aes(x = cut, y = price)) +
geom_beeswarm(size = 1)
ggplot(data = diamonds) +
theme_tufte() +
geom_tufteboxplot(mapping = aes(x = cut, y = price)) +
theme(axis.title=element_blank()) +
annotate("text", x = 7, y = 16000, adj=1, family="serif",
label = c("Price per\nquality of cut\nin Diamonds."))
# why are better cut diamonds cheaper?!?
diamonds$cut[1]
# you will have to answer that question, but first let's use order to get the hang of it
# using order
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot()
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, median), y = hwy))
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = fct_reorder(class, hwy, median), y = hwy))
ggplot(data = diamonds) +
theme_tufte() +
geom_tufteboxplot(mapping = aes(x = reorder(cut, price, FUN = mean), y = price)) +
theme(axis.title=element_blank()) +
annotate("text", x = 7, y = 16000, adj=1, family="serif",
label = c("Price per\nquality of cut\nin Diamonds."))
ggplot(data = diamonds) +
theme_tufte() +
geom_tufteboxplot(mapping = aes(x = reorder(cut, price, FUN = median), y = price)) +
theme(axis.title=element_blank()) +
annotate("text", x = 7, y = 16000, adj=1, family="serif",
label = c("Price per\nquality of cut\nin Diamonds."))
# long variable names work better flipped
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
coord_flip()
ggplot(data = mpg) +
geom_boxplot(mapping = aes(y = reorder(class, hwy, FUN = median), x = hwy))
# Exercise III ------------------------------------------------------------
# 1. Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.
# 2. What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(aes(sched_dep_time)) +
geom_density(aes(fill = cancelled), alpha = 1/5)
# Two categorical values --------------------------------------------------
ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))
diamonds %>%
count(color, cut)
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))
# Exercise IV -------------------------------------------------------------
# 1. How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?
# 2. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
# 3. Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?
# Two continuous variables
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))
# changing alpha can help overplotting
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100)
# other options
ggplot(data = smaller) +
geom_bin2d(mapping = aes(x = carat, y = price))
ggplot(data = smaller) +
geom_density2d(mapping = aes(x = carat, y = price))
ggplot(data = smaller) +
geom_hex(mapping = aes(x = carat, y = price))
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_violin(mapping = aes(group = cut_width(carat, 0.1)))
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_number(carat, 20)))
# Exercise V --------------------------------------------------------------
# 1. Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?
# 2. Visualise the distribution of carat, partitioned by price.
# 3. How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?
# 4. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.
# 5. Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the plot below have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately.
# Why is a scatterplot a better display than a binned plot for this case?
ggplot(data = diamonds) +
geom_point(mapping = aes(x = x, y = y)) +
coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
# Patterns and Models -----------------------------------------------------
# Patterns in your data provide clues about relationships. If a systematic relationship exists between two variables it will appear as a pattern in the data. If you spot a pattern, ask yourself:
# Could this pattern be due to coincidence (i.e. random chance)?
# How can you describe the relationship implied by the pattern?
# How strong is the relationship implied by the pattern?
# What other variables might affect the relationship?
# Does the relationship change if you look at individual subgroups of the data?
ggplot(data = faithful) +
geom_point(mapping = aes(x = eruptions, y = waiting))
# Models are a tool for extracting patterns out of data
mod <- lm(log(price) ~ log(carat), data = diamonds)
diamonds2 <- diamonds %>%
add_residuals(mod) %>%
mutate(resid = exp(resid))
ggplot(data = diamonds2) +
geom_point(mapping = aes(x = carat, y = resid))
ggplot(data = diamonds2) +
geom_boxplot(mapping = aes(x = cut, y = resid))