-
Notifications
You must be signed in to change notification settings - Fork 48
/
R_Intro.R
365 lines (311 loc) · 8.45 KB
/
R_Intro.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
############
# Getting help and drawing a first plot
# ?name opens the help page for a known function
?lm
# ??keyword searches the installed help pages for a term
??linear
# scatter plot of the integers 1..20 against themselves
plot(1:20, 1:20)
############
# show the current working directory and list the files in it
getwd()
list.files()
############
############
#--------------------------------------------------
############
# Reading delimited text files with read.table / read.csv
?read.table
?read.csv
## Template call (placeholder path, shown for reference only):
## read.table("/path/to/your/file.ext",
## header=TRUE,
## sep=",",
## stringsAsFactors = FALSE)
# read.table's default separator is ok for this one
set0 <- read.table("../../data/readin/ReadMeIn0.txt",
header=TRUE)
# specify a new separator (comma)
set1 <- read.table("../../data/readin/ReadMeIn1.txt",
header=TRUE,
sep=',')
# Or use read.csv on the same file (this overwrites set1)
set1 <- read.csv("../../data/readin/ReadMeIn1.txt",
header=TRUE)
## # another change of separator (semicolon)
set2 <- read.table("../../data/readin/ReadMeIn2.txt",
header=TRUE,
sep=';')
# check for missing values: no header row, and empty fields are read as NA
set3 <- read.table("../../data/readin/ReadMeIn3.txt",
header=FALSE,
sep=',',
na.strings = '')
#--------------------------------------------------
##################
# write data to file
# Options are largely the same as their read counterparts
# row.names = FALSE is helpful to avoid having 1,2,3,... written as a variable/column
?write.csv
# NOTE(review): myRObject and the path below look like placeholders and are
# not defined in this script; substitute a real object and destination.
write.csv(myRObject, file="/path/to/save/spot/file.csv", row.names=FALSE)
#--------------------------------------------------
# saveRDS/readRDS are used to save and restore (a compressed version of) individual R objects
## # save our data set
saveRDS(set1,file="TstObj.rds")
## # get it back
newtst <- readRDS("TstObj.rds")
## # can save any R object. Try a vector
my.vector <- c(1,8,-100)
saveRDS(my.vector, file="JustAVector.rds")
#--------------------------------------------------
# We can save all variables in the current R workspace with save.image.
# We can load a saved workspace back in with load.
# R will ask you to save your work when you exit
## # Save all our work
save.image("AllMyWork.RData")
## # Reload it
load("AllMyWork.RData")
## # name given to default save
# load(".RData")
#--------------------------------------------------
# Let us learn some basic operations of R
# numeric types: integer, double
348
# character
"my string"
# logical
TRUE
FALSE
# arithmetic as you'd expect: ^ binds tighter than *, which binds tighter than +
43 + 1 * 2^4
# so too logical operators/comparison
TRUE | FALSE
# arithmetic is evaluated before the comparison
1 + 7 != 7
# Other logical operators:
# &, |, !
# <,>,<=,>=, ==, !=
## ---- datatypes2 ----------------------------------------------------------
# variable assignment is done with the <- operator
my.number <- 483
# the '.' above does nothing. we could have done:
# mynumber <- 483
# instead
# it's an Rism to use .'s in variable names.
# typeof() tells us the type
typeof(my.number)
# we can convert between types
my.int <- as.integer(my.number)
typeof(my.int)
# we can test for types
is.logical(my.int)
# the vector is the most important data structure
# create it with c()
my.vec <- c(1,2,67,-98)
# get some properties
str(my.vec)
length(my.vec)
# access elements with []
my.vec[3]
# an index vector selects several elements at once
my.vec[c(3,4)]
# can do assignment too; index 5 is one past the current end, so the vector grows
my.vec[5] <- 41.2
# other ways to create vectors
x <- 1:6
y <- seq(7,12,by=1)
# Operations get recycled through the whole vector
x + 1
x > 3
# Can do component-wise operations between vectors
x * y
x / y
# %/% is integer division
y %/% x
## # Try to guess what the following lines will do
## # Will it run at all? If so, what will it give?
## # Think about it and run to confirm
## # (run the lines in order: some of them modify my.vec in place)
7 -> w
w <- z <- 44
1 + TRUE
0 | 15 & 3
my.vec[2:4]
my.vec[-2]
my.vec[c(TRUE,FALSE,FALSE,TRUE,FALSE)]
my.vec[
sum(
c(TRUE,FALSE,FALSE,TRUE,TRUE)
)
] <- TRUE
my.vec[3] <- "I'm a string"
as.numeric(my.vec)
x[x>3]
x + c(1,2)
# matrices are 2d vectors.
# create using matrix()
my.matrix <- matrix(rnorm(20),nrow=4,ncol=5)
# rnorm(20) draws 20 random samples from a N(0,1) distribution
my.matrix
# note matrices are filled by column
# Get details
dim(my.matrix)
nrow(my.matrix)
ncol(my.matrix)
# Indexing is similar to vectors but with 2 dimensions
# get second row
my.matrix[2,]
# get first and last columns of row three
# (the matrix has 5 columns, so the last one is ncol(my.matrix), not 4)
my.matrix[3,c(1,ncol(my.matrix))]
# transposing is done with t()
# lists are similar to vectors but can contain items of different types
# create with list()
my.list <- list("just a string",
44,
my.matrix,
c(TRUE,TRUE,FALSE))
# access a single item via double brackets [[]]
my.list[[4]]
# access multiple items with single brackets (the result is itself a list)
my.list[1:2]
# list items can be named too
named.list <- list(Item1="my string",
Item2=my.list)
# access of a named item is via the dollar sign operator
# [[]] also works; both expressions below fetch the same first item
c(named.list$Item1,named.list[[1]])
# ---------------------------------------------------
# ---------------------------------------------------
## ----get iris dataset
data("iris")
# first few rows, then the structure (column names and types)
head(iris)
str(iris)
# Note iris is a data.frame; a data.frame is simply a list of columns.
## ----pca1
# Compute the correlation matrix
# R = (1/(n-1)) * X^T X
# Here X is our (centered and scaled) data matrix, n is the number of rows/observations in our data, and X^T is the transpose of X.
# (Hint: t(X) is the transpose operator and A%*%B performs matrix multiplication on the matrices A and B)
# (iris was loaded earlier with data("iris"))
# get numeric portions of list and make a matrix
X <- as.matrix(iris[1:4])
# center and scale
X <- scale(X,center = TRUE,scale=TRUE)
# get the number of rows
n <- nrow(X)
# compute correlation matrix
R <- (1/(n-1))*t(X)%*%X
# perform eigen decomposition
Reig <- eigen(R)
# get eigen vectors
Reig.vecs <- Reig$vectors
# create principal components: project the data onto the first two eigenvectors
pc1 <- X%*%Reig.vecs[,1]
pc2 <- X%*%Reig.vecs[,2]
## ----pcacomp
# compare to R's PCA function, using the same centering and scaling
their.pcs <-prcomp(iris[1:4],center = TRUE,scale. = TRUE)
# first two components from prcomp
head(their.pcs$x[,1:2])
# our result
# (NOTE(review): a column may differ from prcomp's by a sign flip, since
# eigenvectors are only defined up to sign — confirm when comparing)
head(cbind(pc1,pc2))
## ----pcascatter
# scatter plot of the two components, coloured by species
plot(pc1,pc2,col=iris$Species)
## ----factors,size='tiny'-------------------------------------------------
# Factors are like vectors, but with predefined allowed values called levels
# Factors are used to represent categorical variables in R
# create a factor
factor1 <- factor(c('Good','Bad','Ugly'))
# find its levels
levels(factor1)
# below gives a warning, but not an error (17 is not one of the levels)
factor1[4] <- 17
# see what happened
factor1
# 'Bad' is an existing level, so this assignment is accepted
factor1[4] <- 'Bad'
# get the breakdown (count per level)
table(factor1)
## ----mat_fac_Qs
## # presumably exercise questions (per the chunk label): predict what each
## # line does before running it; run them in order, since some lines modify
## # my.matrix in place
my.matrix[3:4,1:2] <- c(4,5)
my.matrix[4,5] <- 'string'
mf.strings <- c('F','F','M','F')
factor2 <- as.factor(mf.strings)
c(factor1, factor2)
factor1 == 'Ugly'
my.list[[3]][2,]
sum(c(1,2,3,NA))
sum(c(1,2,3,NA),na.rm = TRUE)
## ----df1-----------------------------------------------------------------
# create your own data frame: one numeric column and one factor column
my.df <- data.frame(
age = c(45,27,19,59,71,13,5),
gender = factor(c('M','M','M','F','M','F','F'))
)
str(my.df)
## ---- df2 -----------------------------------------------------
# columns are accessed with $
my.df$age
summary(my.df$age)
table(my.df$gender)
# data frames are really just lists, so [[2]] gives the second column
my.df[[2]]
## ---- df3 -----------------------------------------------------
# data.frames can be subsetted like matrices: [rows, columns]
my.df[1:3,c("age")]
# logical subsetting is especially useful for data.frames
# get ages over 40
age.logic <- my.df$age > 40
# take a subset of these rows
my.df[age.logic,]
# create a new variable age.sq
my.df$age.sq <- my.df$age^2
## ---- iris -------------------------------------------------
# work on a copy so the built-in iris data set stays untouched
my.iris <- iris
my.iris
## ----iris_sol1-----------------------------------------------------------
# new columns: sepal + petal length, and sepal + petal width, per flower
my.iris[["Length.Sum"]] <- my.iris[["Sepal.Length"]] + my.iris[["Petal.Length"]]
my.iris[["Width.Sum"]] <- my.iris[["Sepal.Width"]] + my.iris[["Petal.Width"]]
# logical mask selecting the setosa rows
setosa.inds <- my.iris$Species == 'setosa'
# mean of the summed lengths among the setosa rows
mean(my.iris[["Length.Sum"]][setosa.inds])
## ----control_syn ----------------------------------------------
## if(logical_expression){
## execute_code
## } else{
## execute_other_code
## }
##
## for(value in sequence){
## work_with_value
## }
##
## while(expression_is_true){
## execute_code
## }
##
## ----fundef--------------------------------------------------------------
# use the function keyword together with assignment <-
my.mean <- function(input.vector) {
  # Arithmetic mean computed by hand: add up the elements, divide by the count.
  n.values <- length(input.vector)
  total <- 0
  for (element in input.vector) {
    total <- total + element
  }
  # the value of the last expression is what gets returned; because that
  # expression is an assignment, the result comes back invisibly
  average <- total / n.values
}
# called on its own, nothing is printed: the function's last expression is an
# assignment, whose value is returned invisibly
my.mean(1:10)
# capture the returned value and print it explicitly
ans <- my.mean(1:10)
ans
## ----fundef2-------------------------------------------------------------
my.mean <- function(input.vector) {
  # Same hand-rolled mean as before, but the function's value is now 1.
  n.values <- length(input.vector)
  total <- 0
  for (element in input.vector) {
    total <- total + element
  }
  # the mean is computed and stored, but it is no longer the last expression
  discarded.mean <- total / n.values
  # returns 1 now: the last expression determines the function's value
  1
}
# prints 1: the value of the function's last expression, not the mean
my.mean(1:10)
## ----create_fun----------------------------------------------------------
my.summary <- function(x) {
  # Collect four summary statistics of x into a named list
  # with elements mean, sd, max and min (in that order).
  x.mean <- mean(x)
  x.sd <- sd(x)
  x.max <- max(x)
  x.min <- min(x)
  list(mean = x.mean, sd = x.sd, max = x.max, min = x.min)
}
## ----loop_sol,eval=FALSE-------------------------------------------------
# Walk over the columns of my.iris and print the maximum of each numeric one
for (var in my.iris) {
  # skip any column that is not numeric
  if (!is.numeric(var)) {
    next
  }
  tmp <- my.summary(var)
  print(tmp[["max"]])
}