-
Notifications
You must be signed in to change notification settings - Fork 48
/
Module-4-Example-1.R
122 lines (73 loc) · 2.68 KB
/
Module-4-Example-1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# setwd("SET THE Working Director to THE PATH TO THIS DIRECTORY")
data <- read.csv("Datasets/CEO_salary.csv")
attach(data)
# Divid the Salary by 1000 to be able to better show the values.
salary1 <- salary/1000
# Now we create a New Data frame out of age, heit and Salary
data1 <- data.frame(age, height, salary1)
plot(salary1, age)
plot(salary1, height)
# You can also the following command to replace the column inside the original dataframe.
# data[,"salary"] <- salary1
# Calulate Correlation Coefficient
cor(data1)
# Plot Scatterplot Matrix
pairs(data1)
# Fit a Multiple Linear Regression model into data.
# Variables are Salary and age
lm(formula = salary1 ~ age + height)
# or better store the resulted model into a variable for further use.
m <- lm(salary1~age+height)
# Summar function can calculate almost everything that you need.
summary(m)
# Calulate R squared Manually and the P-Value
# Total Sume of Squared.
totalss <- sum((salary1 - mean(salary1))^2)
# Regression and Residual Sum of the Squered.
regss <- sum((fitted(m) - mean(salary1))^2)
resiss <- sum((salary1-fitted(m))^2)
# Calulate the F Value.
fstatistic <- (regss/2)/(resiss/97)
# The P-Value for F-Statistic.
pvalue <- 1-pf(fstatistic, df1=2, df2=97)
# Calulate R squared.
R2 <- regss/totalss
# Regression Diagnostics
# Residual Plots
resid(m)
par(mfrow=c(2,2))
plot(age, resid(m), axes=TRUE, frame.plot=TRUE, xlab='age', ylab='residue')
plot(height, resid(m), axes=TRUE, frame.plot=TRUE, xlab='height', ylab='residue')
plot(fitted(m), resid(m), axes=TRUE, frame.plot=TRUE, xlab='fitted values', ylab='residue')
hist(resid(m))
# influence function
# This function provides the basic quantities which are used in forming a wide variety
# of diagnostics for checking the quality of regression fits.
influence(m)
# Checking for influence points
# Cooks distance
cooks.dist <- cooks.distance(m)
# Where the cook's distance is higher than 4/(n-k-1) (k is the number if parameters in equation)
# 4/(102 - 2 -1) = 4/97
which(cooks.dist > (4/(nrow(data1)-2-1)))
#############################################################
# Identifying Influential Data Points
# https://newonlinecourses.science.psu.edu/stat501/node/340/
# Finding outliers
# IQR Outlier detection.
outlier_iqr <- function(x){
iqr <- IQR(x,na.rm = T,type = 7)
q <- quantile(x)
upper_bound = q[4]+(iqr*1.5)
lower_bound = q[2]-(iqr*1.5)
outliers <- which ((x > upper_bound) | (x < lower_bound))
return(outliers)
}
# The two axises , X_1 and X_2
print(outlier_iqr(age))
print(outlier_iqr(height))
# The Y-Axis outlers
print(outlier_iqr(salary1))
# How to remove data row from a dataframe
dataRemoved <- data1
dataRemoved <- dataRemoved[-c(1),]