-
Notifications
You must be signed in to change notification settings - Fork 0
/
Week5.rmd
103 lines (89 loc) · 2.98 KB
/
Week5.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
---
title: "Week5"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Install library
I am using DBI library for database access and tidyverse library for data manipulation.
```{r}
library(DBI)
library(tidyverse)
```
## Connect to SQL Server
I will create ODBC connection object
```{r}
con <- dbConnect(odbc::odbc(),
.connection_string = "Driver={SQL Server};Server=(local);Database=tb;Trusted_Connection=Yes;")
```
## Create tables
I will create three tables with foreign key references and primary keys.
I will use a “wide” structure for tables.
```{r}
rs = dbSendStatement(con, "DROP TABLE IF EXISTS dbo.AirCurrent;
DROP TABLE IF EXISTS dbo.Airline;
DROP TABLE IF EXISTS dbo.AirStatus;
CREATE TABLE dbo.Airline (
AirlineID INT NOT NULL PRIMARY KEY,
Airline VARCHAR(30) NOT NULL );
CREATE TABLE dbo.AirStatus (
AirStatusID INT NOT NULL PRIMARY KEY,
AirStatus VARCHAR(30) NOT NULL );
CREATE TABLE dbo.AirCurrent (
AirlineID INT NOT NULL FOREIGN KEY REFERENCES Airline(AirlineID),
AirStatusID INT NOT NULL FOREIGN KEY REFERENCES AirStatus(AirStatusID),
[Los_Angeles] INT NOT NULL,
Phoenix INT NOT NULL,
[San Diego] INT NOT NULL,
[San Francisco] INT NOT NULL,
Seattle INT NOT NULL,
CONSTRAINT PK_AirCurrent PRIMARY KEY (AirlineID,AirStatusID));")
dbClearResult(rs)
```
## Load data into SQL table
Because number of records is small I will use INSERT statement instead of bulk load from csv files.
```{r}
rs = dbSendStatement(con, "INSERT INTO dbo.Airline (AirlineID, Airline) VALUES
(1, 'ALASKA'),
(2, 'AM WEST');
INSERT INTO dbo.AirStatus (AirStatusID, AirStatus) VALUES
(1, 'on time'),
(2, 'delayed');
INSERT INTO dbo.AirCurrent(AirlineID,AirStatusID,Los_Angeles,Phoenix,[San Diego],[San Francisco],Seattle)
VALUES
(1, 1, 497,221,212,503,1841),
(1,2,62,12,20,102,305),
(2,1,694,4840,383,320,201),
(2,2,117,416,65,129,61);")
dbClearResult(rs)
```
## Load data into R data.frame
I will create a SELECT statement which will join three tables.
```{r}
rs <- dbSendQuery(con, "SELECT Airline.Airline, AirStatus.AirStatus, Los_Angeles,Phoenix,[San Diego],[San Francisco],Seattle
FROM dbo.AirCurrent AirCurrent
JOIN dbo.Airline Airline ON AirCurrent.AirlineID = Airline.AirlineID
JOIN dbo.AirStatus AirStatus ON AirCurrent.AirStatusID = AirStatus.AirStatusID;")
df = data.frame(dbFetch(rs))
dbClearResult(rs)
```
## Show query result
```{r}
head(df)
```
## Convert R dataframe to tibble
```{r}
tbl = tibble(df)
tbl
```
# tidy and transform data
I will use pivot_wider to pivoting original tibble to wide format
```{r}
tbl2 = tbl %>% pivot_wider(names_from = AirStatus, values_from = c(Los_Angeles,Phoenix,San.Diego,San.Francisco,Seattle))
```
## Perform analysis to compare the arrival delays for the two airlines
```{r}
tbl3 =mutate(tbl2, arrival_delays = Los_Angeles_delayed+Phoenix_delayed+San.Diego_delayed+San.Francisco_delayed+Seattle_delayed)
select(tbl3,Airline, arrival_delays)
```