Skip to content

Commit

Permalink
finished creating the flights dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Knicey committed Aug 14, 2024
1 parent d5f5d72 commit 0acb1c3
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
.quarto
docs
inst/doc
kaggle.json
70 changes: 70 additions & 0 deletions data-raw/flights.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
library(jsonlite)
library(readr)
library(tidyverse)
library(httr)
library(airports)

# Requires a Kaggle account and API key.
# The key is read from `kaggle.json` in the current working directory.
credentials_path <- "kaggle.json"
if (!file.exists(credentials_path)) {
  stop(
    "Kaggle credentials file not found: ", credentials_path,
    call. = FALSE
  )
}

credentials <- fromJSON(credentials_path)
username <- credentials$username
key <- credentials$key

# Fail early with a clear message rather than sending a request without
# usable credentials.
if (is.null(username) || is.null(key)) {
  stop("`kaggle.json` must contain `username` and `key`.", call. = FALSE)
}

# HTTP basic-auth configuration used by the download request below.
auth <- authenticate(user = username, password = key)

# Kaggle API download endpoint for `flights_sample_3m.csv`.
url <- paste0(
  "https://www.kaggle.com/api/v1/datasets/download/",
  "patrickzel/flight-delay-and-cancellation-dataset-2019-2023",
  "/flights_sample_3m.csv"
)

# Stream the response body straight to a temporary file instead of holding
# the whole archive in memory as a raw vector.
zip_file <- tempfile(fileext = ".zip")
response <- GET(url, auth, write_disk(zip_file))

# Check the status before touching the payload; on failure the file would
# contain an error body rather than a ZIP archive.
if (status_code(response) != 200) {
  unlink(zip_file)
  stop("Failed to download the dataset. Status code: ", status_code(response))
}

# Locate the CSV inside the archive by extension rather than by position.
files_in_zip <- unzip(zip_file, list = TRUE)
csv_entries <- grep(
  "\\.csv$", files_in_zip$Name,
  ignore.case = TRUE, value = TRUE
)
if (length(csv_entries) == 0) {
  unlink(zip_file)
  stop("The downloaded archive does not contain a CSV file.", call. = FALSE)
}
csv_file_name <- csv_entries[1]

# Read the CSV directly from the ZIP file, then remove the temporary archive.
flights_data_raw <- read_csv(unz(zip_file, csv_file_name))
unlink(zip_file)

# Count flights per ORIGIN, sort by descending count, and keep the first 100.
top_100_airports <- flights_data_raw |>
  count(ORIGIN, name = "total_flights", sort = TRUE) |>
  head(100)

# Fix the RNG seed so that re-running this script picks the same 10 airports
# and therefore regenerates the same dataset.
# NOTE(review): the selection will differ from any data file produced by an
# earlier, unseeded run of this script.
set.seed(2024)
sample_airports <- sample(top_100_airports$ORIGIN, 10)

# Number of flights per sampled ORIGIN for each (month, year) combination.
# lubridate is namespace-qualified so the script does not depend on
# `library(tidyverse)` attaching it.
flights_by_monthyear <- flights_data_raw |>
  filter(ORIGIN %in% sample_airports) |>
  mutate(
    month = lubridate::month(FL_DATE),
    year = lubridate::year(FL_DATE)
  ) |>
  group_by(ORIGIN, month, year) |>
  summarise(total_flights = n(), .groups = "drop")

# Highest and lowest monthly count across years for each ORIGIN and month.
# `.groups = "drop"` returns an ungrouped tibble; the default would leave the
# result grouped by ORIGIN.
flights_by_month <- flights_by_monthyear |>
  group_by(ORIGIN, month) |>
  summarise(
    max = max(total_flights),
    min = min(total_flights),
    .groups = "drop"
  )

# Attach airport name and coordinates to each ORIGIN/month row.
# NOTE(review): `usairports` is presumably provided by the `airports` package
# loaded at the top of this script -- confirm.
flights <- flights_by_month |>
  left_join(usairports, by = c("ORIGIN" = "location_id")) |>
  # Drop any grouping inherited from the summary step so the saved dataset is
  # a plain tibble rather than a grouped one.
  ungroup() |>
  select(
    ORIGIN, month, max, min,
    facility_name, arp_latitude, arp_longitude
  )

# Write `flights` into the package's data/ directory, replacing any old copy.
usethis::use_data(flights, overwrite = TRUE)
2 changes: 1 addition & 1 deletion data-raw/stations.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## code to prepare `stations` dataset goes here
library(tidyverse)
library(dplyr)
library(rnoaa)

station_data <- ghcnd_stations() |>
Expand Down
Binary file added data/flights.rda
Binary file not shown.

0 comments on commit 0acb1c3

Please sign in to comment.