From 0acb1c3324d990d906831fb55fcd2dfa4dc6f413 Mon Sep 17 00:00:00 2001 From: Nathan Yang Date: Tue, 13 Aug 2024 21:31:10 -0400 Subject: [PATCH] finished creating the flights dataset --- .gitignore | 1 + data-raw/flights.R | 70 ++++++++++++++++++++++++++++++++++++++++++++ data-raw/stations.R | 2 +- data/flights.rda | Bin 0 -> 1878 bytes 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 data-raw/flights.R create mode 100644 data/flights.rda diff --git a/.gitignore b/.gitignore index 8a6f2db..df54d9a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ .quarto docs inst/doc +kaggle.json diff --git a/data-raw/flights.R b/data-raw/flights.R new file mode 100644 index 0000000..9bd5f1f --- /dev/null +++ b/data-raw/flights.R @@ -0,0 +1,70 @@ +library(jsonlite) +library(readr) +library(tidyverse) +library(httr) +library(airports) + +#Requires a kaggle account and API key + +credentials = fromJSON("kaggle.json") +username = credentials$username +key = credentials$key + +auth <- authenticate(user = username, password = key) + +url <- paste0("https://www.kaggle.com/api/v1/datasets/download/", + "patrickzel/flight-delay-and-cancellation-dataset-2019-2023" + , "/flights_sample_3m.csv") + +response <- GET(url, auth) + +if (response$status_code == 200) { + # Save the content to a temporary file + zip_file <- tempfile(fileext = ".zip") + writeBin(content(response, "raw"), zip_file) + + files_in_zip <- unzip(zip_file, list = TRUE) + + csv_file_name <- files_in_zip$Name[1] + + # Read the CSV directly from the ZIP file + flights_data_raw <- read_csv(unz(zip_file, csv_file_name)) + +} else { + stop("Failed to download the dataset. Status code: ", response$status_code) +} + +top_100_airports <- flights_data_raw |> + group_by(ORIGIN) |> + summarise( + total_flights = n() + ) |> + ungroup() |> + arrange(desc(total_flights)) |> + head(100) + +sample_airports <- sample(top_100_airports$ORIGIN, 10) + +flights_by_monthyear <- flights_data_raw |> + filter(ORIGIN %in% sample_airports) |> + mutate( + month = month(FL_DATE), + year = year(FL_DATE) + ) |> + group_by(ORIGIN, month, year) |> + summarise( + total_flights = n() + ) + +flights_by_month <- flights_by_monthyear |> + group_by(ORIGIN, month) |> + summarise( + max = max(total_flights), + min = min(total_flights), + ) + +flights <- left_join(flights_by_month, usairports, + by = c("ORIGIN" = "location_id")) |> + select(ORIGIN, month, max, min, facility_name, arp_latitude, arp_longitude) + +usethis::use_data(flights, overwrite = TRUE) diff --git a/data-raw/stations.R b/data-raw/stations.R index 219eada..9af21a8 100644 --- a/data-raw/stations.R +++ b/data-raw/stations.R @@ -1,5 +1,5 @@ ## code to prepare `stations` dataset goes here -library(tidyverse) +library(dplyr) library(rnoaa) station_data <- ghcnd_stations() |> diff --git a/data/flights.rda b/data/flights.rda new file mode 100644 index 0000000000000000000000000000000000000000..28da0127343f9df0433e7cbbefe478d73c5bad3c GIT binary patch literal 1878 zcmV-c2dVf%T4*^jL0KkKS;-8R!3>6tGypU- z14AZ2003mrWFXK3Ks3TK8UPq3L8gODgFs}^0g;GlkOl~3G-;p#p`aQWG5`PrCW9b} zs!V_Y00w{qO#lXf&;S4iBR~KE01W`h00006KmcR_6sU-pO(vQen5NNB(?uSpfZCd5 z0MGycJwN~fra)u>000dD8X5rgJxu@sqs2LSvHyw1I+LwC>EtJro?3gld8zCZk6(AE zvsmzSZ0c0Hn{{ZN)dz1(XSuFdUMG#inUfR=U{VRIQ(CZ#74;x70W>=A^Ri~lx?ibe z^ZKYeILP)@U?L@*Kq4c_9G_^de<6~*KUHm?RTvW8Z&)P>YXrDe+*klKs)Z_)%*&Dh zG_*v;3^dh*(5RKlP1vIV89eDT)dee`94H}CB%!6FJJF=CBAEe6xk6sEvJ2`Gt}dpj zYX!Bnc7_rM31Ho>rw(a?)7pwh=3smz`yC>gTk*ClvYNX1;6*wWg{UqtmKqc8mkSl~8q=^oe_ z*8nq*GFKM}%|`0I@!sbyu$`&<6z8mSCr9M#H!2+KpYCD&Q|KOOXC`<<*td0WKIjmc zx^d~GzD7&%)_iOIa<+DMkHGJq-rY`aL9kVit_Fa?#eF}b;8+^YCGs2Z=fU2pKFfpi z!({*Iwy2vzf8(WHh;2Z8mC>NIL_sqB3XzF0-+fpJfL-7$CH`#S3J?Y7r4`&D@k7Bl z@ef~D9TDiBG88uAI2<#L+BJH-lHrIN1r&G9g2Y@sK7V7U$NF5oPcziefOxto^0@ZY z#y#?w$L!M>>!1;oApk_72m(1|APDLK07S5e5C+=i#d#*@?FlSko$1wJ@&5jHWcjTHv%y1``U=hEp9y&|Hfpg7*db zdDG`9iepM>U6RE%mP=t|zxccby}>b76An1aV)YqJ(aRHDdL`0dB3^l9kEAa|wiczZ zurENii$c(_TT8rK3u$OuOKD)TTT2Ck(6*MrWL(9xyCvqA2R}zKdk$2^F~GR`1&VPl zMXfd#G%S}jH7sgcmqBZ|u}*^H{4bWzMcML8ti3Y9Ytk*^!F!Uyd>M9JsnMEtb4@Xt zW}Q0IIi`iJGp|Ji-{H`=3kA+wi-B+~mJ6J)TuXsV^-GP_;^1B}axOy9xE6)dSuG2R z91DoBSQZNc$zWW0i(z247NxLU#i1G&3j)Dlw(;(C7ZThpp>4cd%VBl}&R8x*lF4ot zNp5rW7cp)v#kgCD?G{C}uv?o;X=+(5OK7kx7R^gS$mqr~zd7?dsLw)WDVggf=NE*$ zCFK?izFs21f2n9%7l5}G3(PM#w-=UPtiPzeX>cqSv2&MU>dVkCp)TCN;IS`aSeDbM zyhXUOUh-P-ESDn5WU^f)jd}~9T*dt5+bow^b<5kAVqL=GT9>~UF?z!9mq!A}km;9H zyG6=h>|O_qU1h%o+b$y1uv+ooF2QzPvg&Xy__vZ|y9KzoPIAe1bYh{fy-{@*OG5ub zwo7q$i;H};mvCK@$#j;>xi3dJ?@JWz77K}XiMkus##}U( z^7G5PFZ(SEk#QG-UQunAh_NqpTzSi!yG7%YcMAoxthvj#FH~CEMT5Y_Fp|fFxCv4S z&_@b<1FRz;B7_Rl&*9B0DM^mQP{NJ~qLX$}A_~qz2tcA7>NGpZ+$5vEcT%!juSP{N zV+d|1ibW_Rr;=1!w+e48rc2SsSl3q6j;_c+lQ$N(lNF66rsu4ZRQ^X2k*fEshD14mG_49k~Tsb<~?|D8(k0YaNOE`1kbnj%> z7sk1=Y|d{_6=w&Nif0(ZTrptHi%f09Ts6iqXE|$}u}Ce|u}fQut`KpGfa1~B5E}q= Q96#djNT&)C86lX8z%bBh+yDRo literal 0 HcmV?d00001