R files added

save_data.R and analysis.R added
rladies · Feb 18, 2024 · 4e8b90f · 4e8b90f
1 parent 4eb2117
commit 4e8b90f
Show file tree

Hide file tree

Showing 2 changed files with 233 additions and 0 deletions.
diff --git a/Reproducible_pipelines_book_club/Meeting_05/analysis.R b/Reproducible_pipelines_book_club/Meeting_05/analysis.R
@@ -0,0 +1,97 @@
+# This script analyses housing data for Luxembourg
+library(dplyr)
+library(ggplot2)
+library(tidyr)
+
+#Let’s load the datasets:
+
+commune_level_data <- read.csv("datasets/commune_level_data.csv")
+country_level_data <- read.csv("datasets/country_level_data.csv")
+
+#Let’s compute the Laspeyeres index for each commune:
+
+commune_level_data <- commune_level_data |>
+  group_by(locality) |>
+  mutate(p0 = ifelse(year == "2010", average_price_nominal_euros, NA)) |>
+  fill(p0, .direction = "down") |>
+  mutate(p0_m2 = ifelse(year == "2010", average_price_m2_nominal_euros, NA)) |>
+  fill(p0_m2, .direction = "down") |>
+  ungroup() |>
+  mutate(pl = average_price_nominal_euros/p0*100,
+         pl_m2 = average_price_m2_nominal_euros/p0_m2*100)
+
+
+#Let’s also compute it for the whole country:
+
+country_level_data <- country_level_data |>
+  mutate(p0 = ifelse(year == "2010", average_price_nominal_euros, NA)) |>
+  fill(p0, .direction = "down") |>
+  mutate(p0_m2 = ifelse(year == "2010", average_price_m2_nominal_euros, NA)) |>
+  fill(p0_m2, .direction = "down") |>
+  mutate(pl = average_price_nominal_euros/p0*100,
+         pl_m2 = average_price_m2_nominal_euros/p0_m2*100)
+
+
+#We are going to create a plot for 5 communes and compare the price evolution in the communes
+#to the national price evolution. Let’s first list the communes:
+
+communes <- c("Luxembourg",
+              "Esch-sur-Alzette",
+              "Mamer",
+              "Schengen",
+              "Wincrange")
+
+make_plot <- function(country_level_data,
+                      commune_level_data,
+                      commune){
+
+filtered_data <- commune_level_data %>%
+    filter(locality == commune)
+
+  data_to_plot <- bind_rows(
+    country_level_data,
+    filtered_data
+  )
+
+  ggplot(data_to_plot) +
+    geom_line(aes(y = pl_m2,
+                  x = year,
+                  group = locality,
+                  colour = locality))
+}
+
+# Luxembourg
+
+lux_plot <- make_plot(country_level_data,
+                      commune_level_data,
+                      communes[1])
+# Esch sur Alzette
+
+esch_plot <- make_plot(country_level_data,
+                       commune_level_data,
+                       communes[2])
+
+# Mamer
+
+mamer_plot <-  make_plot(country_level_data,
+                         commune_level_data,
+                         communes[3])
+
+# Schengen
+
+schengen_plot <- make_plot(country_level_data,
+                           commune_level_data,
+                           communes[4])
+
+# Wincrange
+
+wincrange_plot <- make_plot(country_level_data,
+                            commune_level_data,
+                            communes[5])
+
+# Let’s save the plots
+ggsave("plots/lux_plot.pdf", lux_plot)
+ggsave("plots/esch_plot.pdf", esch_plot)
+ggsave("plots/mamer_plot.pdf", mamer_plot)
+ggsave("plots/schengen_plot.pdf", schenger_plot)
+ggsave("plots/wincrange_plot.pdf", wincrange_plot)
diff --git a/Reproducible_pipelines_book_club/Meeting_05/save_data.R b/Reproducible_pipelines_book_club/Meeting_05/save_data.R
@@ -0,0 +1,136 @@
+# Download raw Excel
+
+url <- "https://github.com/b-rodrigues/rap4all/raw/master/datasets/vente-maison-2010-2021.xlsx"
+
+# Shortened url
+#url <- "https://is.gd/1vvBAc"
+
+raw_data <- tempfile(fileext = ".xlsx")
+
+download.file(url, raw_data, method = "auto", mode = "wb")
+
+sheets <- excel_sheets(raw_data)
+
+read_clean <- function(..., sheet){
+  read_excel(..., sheet = sheet) |>
+    mutate(year = sheet)
+}
+
+raw_data <- map(
+  sheets,
+  ~read_clean(raw_data,
+              skip = 10,
+              sheet = .)
+                   ) |>
+  bind_rows() |>
+  clean_names()
+
+raw_data <- raw_data |>
+  rename(
+    locality = commune,
+    n_offers = nombre_doffres,
+    average_price_nominal_euros = prix_moyen_annonce_en_courant,
+    average_price_m2_nominal_euros = prix_moyen_annonce_au_m2_en_courant,
+    average_price_m2_nominal_euros = prix_moyen_annonce_au_m2_en_courant
+  ) |>
+  mutate(locality = str_trim(locality)) |>
+  select(year, locality, n_offers, starts_with("average"))
+
+
+str(raw_data)
+
+# Let's take a look at the spelling
+raw_data |>
+  dplyr::filter(grepl("Luxembourg", locality)) |>
+  dplyr::count(locality)
+
+
+raw_data |>
+  dplyr::filter(grepl("P.tange", locality)) |>
+  dplyr::count(locality)
+
+
+# Deal with spelling
+
+raw_data <- raw_data |>
+  mutate(locality = ifelse(grepl("Luxembourg-Ville", locality),
+                           "Luxembourg",
+                           locality),
+         locality = ifelse(grepl("P.tange", locality),
+                           "Pétange",
+                           locality)
+         ) |>
+  mutate(across(starts_with("average"), as.numeric))
+
+# Check if missing data
+raw_data |>
+  filter(is.na(average_price_nominal_euros))
+
+
+# Remove rows citing the source
+raw_data <- raw_data |>
+  filter(!grepl("Source", locality))
+
+#Keep commune level data
+commune_level_data <- raw_data |>
+    filter(!grepl("nationale|offres", locality),
+           !is.na(locality))
+
+# Keep country level data
+country_level <- raw_data |>
+  filter(grepl("nationale", locality)) |>
+  select(-n_offers)
+
+offers_country <- raw_data |>
+  filter(grepl("Total d.offres", locality)) |>
+  select(year, n_offers)
+
+country_level_data <- full_join(country_level, offers_country) |>
+  select(year, locality, n_offers, everything()) |>
+  mutate(locality = "Grand-Duchy of Luxembourg")
+
+
+# We need to check if communes are all in our data
+current_communes <- "https://is.gd/lux_communes" |>
+  rvest::read_html() |>
+  rvest::html_table() |>
+  purrr::pluck(2) |>
+  janitor::clean_names() |>
+  dplyr::filter(name_2 != "Name") |>
+  dplyr::rename(commune = name_2) |>
+  dplyr::mutate(commune = stringr::str_remove(commune, " .$"))
+
+# Test if all communes are there
+setdiff(unique(commune_level_data$locality), current_communes$commune)
+
+# We need former communes
+former_communes <- "https://is.gd/lux_former_communes" |>
+  rvest::read_html() |>
+  rvest::html_table() |>
+  purrr::pluck(3) |>
+  janitor::clean_names() |>
+  dplyr::filter(year_dissolved > 2009)
+
+former_communes
+
+# Put former and current communes together
+communes <- unique(c(former_communes$name, current_communes$commune))
+# we need to rename some communes
+
+# Different spelling of these communes between wikipedia and the data
+
+communes[which(communes == "Clemency")] <- "Clémency"
+communes[which(communes == "Redange")] <- "Redange-sur-Attert"
+communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange"
+communes[which(communes == "Luxembourg City")] <- "Luxembourg"
+communes[which(communes == "Käerjeng")] <- "Kaerjeng"
+communes[which(communes == "Petange")] <- "Pétange"
+
+
+# Test if this set is empty, if yes, we're good
+setdiff(unique(commune_level_data$locality), communes)
+
+# save the data (uncomment if you need to save)
+# you may need to create the `datasets` folder first
+write.csv(commune_level_data, "datasets/commune_level_data.csv", row.names = TRUE)
+write.csv(country_level_data, "datasets/country_level_data.csv", row.names = TRUE)