Rmd Files added

rladies · Feb 20, 2024 · 100a75b · 100a75b
1 parent 381dd34
commit 100a75b
Show file tree

Hide file tree

Showing 2 changed files with 341 additions and 0 deletions.
diff --git a/Reproducible_pipelines_book_club/Meeting_05/analysis.Rmd b/Reproducible_pipelines_book_club/Meeting_05/analysis.Rmd
@@ -0,0 +1,113 @@
+---
+title: "Nominal house prices data in Luxembourg"
+author: "Bruno Rodrigues"
+date: "`r Sys.Date()`"
+---
+
+```{r, message = FALSE}
+library(dplyr)
+library(ggplot2)
+library(purrr)
+library(tidyr)
+```
+
+Let’s load the datasets:
+
+```{r}
+# Read the two prepared CSV tables (country level first, then commune level)
+# from the `datasets` folder located one level above this document.
+country_level_data <- read.csv(
+  file = "../datasets/house_prices_country_level_data.csv"
+)
+
+commune_level_data <- read.csv(
+  file = "../datasets/house_prices_commune_level_data.csv"
+)
+```
+
+Let’s compute the Laspeyres index for each commune:
+
+```{r}
+# Add base-year prices and the price index (base year = 100) to a table.
+#
+# Arguments:
+#   dataset   - data frame with the columns `year`,
+#               `average_price_nominal_euros` and
+#               `average_price_m2_nominal_euros`, and optionally `locality`.
+#   base_year - year whose prices serve as the reference (default 2010).
+#
+# Returns `dataset`, ungrouped, with four added columns: `p0` and `p0_m2`
+# (base-year prices) and `pl` and `pl_m2` (price / base-year price * 100).
+# A group without a `base_year` row gets NA in these columns.
+get_laspeyeres <- function(dataset, base_year = 2010){
+
+  # Decide on grouping from the data itself: when a `locality` column exists,
+  # the index is computed per locality. Inspecting the name of the variable
+  # passed by the caller (the previous approach) silently produced a wrong,
+  # ungrouped index for commune data stored under any other name.
+  grouped <- if ("locality" %in% names(dataset)) {
+    group_by(dataset, locality)
+  } else {
+    ungroup(dataset)
+  }
+
+  grouped %>%
+    # Look the base-year price up directly inside each group, so the result
+    # does not depend on the order of the rows. `which()` ignores NA years and
+    # `[1]` yields NA when the group has no row for `base_year`.
+    mutate(p0 = average_price_nominal_euros[which(year == base_year)][1],
+           p0_m2 = average_price_m2_nominal_euros[which(year == base_year)][1]) %>%
+    ungroup() %>%
+    mutate(pl = average_price_nominal_euros/p0*100,
+           pl_m2 = average_price_m2_nominal_euros/p0_m2*100)
+
+}
+# Overwrite the commune table with the version returned by get_laspeyeres(),
+# which carries the added columns p0, p0_m2, pl and pl_m2.
+commune_level_data <- get_laspeyeres(commune_level_data)
+
+```
+
+Let’s also compute it for the whole country:
+
+```{r}
+# Same computation for the national table; the result replaces the original.
+country_level_data <- get_laspeyeres(country_level_data)
+
+```
+
+We are going to create a plot for 5 communes and compare the price evolution in the communes
+to the national price evolution. Let’s first list the communes:
+
+```{r}
+# The five localities that each get their own plot further down.
+communes <- c(
+  "Luxembourg", "Esch-sur-Alzette", "Mamer", "Schengen", "Wincrange"
+)
+
+```
+
+```{r}
+# Draw the per-m2 price index (`pl_m2`) over time for one commune together
+# with the national series.
+#
+# `commune` is compared against the `locality` column of the global
+# `commune_level_data`; the national series comes from the global
+# `country_level_data`. Returns a ggplot object with one line per locality.
+make_plot <- function(commune){
+
+  selected_rows <- filter(commune_level_data, locality == commune)
+
+  # National rows first, then the rows of the requested commune.
+  combined <- bind_rows(country_level_data, selected_rows)
+
+  line_mapping <- aes(y = pl_m2,
+                      x = year,
+                      group = locality,
+                      colour = locality)
+
+  ggplot(combined) +
+    geom_line(line_mapping)
+}
+
+```
+
+```{r, results = "asis"}
+# Knit one small child document per commune: a level-2 heading naming the
+# commune, followed by a hidden-code chunk that prints its plot.
+res <- lapply(communes, function(x){
+
+  knitr::knit_child(text = c(
+
+    '\n',
+    '## Plot for commune: `r x`',
+    '\n',
+    '```{r, echo = FALSE}',
+    'print(make_plot(x))',
+    '```'
+
+     ),
+     # Evaluate the child inside this function's environment so that `x`
+     # is visible to both the inline code and the chunk above.
+     envir = environment(),
+     quiet = TRUE)
+
+})
+
+# Emit the knitted pieces as-is; the enclosing chunk is declared with
+# results = "asis".
+cat(unlist(res), sep = "\n")
+
+```
+
diff --git a/Reproducible_pipelines_book_club/Meeting_05/save_data.Rmd b/Reproducible_pipelines_book_club/Meeting_05/save_data.Rmd
@@ -0,0 +1,228 @@
+---
+title: "Nominal house prices data in Luxembourg - Data cleaning"
+author: "Bruno Rodrigues"
+date: "`r Sys.Date()`"
+---
+
+```{r, warning=FALSE, message=FALSE}
+library(dplyr)
+library(ggplot2)
+library(janitor)
+library(purrr)
+library(readxl)
+library(rvest)
+library(stringr)
+```
+
+## Downloading the data
+
+This data is downloaded from the Luxembourgish [Open Data
+Portal](https://data.public.lu/fr/datasets/prix-annonces-des-logements-par-commune/)
+(the data set called *Série rétrospective des prix annoncés des maisons par commune, de 2010 à 2021*), and the original data is from the "Observatoire de l'habitat". This data
+contains prices for houses sold since 2010 for each Luxembourgish commune.
+
+The function below uses the permanent URL from the Open Data Portal to access the data,
+but I have also rehosted the data, and use my link to download the data (for archival
+purposes):
+
+```{r}
+# Download the house price workbook and stack all of its sheets into one
+# table.
+#
+# Arguments:
+#   url - location of the .xlsx workbook to download.
+#
+# Returns a data frame with the columns `year` (the name of the sheet a row
+# was read from), `locality`, `n_offers` and the columns whose names start
+# with "average".
+get_raw_data <- function(url = "https://data.public.lu/fr/datasets/r/14b0156e-ff87-4a36-a867-933fc9a6f903"){
+
+  xlsx_path <- tempfile(fileext = ".xlsx")
+  # Delete the downloaded copy when the function exits, even on error.
+  on.exit(unlink(xlsx_path), add = TRUE)
+
+  download.file(url,
+                xlsx_path,
+                mode = "wb") # for compatibility with Windows
+
+  sheets <- excel_sheets(xlsx_path)
+
+  # Read a single sheet and record the sheet name in a `year` column.
+  read_clean <- function(..., sheet){
+    read_excel(..., sheet = sheet) %>%
+      mutate(year = sheet)
+  }
+
+  # The first 10 rows of every sheet are skipped before reading the table.
+  raw_data <- map_dfr(sheets,
+                      ~read_clean(xlsx_path,
+                                  skip = 10,
+                                  sheet = .)) %>%
+    clean_names()
+
+  raw_data %>%
+    rename(locality = commune,
+           n_offers = nombre_doffres,
+           average_price_nominal_euros = prix_moyen_annonce_en_courant,
+           average_price_m2_nominal_euros = prix_moyen_annonce_au_m2_en_courant
+           ) %>%
+    mutate(locality = str_trim(locality)) %>%
+    select(year, locality, n_offers, starts_with("average"))
+
+}
+
+```
+
+```{r}
+# Download from the rehosted copy of the workbook rather than the default URL.
+raw_data <- get_raw_data(url = "https://github.com/b-rodrigues/rap4all/raw/master/datasets/vente-maison-2010-2021.xlsx")
+```
+
+We need to clean the data: "Luxembourg" is "Luxembourg-Ville" in 2010 and 2011,
+then "Luxembourg". "Pétange" is also spelled inconsistently, and we also need
+to convert columns to the right type. We also directly remove rows where the
+locality contains information on the "Source":
+
+```{r}
+# Harmonise locality spellings, drop the rows whose locality mentions
+# "Source" and convert every column starting with "average" to numeric.
+clean_raw_data <- function(raw_data){
+  # Both replacements are applied in order: the second one sees the result
+  # of the first.
+  renamed <- mutate(
+    raw_data,
+    locality = ifelse(grepl("Luxembourg-Ville", locality),
+                      "Luxembourg",
+                      locality),
+    locality = ifelse(grepl("P.tange", locality),
+                      "Pétange",
+                      locality)
+  )
+
+  without_source <- filter(renamed, !grepl("Source", locality))
+
+  mutate(without_source, across(starts_with("average"), as.numeric))
+}
+```
+
+```{r}
+# Cleaned table: harmonised locality names, no "Source" rows, numeric prices.
+flat_data <- clean_raw_data(raw_data)
+```
+
+We now need to make sure that we got all the communes/localities in there. There
+were mergers in 2011, 2015 and 2018. So we need to account for these localities.
+
+We’re now scraping a list of former Luxembourgish communes from Wikipedia:
+
+```{r}
+# Scrape a web page and return the rows of one of its HTML tables whose
+# `year_dissolved` is greater than `min_year`.
+#
+# Arguments:
+#   url            - page to read.
+#   min_year       - only rows with year_dissolved > min_year are kept.
+#   table_position - index of the table to pick among those on the page.
+get_former_communes <- function(
+            url = "https://is.gd/lux_former_communes",
+            min_year = 2009,
+            table_position = 3
+            ){
+
+  page_tables <- html_table(read_html(url))
+
+  former <- clean_names(pluck(page_tables, table_position))
+
+  filter(former, year_dissolved > min_year)
+}
+
+```
+
+```{r}
+# Scrape the former communes using the function's default URL, year and table.
+former_communes <- get_former_communes()
+```
+
+We can scrape current communes:
+
+```{r}
+# Scrape a web page and return one of its HTML tables with the commune names
+# in a `commune` column.
+#
+# Arguments:
+#   url            - page to read.
+#   table_position - index of the table to pick among those on the page.
+get_current_communes <- function(
+                 url = "https://is.gd/lux_communes",
+                 table_position = 2
+                 ){
+
+  page_tables <- html_table(read_html(url))
+
+  current <- clean_names(pluck(page_tables, table_position))
+
+  # Drop the rows whose `name_2` is just the text "Name".
+  current <- filter(current, name_2 != "Name")
+  current <- rename(current, commune = name_2)
+
+  # Strip a trailing " <one character>" suffix from the commune names.
+  mutate(current, commune = str_remove(commune, " .$"))
+
+}
+
+```
+
+```{r}
+# Scrape the current communes using the function's default URL and table.
+current_communes <- get_current_communes()
+```
+
+Let’s now create a list of all communes:
+
+```{r}
+# Build the reference list of commune names: the distinct values of
+# `former_communes$name` and `current_communes$commune`, with a few names
+# respelled so that they match the spelling used in the price data.
+# Returns a character vector.
+get_test_communes <- function(former_communes, current_communes){
+
+  communes <- unique(c(former_communes$name, current_communes$commune))
+
+  # Spelling on Wikipedia (first vector) and in the data (second vector),
+  # paired by position.
+  wikipedia_spelling <- c("Clemency",
+                          "Redange",
+                          "Erpeldange-sur-Sûre",
+                          "Luxembourg City",
+                          "Käerjeng",
+                          "Petange")
+  data_spelling <- c("Clémency",
+                     "Redange-sur-Attert",
+                     "Erpeldange",
+                     "Luxembourg",
+                     "Kaerjeng",
+                     "Pétange")
+
+  for (i in seq_along(wikipedia_spelling)) {
+    communes[which(communes == wikipedia_spelling[i])] <- data_spelling[i]
+  }
+
+  communes
+}
+
+```
+
+```{r}
+# `former_communes` and `current_communes` were already scraped in the chunks
+# above, so they are reused here instead of querying both web pages again.
+communes <- get_test_communes(former_communes, current_communes)
+```
+
+Let’s test to see if all the communes from our dataset are represented.
+
+```{r}
+# Localities that appear in the data but not in the scraped list of communes.
+setdiff(flat_data$locality, communes)
+```
+
+If the above code doesn’t show any communes, then this means that we are
+accounting for every commune.
+
+Let’s keep the national average in another dataset:
+
+```{r}
+# Build the country-level table from the cleaned data.
+#
+# Prices come from the rows whose locality contains "nationale"; the number
+# of offers comes from the rows whose locality matches "Total d.offres".
+# Both parts are joined by `year`.
+#
+# Returns a table with `year`, `locality` (set to
+# "Grand-Duchy of Luxembourg" on every row), `n_offers` and the remaining
+# columns.
+make_country_level_data <- function(flat_data){
+  country_level <- flat_data %>%
+    filter(grepl("nationale", locality)) %>%
+    select(-n_offers)
+
+  offers_country <- flat_data %>%
+    filter(grepl("Total d.offres", locality)) %>%
+    select(year, n_offers)
+
+  # `year` is the only column both tables share. Naming it makes the join key
+  # explicit instead of relying on dplyr guessing it (and printing a message).
+  full_join(country_level, offers_country, by = "year") %>%
+    select(year, locality, n_offers, everything()) %>%
+    mutate(locality = "Grand-Duchy of Luxembourg")
+
+}
+
+```
+
+```{r}
+# National prices and number of offers, one table for the whole country.
+country_level_data <- make_country_level_data(flat_data)
+```
+
+We can finish cleaning the commune data:
+
+```{r}
+# Keep the commune rows only: drop rows whose locality matches
+# "nationale" or "offres" as well as rows with a missing locality.
+make_commune_level_data <- function(flat_data){
+  filter(
+    flat_data,
+    !(grepl("nationale|offres", locality) | is.na(locality))
+  )
+}
+
+```
+
+```{r}
+# Commune rows only: no national summary rows and no missing locality.
+commune_level_data <- make_commune_level_data(flat_data)
+```
+
+We now save the datasets in a folder for further analysis (keep the chunk option
+set to `eval = FALSE` to avoid running this chunk when knitting):
+
+```{r, eval = FALSE}
+# Export both tables as CSV files, without row names, into `datasets/`.
+write.csv(
+  x = commune_level_data,
+  file = "datasets/house_prices_commune_level_data.csv",
+  row.names = FALSE
+)
+write.csv(
+  x = country_level_data,
+  file = "datasets/house_prices_country_level_data.csv",
+  row.names = FALSE
+)
+```