# CCCM Site Monitoring Tool - Data Cleaning script
# REACH Yemen - [email protected]
# V7
# 27/06/2021
rm(list=ls())
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
today <- Sys.Date()
## Install/Load libraries
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, data.table, openxlsx, reshape2, sf, leaflet, readxl)
# p_load_gh("mabafaba/cleaninginspectoR","agualtieri/cleaninginspectoR","agualtieri/dataqualitycontrol", "impact-initiatives-research/clog")
## Source
source("./R/cleanHead.R")
source("./R/add_locations.R")
source("./R/moveme.R")
source("./R/utils.R")
## Parameters to be updated each month
## Tool version - uncomment the relevant tool version and comment the other one
tool.version <- "V1" # V1 or V2 update according to the kobo tool used
#tool.version <- "V2"
## Update the directory for all files each month with the latest version. Take care not to mix up V1 and V2!
rawdata.filename.v1 <- "data/CCCM_Site_Reporting_Raw Data_Master_tbc_V1_Round3-5_incl FMF Round2_2021.xlsx" # Update the name of the rawdata filename
rawdata.filename.v2 <- "data/CCCM_Site_Reporting_Raw Data_Master_tbc_V2_Round 3-5.xlsx"
tool.filename.v1 <- "data/kobo/CCCM_Site_Reporting_Kobo_tool_V1_23_05_2021_FINAL_all ngo names.xlsx"
tool.filename.v2 <- "data/kobo/CCCM_Site_Reporting_Kobo_tool_V2_23_05_2021_FINAL_all ngo names.xlsx"
# sitemasterlist.filename <- "data/CCCM_IDP Hosting Site List_coordinates verification exercise_March2021.xlsx"
sitemasterlist.filename <- "data/CCCM IDP Sites_making NEW site names and IDs_Aug 2021_22092021.xlsx"
filename.pcodes <- "R/pcodes/yem_admin_ochayemen_20191002.xlsx"
## Tool columns comparison
# responsev1 <- read.xlsx(rawdata.filename.v1)
# responsev2 <- read.xlsx(rawdata.filename.v2)
# colv1 <- colnames(responsev1)
# colv2 <- colnames(responsev2)
# col.all <- unique(colv1, colv2)
# common.col <- intersect(colv1, colv2)
# col.v1.only <- setdiff(colv1, colv2)
# col.v2.only <- setdiff(colv2, colv1)
## Load survey questions from the kobo tool
tool <- if (tool.version == "V1") {read.xlsx(tool.filename.v1, sheet = "survey")} else if (tool.version == "V2") {read.xlsx(tool.filename.v2, sheet = "survey")} else {stop("Invalid tool version entered, should be either V1 or V2")}
## Load survey choices from the kobo tool (used to rename the site name and then merge the other columns)
choices <- if (tool.version == "V1") {read.xlsx(tool.filename.v1, sheet = "choices")} else if (tool.version == "V2") {read.xlsx(tool.filename.v2, sheet = "choices")} else {stop("Invalid tool version entered, should be either V1 or V2")}
external_choices <- if (tool.version == "V1") {read.xlsx(tool.filename.v1, sheet = "external_choices")} else if (tool.version == "V2") {read.xlsx(tool.filename.v2, sheet = "external_choices")} else {stop("Invalid tool version entered, should be either V1 or V2")}
external_choices_site <- external_choices %>%
filter(list_name == "sitename") %>% dplyr::rename(a4_site_name = name)
choices_all <- bind_rows(choices, external_choices) %>% # For cleaning log functions
rbind(c("sitename", "other", "Other", "أخرى", NA, NA)) # Add a generic "Other" choice to the sitename list
survey_choices <- get.select.db()
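## get.select.db() comes from R/utils.R; it is assumed to return a lookup of the
## select questions with their valid choices, roughly along these lines (a sketch
## of the assumed behaviour, not the actual helper):
# survey_choices <- tool %>%
#   filter(grepl("^select_", type)) %>%
#   mutate(list_name = gsub("^select_(one|multiple)(_external)?\\s+", "", type)) %>%
#   left_join(choices_all %>%
#               group_by(list_name) %>%
#               summarise(choices = paste(name, collapse = " "), .groups = "drop"),
#             by = "list_name")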
## Load site_name masterlist to match admin names with site_name
masterlist <- read.xlsx(sitemasterlist.filename) %>%
  setNames(gsub("\\.", "_", colnames(.))) %>%
  setnames(old = c("SITE_ID", "Site_Name", "Site_Name_In_Arabic"), # Update with whatever column names the CCCM cluster used in the latest masterlist
           new = c("Site_ID", "Site_Name", "Site_Name_In_Arabic"), # Standardise colnames so the rest of the script stays unchanged
           skip_absent = TRUE) %>%
  mutate_at(vars(-matches("_Households|_Population|_IDPs")), as.character) %>% # cast all names and IDs to character to enable matching
  mutate_at(vars(matches("Newly_|_Households|Population|#_of_total_Households")), ~as.numeric(unlist(lapply(gsub(",", "", .), arabic.tonumber)))) # strip thousands separators and convert Arabic-Indic digits before casting to numeric
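## arabic.tonumber() also comes from R/utils.R; the assumed behaviour is a simple
## Arabic-Indic to Western digit conversion, e.g. (a sketch only):
# arabic.tonumber <- function(x) chartr("٠١٢٣٤٥٦٧٨٩", "0123456789", x)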
## Load data to be cleaned
response <- if (tool.version == "V1") {read.xlsx(rawdata.filename.v1)} else if (tool.version == "V2") {read.xlsx(rawdata.filename.v2)} else {stop("Invalid tool version entered, should be either V1 or V2")}
response <- response %>%
# mutate(a4_site_name = a4_other_site, id = 1:n(), .before = "deviceid") %>%
dplyr::rename(index = '_index', uuid = '_uuid')
## code to check for duplicated colnames
# colnames(response)[4]
# colnames(response)[2:8] ## displays colnames of response for elements 2 to 8
# colnames(response)[grepl("site", colnames(response))] ## displays all columns whose name contains the pattern "site"
# colnames(response)[duplicated(colnames(response))] ## to access duplicated columns
## Load updated cleaning log files from partners
files.cleaning.log <- paste0("output/cleaning log/parnter cleaning logs updated/", list.files("output/cleaning log/parnter cleaning logs updated"))
cleaning.log <- data.frame()
for (file in files.cleaning.log){
cleaning.log <- bind_rows(cleaning.log, read.xlsx(file) %>% mutate_all(as.character))
}
# ### How to debug a failure in this loop
# ## 1. print `file` to see which element of the loop it broke on
# file
# ## 2. load that cleaning log on its own
# test <- read.xlsx(file)
# ## 3. check for duplicated column names
# colnames(test)[duplicated(colnames(test))]
## Load internal cleaning logs [sitename/organisation name + simple gps changes]
file.cleaning.log.internal <- "output/cleaning log/internal cleaning logs/"
files.internal.cleaning.log <- paste0(file.cleaning.log.internal, list.files(file.cleaning.log.internal))
cleaning.log.int <- data.frame()
for (file in files.internal.cleaning.log){
cleaning.log.int <- bind_rows(cleaning.log.int, read.xlsx(file) %>% mutate_all(as.character))
}
## Binding internal and external cleaning logs together
cleaning.log <- cleaning.log %>% bind_rows(cleaning.log.int)
## Check for duplicated rows (in case the same cleaning log appears twice in the folder)
if ((cleaning.log %>% group_by(uuid, variable, check_id) %>% filter(n()>1) %>% nrow) > 0) {print("There are duplicate entries (or multiple entries for the same variable) in the cleaning log. Make sure this is expected and that no conflicting new_values are assigned.")}
duplicate.cl <- cleaning.log %>% group_by(uuid, variable, check_id) %>% filter(n()>1) %>% ungroup %>% arrange(uuid, variable)
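## If the duplicates turn out to be identical entries, one possible de-duplication
## (a sketch; only apply after reviewing duplicate.cl) is to keep the last entry
## per uuid/variable/check_id:
# cleaning.log <- cleaning.log %>%
#   group_by(uuid, variable, check_id) %>% slice_tail(n = 1) %>% ungroup()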
### Apply cleaning log changes to raw response file
clean_data <- response
for (r in seq_len(nrow(cleaning.log))){
col <- as.character(cleaning.log[r, "variable"]) # Get the name of the variable to be updated in response dataset
new_value <- as.character(cleaning.log[r, "new_value"]) # Get the new value for this cleaning log entry
if ((col %in% colnames(clean_data)) & (cleaning.log[r, "change"] %in% c("TRUE"))) {
clean_data[clean_data$uuid==as.character(cleaning.log[r,"uuid"]), col] <- new_value # Assign new value to the variable in the dataframe response
}
}
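## Optional sanity check (a sketch): count how many cleaning log entries could
## actually be matched against the dataset, to catch typos in uuids or variable names.
# n.matched <- sum(cleaning.log$change %in% "TRUE" &
#                    cleaning.log$variable %in% colnames(clean_data) &
#                    cleaning.log$uuid %in% clean_data$uuid)
# print(paste0(n.matched, " of ", nrow(cleaning.log), " cleaning log entries matched the dataset."))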
### Recode 'other' responses as NA when relevant [sitename + organization 'other' cleaning log entries have been kept out to keep the cleaning log concise]
for (r in seq_len(nrow(cleaning.log))){
col <- cleaning.log[r, "variable"]
new_value <- cleaning.log[r, "new_value"]
if (col %in% c("a4_site_name")){var.other <- "a4_other_site"} else {var.other <- paste0(col, "_other")}
if ((var.other %in% colnames(clean_data)) & (cleaning.log[r, "change"] %in% c("TRUE")) & !is.na(new_value)) {
clean_data[clean_data$uuid==cleaning.log[r,"uuid"], var.other] <- NA # Set the 'other' entry to NA when it has been recoded (i.e. new_value is not NA and the change has been applied)
}
}
## Check if new site IDs need to be created
new.sites <- cleaning.log %>% filter(variable == "a4_site_name") %>% filter(new_site %in% c("TRUE", "T", TRUE, T))
if ((a <- nrow(new.sites)) > 0) {print(paste0(a, " new site IDs must be generated."))}
# ## Creation of new site IDs // COMMENT THE WHOLE SECTION IF NO NEED TO GENERATE SITE IDs.
# site.masterlist <- masterlist$Site_ID %>% unique # Make sure all sites are on one sheet or consolidate on one sheet in masterlist
#
# new.sites <- cleaning.log %>%
# filter(variable == "a4_site_name") %>%
# filter(new_site %in% c("TRUE", "T", TRUE, T)) %>%
# filter(!(new_value %in% unique(c(site.masterlist)))) %>% # Keep only site IDs entered that are not in the masterlist or kobo tool
# left_join(clean_data %>% select(uuid, a2_district, a3_sub_district), by = "uuid") %>%
# relocate(a4_other_site, a4_site_name_new_en, a4_site_name_new_ar, a2_district, a3_sub_district, new_site, .before = "old_value")
#
# ## Check that cleaning log for new sites has been filled correctly
# check_new_names_filled <- new.sites %>% summarise_at(vars(matches("a4_site_name_new")), ~sum(is.na(.)))
#
# if (sum(check_new_names_filled > 0)>0){
# print("The new names in English and Arabic are not specified for all new camps! Please update the file below.")
# new.sites %>% mutate(issue="Check the new names in English and Arabic and fill them all. Update new_site with FALSE if some sites don't need new IDs.") %>% write.xlsx(paste0("output/cleaning log/site name/new_sites_corrected_", today, ".xlsx"))
# browseURL(paste0("output/cleaning log/site name/new_sites_corrected_", today, ".xlsx"))
# }
# ## If relevant, check the names and update them accordingly in the excel file and then update the filename below with _updated at the end.
# file.name.new.sites <- "output/cleaning log/site name/new_sites_corrected_2021-06-30_updated.xlsx"
# if (sum(check_new_names_filled > 0)>0){new.sites <- read.xlsx(file.name.new.sites)}
#
# ## Inspect the newly attributed names in English and Arabic
# check_names <- new.sites %>% select(matches("a4_site_name_new")) %>% view
#
# ## Create new sitename code for remaining sites
# max.id.split <- masterlist$Site_ID %>% sub(".*?_", "",.) %>% as.numeric %>% max(na.rm = T) # Extract the highest number in site id in masterlist
# latest_id <- masterlist$Site_ID[grepl(paste0("_", max.id.split), masterlist$Site_ID)] # Extract the latest full ID
# id.seq <- (max.id.split+1):(max.id.split+500) # Create the list of candidate numbers starting after the latest one (renamed to avoid masking base::seq)
#
# new.sites <- new.sites %>%
#   mutate(a4_site_name_new = paste0(a2_district, "_", id.seq[1:nrow(.)])) %>% # create the new site ID in column a4_site_name_new of the cleaning log
# relocate(a4_site_name_new, a4_site_name_new_en, a4_site_name_new_ar, .after = "issue")
# new.sites.uuid <- new.sites$uuid
#
# ## Update the masterlist with new site entries + IDs and available information
# new.sites <- new.sites %>%
# left_join(response %>% select(uuid, a7_site_population_hh, a7_site_population_individual, a5_2_gps_latitude, a5_1_gps_longitude), by="uuid") %>%
# mutate(Partner_Name = agency, Sub_Dist_ID=a3_sub_district, Site_ID = a4_site_name_new, Site_Name=a4_site_name_new_en,
# Site_Name_In_Arabic=a4_site_name_new_ar, Latitude=a5_2_gps_latitude, Longitude=a5_1_gps_longitude, "#_of_total_Households"=as.numeric(a7_site_population_hh),
# Total_Site_Population=as.numeric(a7_site_population_individual)) %>% select(intersect(colnames(.), colnames(masterlist)))
#
# ## Adding the new sites in the existing site masterlist
# masterlist_updated <- masterlist %>% bind_rows(new.sites)
# dir.create("output/masterlist", showWarnings = F)
# masterlist_updated %>% select(-any_of("Site_Name_In_Arabic_tidy")) %>% write.xlsx(paste0("output/masterlist/masterlist_", today,".xlsx"))
## Add Pcodes
admin3 <- read.xlsx(filename.pcodes, sheet = "admin3") %>%
dplyr::rename(a3_sub_district_name=admin3RefName_en, a3_sub_district=admin3Pcode,
a2_district_name=admin2Name_en, a2_district=admin2Pcode,
a1_governorate_name=admin1Name_en, a1_governorate=admin1Pcode)
clean_data_loc <- clean_data %>%
select(-any_of(c("a1_governorate", "a2_district"))) %>%
left_join(admin3, by=c("a3_sub_district"="a3_sub_district")) %>%
select(uuid, start, end, today, matches("a1_|a2_"), matches("a3_"), everything())
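## Optional check (a sketch): sub-district pcodes that did not match the OCHA admin3
## reference will be left with NA admin names after the join; list them for review.
# unmatched.pcodes <- clean_data_loc %>% filter(is.na(a1_governorate_name)) %>%
#   distinct(a3_sub_district)
# if (nrow(unmatched.pcodes) > 0) print(unmatched.pcodes)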
## Remove PII and other unnecessary columns
pii <- clean_data_loc %>% select(uuid, c("deviceid", "subscriberid", "imei","calca11", contains("phone")))
clean_data <- clean_data_loc %>% # Start from the pcoded dataset so the admin names added above are kept
  select(-c("deviceid", "subscriberid", "imei", "calca11", contains("phone"))) %>%
  setnames(old = c("a4_site_name", "a1_governorate", "a2_district", "a3_sub_district"),
           new = c("a4_site_code", "a1_governorate_code", "a2_district_code", "a3_sub_district_code"),
           skip_absent = T) # Rename site columns to match the DB file
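## The PII columns extracted above are not written out by this script. If they need
## to be archived, a possible export to a restricted folder (a sketch; the path is
## an assumption):
# dir.create("output/PII", showWarnings = FALSE)
# write.xlsx(pii, paste0("output/PII/CCCM_SiteReporting_", tool.version, "_PII_", today, ".xlsx"))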
### Save Internal and External version of the files
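## Make sure the output folders exist before writing (mirrors the dir.create()
## pattern used for the masterlist above; paths taken from the write.xlsx calls below)
dir.create("output/internal", showWarnings = FALSE, recursive = TRUE)
dir.create("output/external", showWarnings = FALSE, recursive = TRUE)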
## INTERNAL
final_dataset_internal <- clean_data %>% select(-any_of(c("start", "end", "q0_1_enumerator_name", "q0_2_gender", "q1_1_key_informant_name", "q1_2_key_informat_gender", "a5_gps_coordinates",
"q0_3_organization_other", "a4_other_site", "q0_4_date", "b2_exu_fp_name", "b8_community_committee_fp_name", "b9_community_committee_fp_cell",
contains("number"))))
write.xlsx(final_dataset_internal, paste0("./output/internal/CCCM_SiteReporting_", tool.version," Internal_",today,".xlsx"))
## EXTERNAL
final_dataset_external <- clean_data %>% select(-any_of(c("start", "end", "q0_1_enumerator_name", "q0_2_gender", "q1_1_key_informant_name", "q1_2_key_informat_gender", "a5_gps_coordinates",
"q0_3_organization_other", "a4_other_site", "a5_1_gps_longitude", "a5_2_gps_latitude", "b2_exu_fp_name", "b8_community_committee_fp_name",
"b9_community_committee_fp_cell", "q0_3_organization", "q0_4_date", "b7_community_committee_in_place", contains("number"))))
write.xlsx(final_dataset_external, paste0("./output/external/CCCM_SiteReporting_", tool.version," External_",today,".xlsx"))