-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_2020_WSHFC_data.R
228 lines (202 loc) · 7.98 KB
/
clean_2020_WSHFC_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#################################################################################
# Title: 2020 IRHD, Cleaning WSHFC data
# Author: Eric Clute
# Date created: 2023-04-06
# Last Updated: 2023-12-07
#################################################################################
## load packages-----------------------------------------------------------------
library(tidyverse)
library(readxl)
library(janitor)
## 1) load data ---------------------------------------------------------------------
J_drive_raw_files_filepath <- "J:/Projects/IncomeRestrictedHsgDB/2020 vintage/WSHFC/Raw Data/"
original_WSHFC_raw <- read_xlsx(paste0(J_drive_raw_files_filepath, "PSRC WBARS Report_12-31-2020.xlsx"))
## 2) create functions --------------------------------------------------------------------
#create function to select and arrange columns needed for joining
select_and_arrange_columns_function <- function(df){
df <- df %>%
select(any_of(c("DataSource",
"ProjectID",
"ProjectName",
"PropertyID",
"PropertyName",
"Owner",
"Manager",
"InServiceDate",
"ExpirationDate",
"Address",
"City",
"ZIP",
"County",
"TotalUnits",
"TotalRestrictedUnits",
"AMI20",
"AMI25",
"AMI30",
"AMI35",
"AMI40",
"AMI45",
"AMI50",
"AMI60",
"AMI65",
"AMI70",
"AMI75",
"AMI80",
"AMI85",
"AMI90",
"AMI100",
"MarketRate",
"ManagerUnit",
"Bedroom_0",
"Bedroom_1",
"Bedroom_2",
"Bedroom_3",
"Bedroom_4",
"Bedroom_5",
"Bedroom_Unknown",
"BedCount",
"Site_Type",
"HOME",
"Confidentiality",
"ContactName",
"ProjectSponsor",
"Policy",
"Senior",
"Disabled",
"Farmworker",
"Homeless",
"Large Household (+4 pp)",
"Transitional",
"Veterans",
"FundingSources",
"Tenure")))
}
## 3) clean WSHFC data --------------------------------------------------------------------
# ------- DATA FILTER #1 ------- filter by county, create/modify fields
WSHFC_cleaned <- original_WSHFC_raw %>%
filter(County == "Snohomish" | County == "Pierce" | County == "Kitsap")
#create grouped funder column
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
mutate(Funder = paste(sort(unique(Funder)), collapse = ","))
# ------- DATA FILTER #2 ------- select entry with the largest total restricted unit count
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_max(`Income & Rent Restricted Units`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
unique() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address) %>%
view()
# ------- DATA FILTER #3 ------- select only entry with latest expiration date
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_max(`Project Expiration Date`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
unique() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address) %>%
view()
# ------- DATA FILTER #4 ------- select only entry with earliest in service date
WSHFC_cleaned <- WSHFC_cleaned %>%
group_by(`Site Name`, Address) %>%
slice_min(`First Credit Year or C of O's`,
n = 1,
with_ties = TRUE) %>%
distinct()
#check for duplicates
WSHFC_cleaned %>%
distinct() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address) %>%
view()
# ------- DATA FILTER #4 ------- for entries where there are multiple properties with the same total restricted unit count but different other data, select record that seems correct
WSHFC_cleaned <- WSHFC_cleaned %>%
distinct() %>%
filter(!(`Project Name` == "Annobee Apartments, The" & `Site Name` == "Annobee Apartments, The" & `80%` == 43)) %>% #remove this record, keep record with pop served & deeper affordability
filter(!(`Project Name` == "Catalina Apartments" & `Site Name` == "Catalina Apartments" & `40%` == 32)) %>% #remove this record, keep record with pop served & deeper affordability
filter(!(`Project Name` == "Maternity Shelter (Youth Emergency Shelter (YES) North)" & `Site Name` == "Youth Emergency Shelter (YES) North" & `50%` == 8)) #remove this record, keep record with deeper affordability
#check to see if any duplicates remaining - should be 0
WSHFC_cleaned %>%
distinct() %>%
group_by(`Site Name`, Address) %>%
mutate(n = n()) %>%
filter(n > 1) %>%
arrange(`Project Name`, `Site Name`, Address) %>%
view()
# ------- DATA FILTER #5 ------- Remove any records with an InServiceDate of 2021, this update is through 2020
WSHFC_cleaned <- WSHFC_cleaned %>%
filter(`First Credit Year or C of O's` < "2021")
WSHFC_cleaned <- rename(WSHFC_cleaned, `20%` = `0.2`)
#rename columns and add empty columns for data we dont have
WSHFC_cleaned <- WSHFC_cleaned %>%
mutate(DataSource = as.character(NA),
AMI25 = as.numeric(NA),
AMI75 = as.numeric(NA),
AMI85 = as.numeric(NA),
AMI90 = as.numeric(NA),
AMI100 = as.numeric(NA),
MarketRate = as.numeric(NA),
ManagerUnit = as.numeric(NA),
Confidentiality = as.character(NA),
Policy = as.character(NA),
Tenure = as.character(NA)) %>%
rename(ProjectID = `ProjectKey`,
ProjectName = `Project Name`,
PropertyID = `SiteKey`,
PropertyName = `Site Name`,
Owner = `Contractor/Owner Org`,
Manager = `Property Management Org`,
City = `City`,
TotalUnits = `Total Project Units`,
TotalRestrictedUnits = `Income & Rent Restricted Units`,
InServiceDate = `First Credit Year or C of O's`,
AMI20 = `20%`,
AMI30 = `30%`,
AMI35 = `35%`,
AMI40 = `40%`,
AMI45 = `45%`,
AMI50 = `50%`,
AMI60 = `60%`,
AMI65 = `65%`,
AMI70 = `70%`,
AMI80 = `80%`,
Bedroom_0 = `STUDIO`,
Bedroom_1 = `1 BR`,
Bedroom_2 = `2 BR`,
Bedroom_3 = `3 BR`,
Bedroom_4 = `4 BR`,
Bedroom_5 = `5 BR`,
Bedroom_Unknown = `Unknown`,
BedCount = `GROUP HOME/BED`,
HOME = `Number of HOME Units`,
FundingSources = `Funder`,
ExpirationDate = `Project Expiration Date`,
LargeHousehold4plus = `Large Household (4+ pp)`,
Site_Type = `Site Type`,
Senior = `Elderly`,
Disabled = `Persons with Disabilities`,
ZIP = `Zip`)
#select only necessary columns and arrange columns
WSHFC_cleaned <- select_and_arrange_columns_function(WSHFC_cleaned)
#set DataSource field
WSHFC_cleaned$DataSource = "WSHFC"
## 4) save files --------------------------------------------------------------------
#save J: drive cleaned files location filepath
J_drive_cleaned_files_filepath <- "J:/Projects/IncomeRestrictedHsgDB/2020 vintage/WSHFC/Cleaned Data/"
#save cleaned files
#write_csv(WSHFC_cleaned, paste0(J_drive_cleaned_files_filepath, "WSHFC_2020_cleaned.csv"))