-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_harmonize_data_sources.R
275 lines (232 loc) · 9.37 KB
/
04_harmonize_data_sources.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
library(tidyverse)
library(stringr)
library(assertthat)
# Import LEMIS metadata
lemis_codes <- lemis::lemis_codes()
lemis_metadata <- lemis::lemis_metadata()
#==============================================================================
# Import and combine cleaned LEMIS data sources for downstream use
a <- read_csv("data/cleaned/amphibian_LEMIS_1999_and_2015.csv")
b <- read_csv("data/cleaned/amphibian_LEMIS_2000_to_2014.csv")
c <- read_csv("data/cleaned/amphibian_LEMIS_2016_to_2021.csv")
d <- bind_rows(a, b, c) %>%
# remove any extra whitespace from genus and species names
mutate(
genus = str_trim(genus, side = "both"),
species = str_trim(species, side = "both"),
subspecies = str_trim(subspecies, side = "both")
) %>%
arrange(shipment_date, control_number)
assert_that(nrow(d) == nrow(a) + nrow(b) + nrow(c))
#==============================================================================
# Add a column to the data frame giving the full country of origin name
d <- lemis_codes %>%
filter(field == "country") %>%
select(code, value) %>%
rename(country_origin_full = value) %>%
left_join(d, ., by = c("country_origin" = "code"))
# Give full names for new country codes that don't appear in "lemis_codes"
d <- d %>%
mutate(
country_origin_full = case_when(
country_origin == "BQ" ~ "Bonaire, Saint Eustatius, & Saba",
country_origin == "CW" ~ "Curacao",
country_origin == "MF" ~ "Saint Martin",
country_origin == "PR" ~ "Puerto Rico",
country_origin == "SS" ~ "South Sudan",
country_origin == "SX" ~ "Sint Maarten",
TRUE ~ country_origin_full
)
)
#==============================================================================
# Taxonomic harmonization
# Import AmphibiaWeb taxonomy
a.taxonomy <-
jsonlite::read_json("data/taxonomy/amphib_names.json", simplifyVector = T)
# Generate a table with taxonomic synonyms from the AmphibiaWeb data
synonymy.table <- a.taxonomy %>%
select(genus, species, synonymies) %>%
# separate synonyms out
separate(
synonymies,
into = c("synonym_1", "synonym_2", "synonym_3",
"synonym_4", "synonym_5", "synonym_6"),
sep = ", "
) %>%
# pivot to get a single synonym column
tidyr::pivot_longer(cols = starts_with("synonym"), values_to = "synonym") %>%
select(-name) %>%
# remove cases with no synonyms at all
filter(!is.na(synonym), synonym != "") %>%
# trim leading whitespace from synonym names
mutate(synonym = str_trim(synonym, side = "left")) %>%
# remove any examples of synonyms with subspecies names as these can lead to
# ambiguity
filter(str_count(synonym, " ") == 1)
# Which synonyms are used for more than one taxa and thus are ambiguous?
ambiguous.synonyms <- synonymy.table %>%
group_by(synonym) %>%
count() %>%
filter(n > 1) %>%
pull(synonym)
# Modify synonymy table to eliminate ambiguous synonyms
synonymy.table <- synonymy.table %>%
filter(!(synonym %in% ambiguous.synonyms)) %>%
separate(
synonym, into = c("genus_synonym", "species_synonym"), sep = " "
) %>%
rename(
genus_aw = genus,
species_aw = species
)
# Generate new generic and species names to synonymize with AmphibiaWeb
d <- d %>%
left_join(
.,
synonymy.table,
by = c("genus" = "genus_synonym", "species" = "species_synonym")
) %>%
mutate(
genus_aw = ifelse(is.na(genus_aw), genus, genus_aw),
species_aw = ifelse(is.na(species_aw), species, species_aw),
# remove "cf. " from the species column if it occurs
species_aw = str_replace_all(species_aw, "cf. ", ""),
# remove any instances of "logi logi"
species_aw = str_replace_all(species_aw, " logi logi", ""),
# match anything that reads "sp. nov*" and replace it with "sp."
species_aw = str_replace_all(species_aw, regex("sp\\. nov.*"), "sp."),
# match anything that reads "sp." and a number and replace it with "sp."
species_aw = str_replace_all(species_aw, regex("sp\\. [1-9].*"), "sp."),
# replace specific species codes with "sp."
species_aw = ifelse(
species_code == "NONA" & is.na(species), "sp.", species_aw),
species_aw = ifelse(
species_code == "BUF#" & is.na(species), "sp.", species_aw),
species_aw = ifelse(
species_code == "CRY#" & is.na(species), "sp.", species_aw)
)
# Which genera values in the data do not match AmphibiaWeb genera?
sort(unique(d$genus_aw)[which(!(unique(d$genus_aw) %in% unique(a.taxonomy$genus)))])
# Change genus names in cases where all species within a genera should be
# different
genera.to.change <- read_csv("data/taxonomy/genera_to_change.csv")
d <- d %>%
left_join(., genera.to.change, by = "genus") %>%
mutate(
genus_aw.x = ifelse(is.na(genus_aw.y), genus_aw.x, genus_aw.y)
) %>%
select(-genus_aw.y) %>%
rename(genus_aw = genus_aw.x)
# Which genera values in the data do not match AmphibiaWeb genera?
sort(unique(d$genus_aw)[which(!(unique(d$genus_aw) %in% unique(a.taxonomy$genus)))])
# Change species names in cases where specific species need changing
species.to.change <- read_csv("data/taxonomy/species_to_change.csv")
d <- d %>%
left_join(., species.to.change, by = c("genus", "species")) %>%
mutate(
genus_aw = ifelse(is.na(genus_aw.y), genus_aw.x, genus_aw.y),
species_aw = ifelse(is.na(species_aw.y), species_aw.x, species_aw.y)
) %>%
select(-genus_aw.x, -species_aw.x, -genus_aw.y, -species_aw.y)
# Now which genera values do not appear in AmphibiaWeb genera?
sort(unique(d$genus_aw)[which(!(unique(d$genus_aw) %in% unique(a.taxonomy$genus)))])
#==============================================================================
# Adding higher level taxonomy
# Add family-level information using AmphibiaWeb taxonomy
a.taxonomy.family <- a.taxonomy %>%
distinct(family, genus, species)
# Make sure "sp." species are captured in the family-level taxonomy table
a.taxonomy.family.full <- a.taxonomy.family %>%
bind_rows(
.,
distinct(a.taxonomy.family, family, genus) %>%
mutate(species = rep("sp.", nrow(.)))
) %>%
arrange(family, genus, species)
d <- left_join(d,
a.taxonomy.family.full,
by = c("genus_aw" = "genus", "species_aw" = "species")) %>%
# In cases where the LEMIS data assigned a family name for the genus
# field or a genus name could not be unambiguously assigned to the
# AmphibiaWeb taxonomy, manually assign the correct family
mutate(
family = case_when(
genus_aw == "Bufonidae" ~ "Bufonidae",
genus_aw == "Caeciliidae" ~ "Caeciliidae",
genus_aw == "Discoglossidae" ~ "Alytidae",
genus_aw == "Paa" ~ "Dicroglossidae",
TRUE ~ family
)
)
# Which genus/species combinations were we unable to assign a family?
# (should be only non-CITES entries and other ambiguous records)
filter(d, is.na(family)) %>%
distinct(genus_aw, species_aw)
# Add order-level information using AmphibiaWeb taxonomy
a.taxonomy.order <- a.taxonomy %>%
distinct(order, family)
d <- left_join(d,
a.taxonomy.order,
by = "family") %>%
# In cases where the LEMIS data assigned an ambiguous name for the genus,
# manually assign the correct order
mutate(order = case_when(
genus_aw == "Anura" ~ "Anura",
genus_aw == "Schoutedenella" ~ "Anura",
TRUE ~ order
)
)
#==============================================================================
# Add on information about species' Lacey Act designation
l <- read_csv("data/reference/Lacey_Act_species.csv")
colnames(l) <- tolower(colnames(l))
l <- l %>%
mutate(
genus = str_trim(genus, side = "both"),
species = str_trim(species, side = "both"),
scientific_name = paste(genus, species, sep = " ")
) %>%
# exclude particular species as needed
# AmphibiaWeb considers Hynobius yatsui to be synonymous with Hynobius
# stejnegeri, which is already in the dataset
filter(scientific_name != "Hynobius yatsui") %>%
# Triturus hongkongensis is a synonym for Paramesotriton hongkongensis,
# which is already in the dataset
filter(scientific_name != "Triturus hongkongensis") %>%
# Tylototriton daweishanensis is a synonym for Tylototriton yangi, which is
# already in the dataset
filter(scientific_name != "Tylototriton daweishanensis") %>%
mutate(
genus = ifelse(
genus == "Triturus" & species == "vittatus", "Ommatotriton", genus
),
scientific_name = paste(genus, species, sep = " "),
lacey_act = rep(1, nrow(.))
)
# Confirm all species are in the AmphibiaWeb taxonomy
assert_that(
sum(l$scientific_name %in%
paste(a.taxonomy$genus, a.taxonomy$species, sep = " ")) == nrow(l)
)
# Join the Lacey Act information in to the larger data frame
d <- left_join(
d, select(l, genus, species, lacey_act),
by = c("genus_aw" = "genus", "species_aw" = "species")
) %>%
mutate(lacey_act = ifelse(is.na(lacey_act), 0, lacey_act))
#==============================================================================
# Rearrange columns
d <- d %>%
select(
control_number, species_code, taxa, class, order, family,
genus, genus_aw, species, species_aw, subspecies,
specific_name, generic_name, lacey_act,
description, quantity, unit, value,
country_origin, country_origin_full, country_imp_exp,
purpose, source, action, disposition,
disposition_date, disposition_year, shipment_date, shipment_year,
import_export, port, us_co, foreign_co,
cleaning_notes
)
# Save the final dataset
write_csv(d, "data/cleaned/harmonized_amphibian_LEMIS_1999_to_2021.csv")