## ------------------------------------------ ##
# SPARC Soil P -- Stats/Viz Prep
## ------------------------------------------ ##
# Script author(s): Nick J Lyon
# Purpose:
## Take the complete data (i.e., with P sums, ancillary data, and spatial info)
## and create the subset(s) relevant to our hypotheses
## ------------------------------------------ ##
# Housekeeping -----
## ------------------------------------------ ##
# Load necessary libraries
# install.packages("librarian")
librarian::shelf(tidyverse, googledrive, supportR)
# Create necessary sub-folder(s)
dir.create(path = file.path("data", "tidy_data"), showWarnings = FALSE)
dir.create(path = file.path("data", "stats_ready"), showWarnings = FALSE)
# Clear environment
rm(list = ls())
# Identify needed tidy file(s)
tidy_drive <- googledrive::as_id("https://drive.google.com/drive/u/0/folders/1pjgN-wRlec65NDLBvryibifyx6k9Iqy9")
# Identify the archival data in that folder and download it
googledrive::drive_ls(path = tidy_drive) %>%
  dplyr::filter(name == "sparc-soil-p_full-plus-ancil-and-spatial.csv") %>%
  googledrive::drive_download(file = .$id, overwrite = TRUE,
                              path = file.path("data", "tidy_data", .$name))
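# Optional defensive check (a sketch): confirm the download landed where expected
stopifnot(file.exists(file.path("data", "tidy_data",
                                "sparc-soil-p_full-plus-ancil-and-spatial.csv")))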
# Read that file in
all_v1 <- read.csv(file = file.path("data", "tidy_data",
                                    "sparc-soil-p_full-plus-ancil-and-spatial.csv"))
# Glimpse it!
dplyr::glimpse(all_v1)
## ------------------------------------------ ##
# Slim Down Columns -----
## ------------------------------------------ ##
# Only some columns are really needed from here on out
all_v2 <- all_v1 %>%
  dplyr::select(lter:core, treatment, dplyr::starts_with("rock_"), soil_code, generic_soil,
                dplyr::starts_with("mean.annual."), pH,
                dplyr::starts_with("horizon"), dplyr::starts_with("depth."),
                core.length_cm, bulk.density_g.cm3,
                dplyr::starts_with("Al_"), dplyr::starts_with("Fe"),
                dplyr::ends_with(".P_conc_mg.kg"),
                C_conc_percent, N_conc_percent) %>%
  # Drop non-unique rows (shouldn't be any but you never know)
  dplyr::distinct()
# See what we lost and make sure you're cool with that
supportR::diff_check(old = names(all_v1), new = names(all_v2), sort = T)
# Look at what you still kept
dplyr::glimpse(all_v2)
# Quick check: which datasets remain?
un <- tibble::tibble(dataset = unique(all_v2$dataset))
## ------------------------------------------ ##
# Handle Luquillo_2 Core Issue ----
## ------------------------------------------ ##
# LUQ_2 reports each core as separate 0-2 cm and 2-10 cm increments,
# so we need a depth-weighted average to get a single 0-10 cm value per core
luq2_v1 <- all_v2 %>%
  # Subset to just that dataset
  dplyr::filter(dataset_simp == "LUQ_2")
# Collapse the 0-2 and 2-10 cm depths into a single 0-10 cm value
luq2_v2 <- luq2_v1 %>%
  # Flip to long format (everything after the core metadata becomes name/value pairs)
  tidyr::pivot_longer(cols = -(lter:bulk.density_g.cm3)) %>%
  # Weight each increment by its share of the 0-10 cm profile (2/10 and 8/10)
  dplyr::mutate(value = dplyr::case_when(
    depth.start_cm == 0 & depth.end_cm == 2 ~ value * 0.2,
    depth.start_cm == 2 & depth.end_cm == 10 ~ value * 0.8,
    TRUE ~ value)) %>%
  # Fix depth values and core lengths so both increments share identical metadata
  dplyr::mutate(core.length_cm = dplyr::case_when(
    depth.start_cm == 0 & depth.end_cm == 2 ~ 10,
    depth.start_cm == 2 & depth.end_cm == 10 ~ 10,
    TRUE ~ core.length_cm)) %>%
  dplyr::mutate(depth.end_cm = ifelse(depth.start_cm == 0 & depth.end_cm == 2,
                                      yes = 10, no = depth.end_cm)) %>%
  dplyr::mutate(depth.start_cm = ifelse(depth.start_cm == 2 & depth.end_cm == 10,
                                        yes = 0, no = depth.start_cm)) %>%
  # Sum the weighted increments within each (now identical) set of metadata
  ## Note: `na.rm = TRUE` means a core where both increments are NA sums to 0
  dplyr::group_by(dplyr::across(.cols = -value)) %>%
  dplyr::summarize(value = sum(value, na.rm = TRUE)) %>%
  dplyr::ungroup() %>%
  # Reshape back to wide format
  tidyr::pivot_wider(names_from = name, values_from = value)
# Check structure
dplyr::glimpse(luq2_v2)
## Should go from 20 rows (two depth increments per core) to 10 (one per core)
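# Programmatic version of that check (a sketch; assumes exactly two depth
# increments per LUQ_2 core in the incoming data)
if (nrow(luq2_v2) != nrow(luq2_v1) / 2) {
  warning("LUQ_2 depth aggregation did not halve the row count; check depth increments")
}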
# Grab everything except LUQ_2
not_luq2 <- all_v2 %>%
  dplyr::filter(dataset_simp != "LUQ_2")
# Recombine
all_v3 <- dplyr::bind_rows(luq2_v2, not_luq2)
# Quick check: which datasets remain?
un <- tibble::tibble(dataset = unique(all_v3$dataset))
## ------------------------------------------ ##
# Remove Outliers ----
## ------------------------------------------ ##
# Begin by removing obvious outliers
stats_v1 <- all_v3 %>%
  # KBS has an outrageously high total P value in one sample (~5600 mg/kg)
  dplyr::filter(is.na(total.P_conc_mg.kg) | total.P_conc_mg.kg <= 5250)
# How many rows lost?
nrow(all_v3) - nrow(stats_v1)
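# Optional (a sketch): eyeball the total P distribution to sanity check that threshold
# (ggplot2 is loaded as part of the tidyverse)
ggplot2::ggplot(all_v3, ggplot2::aes(x = total.P_conc_mg.kg)) +
  ggplot2::geom_histogram(bins = 50) +
  ggplot2::geom_vline(xintercept = 5250, color = "red")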
## ------------------------------------------ ##
# Mineral Subsetting ----
## ------------------------------------------ ##
# One set of hypotheses requires only the top 0-10 cm of the mineral layer
mineral_v1 <- stats_v1 %>%
  # Keep only mineral layer (and mixed mineral/organic) data
  dplyr::filter(horizon_binary %in% c("mineral", "mixed") |
                  # Also keep unspecified horizon information (assumed mineral)
                  nchar(horizon_binary) == 0) %>%
  # For HBR, keep only the A horizon
  dplyr::filter((dataset == "Hubbard Brook" & horizon == "A") |
                  dataset != "Hubbard Brook") %>%
  # Coerce empty N/C percents into true NAs
  dplyr::mutate(C_conc_percent = ifelse(nchar(C_conc_percent) == 0,
                                        yes = NA, no = C_conc_percent),
                N_conc_percent = ifelse(nchar(N_conc_percent) == 0,
                                        yes = NA, no = N_conc_percent)) %>%
  # We also need either N or C information (in addition to P data) for statistics
  ## Note: this step removes HJA & KBS because neither has C or N data
  dplyr::filter(!is.na(C_conc_percent) | !is.na(N_conc_percent))
# Quick check: which datasets remain?
un2 <- tibble::tibble(dataset = unique(mineral_v1$dataset))
# How many rows does that lose?
nrow(stats_v1) - nrow(mineral_v1)
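# Optional (a sketch): confirm which datasets the C/N requirement dropped
# (per the note above, expect HJA and KBS)
setdiff(unique(stats_v1$dataset), unique(mineral_v1$dataset))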
# BNZ_1 cores do not start at 0 cm, so keep the top (shallowest) increment of each core instead
BNZ <- mineral_v1 %>%
  dplyr::filter(dataset_simp == "BNZ_1") %>%
  dplyr::group_by(site, plot, block, core) %>%
  dplyr::filter(depth.start_cm == min(depth.start_cm)) %>%
  dplyr::ungroup()
mineral_v2 <- mineral_v1 %>%
  dplyr::filter(dataset_simp != "BNZ_1") %>%
  # Keep only cores beginning at the top of the horizon
  ## Toolik_2, Luquillo_3, and Niwot_2 are exempted manually because their cores
  ## do not begin at 0 cm but should still be retained
  dplyr::filter(depth.start_cm == 0 |
                  dataset %in% c("Toolik_2", "Luquillo_3", "Niwot_2"))
# We previously also kept missing start depths (i.e., `nchar(depth.start_cm) == 0 |
# is.na(depth.start_cm)`) on the assumption they start at 0, but all such cases
# are now resolved upstream
# Recombine the BNZ_1 cores (filtered above to the top mineral layer of each core)
mineral_v2 <- dplyr::bind_rows(mineral_v2, BNZ)
# Quick check: which datasets remain?
un3 <- tibble::tibble(dataset = unique(mineral_v2$dataset))
# How many rows dropped at that step?
nrow(mineral_v1) - nrow(mineral_v2)
# Add a new treatment type category
## Assigned manually by consulting each site's raw data to recall what its treatment variable means
mineral_v2 <- mineral_v2 %>%
  # Coerce empty treatments into true NAs
  dplyr::mutate(treatment = ifelse(treatment == "", yes = NA, no = treatment)) %>%
  # Initialize treatment_type from the raw treatment value (NA where no treatment info exists);
  # the site-by-site assignments below then overwrite it with standardized labels
  dplyr::mutate(treatment_type = ifelse(is.na(treatment), yes = NA, no = treatment)) %>%
  dplyr::select(lter:treatment, treatment_type, dplyr::everything())
mineral_v2 <- mineral_v2 %>%
  dplyr::mutate(treatment_type = dplyr::case_when(
    dataset_simp == "Calhoun" ~ "landscape_position",
    dataset_simp == "NWT_1" ~ "location",
    dataset_simp == "SEV_1" ~ "site",
    dataset_simp == "HBR" ~ "siteage_yrs",
    ## "cover" may not be the best description for LUQ; the variable is just
    ## called "treatment" in the source data. Should verify with Whendee what it means
    dataset_simp == "LUQ_1" ~ "cover",
    dataset_simp == "LUQ_2" ~ "cover",
    dataset_simp == "Brazil_SA" ~ "burn_history",
    dataset_simp == "Brazil_AF" ~ "hillslope_position",
    ## Should distance be a block for SEV_2?
    dataset_simp == "SEV_2" ~ "location",
    dataset_simp == "ARC_1" ~ "core_ID",
    dataset_simp == "FCE_1" ~ "distance",
    ## Educated guess for KBS; need to double check this
    dataset_simp == "KBS" ~ "tillage",
    TRUE ~ treatment_type))
# Quick check: which datasets are represented?
unique(mineral_v2$dataset_simp)
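# Optional check (a sketch): which datasets still carry their raw treatment value
# as treatment_type (i.e., were not manually recoded above)?
mineral_v2 %>%
  dplyr::filter(!is.na(treatment_type) & treatment_type == treatment) %>%
  dplyr::distinct(dataset_simp)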
## ------------------------------------------ ##
# Mineral Export (Local) ----
## ------------------------------------------ ##
# Make a final object
final_mineral <- mineral_v2
# Define the name for this file
mineral_name <- "sparc-soil-p_stats-ready_mineral_0-10.csv"
# Export locally as a CSV to the "stats_ready" folder
write.csv(x = final_mineral, row.names = FALSE, na = "",
          file = file.path("data", "stats_ready", mineral_name))
## ------------------------------------------ ##
# Stats-Ready Export (Drive) ----
## ------------------------------------------ ##
# List all 'stats ready' files that we've made
( ready_files <- dir(path = file.path("data", "stats_ready")) )
# Loop across them, uploading each to Drive
for (focal_ready in ready_files) {
  googledrive::drive_upload(media = file.path("data", "stats_ready", focal_ready),
                            path = tidy_drive, overwrite = TRUE)
}
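# Optional (a sketch): list the Drive folder contents to confirm the upload(s)
googledrive::drive_ls(path = tidy_drive)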
# End ----