Skip to content

Commit

Permalink
made some final changes to the compile script for the 2024 data update
Browse files Browse the repository at this point in the history
  • Loading branch information
Melissa-Karp committed Jun 6, 2024
1 parent 7df0524 commit ef64a62
Showing 1 changed file with 70 additions and 54 deletions.
124 changes: 70 additions & 54 deletions data_processing_rcode/code/Compile_Dismap_Current.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## ---- DISMAP 05/29/2024 ---
## ---- DISMAP 06/6/2024 ---

## updated thru 2023 survey data for all regions except SEUS and Gmex(which is thru 2022)

Expand Down Expand Up @@ -65,7 +65,7 @@ library(data.table)
# Answer the following questions using all caps TRUE or FALSE to direct the actions of the script =====================================

# 1. Some strata and years have very little data, should they be removed and saved as fltr data? #DEFAULT: TRUE.
HQ_DATA_ONLY <- FALSE
HQ_DATA_ONLY <- TRUE

# 2. View plots of removed strata for HQ_DATA. #OPTIONAL, DEFAULT:FALSE
# It takes a while to generate these plots.
Expand Down Expand Up @@ -421,15 +421,17 @@ if (HQ_DATA_ONLY == TRUE){
summarise(count = n()) %>%
filter(count >= 12)

# how many rows will be lost if only years where all stratum sampled are kept?
# how many rows will be lost if only years where all stratum sampled are kept? and start timeseries in 1987
test2 <- ebs %>%
filter(year %in% test$year)
filter(year %in% test$year) %>%
filter(year != 1985)
nrow(ebs) - nrow(test2)
# percent that will be lost
print((nrow(ebs) - nrow(test2))/nrow(ebs))
# 8% of rows are removed
ebs_fltr <- ebs %>%
filter(year %in% test$year)
filter(year %in% test$year)%>%
filter(year != 1985)

p3 <- ebs_fltr %>%
select(stratum, year) %>%
Expand Down Expand Up @@ -814,26 +816,26 @@ wcann <- wcann %>%
wtcpue = total_catch_wt_kg/area_swept_ha_der
)

wcann$stratum<-ifelse(wcann$latitude_dd <=35.5 & wcann$depth_m<=183, "35.5-183",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_m <= 549, "35.5-549",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_m <= 1280, "35.5-1280",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_m > 1280, "35.5-2000",
ifelse(wcann$latitude_dd <=40.5 & wcann$depth_m<=183, "40.5-183",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_m <= 549, "40.5-549",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_m <= 1280, "40.5-1280",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_m > 1280, "40.5-2000",
ifelse(wcann$latitude_dd <=43.5 & wcann$depth_m<=183, "43.5-183",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_m <= 549, "43.5-549",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_m <= 1280, "43.5-1280",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_m > 1280, "43.5-2000",
wcann$stratum<-ifelse(wcann$latitude_dd <=35.5 & wcann$depth_hi_prec_m<=183, "35.5-183",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_hi_prec_m <= 549, "35.5-549",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_hi_prec_m <= 1280, "35.5-1280",
ifelse(wcann$latitude_dd <= 35.5 & wcann$depth_hi_prec_m > 1280, "35.5-2000",
ifelse(wcann$latitude_dd <=40.5 & wcann$depth_hi_prec_m<=183, "40.5-183",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_hi_prec_m<= 549, "40.5-549",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_hi_prec_m <= 1280, "40.5-1280",
ifelse(wcann$latitude_dd <= 40.5 & wcann$depth_hi_prec_m > 1280, "40.5-2000",
ifelse(wcann$latitude_dd <=43.5 & wcann$depth_hi_prec_m<=183, "43.5-183",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_hi_prec_m <= 549, "43.5-549",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_hi_prec_m <= 1280, "43.5-1280",
ifelse(wcann$latitude_dd <= 43.5 & wcann$depth_hi_prec_m > 1280, "43.5-2000",
# ifelse(wcann$latitude_dd <=47.5 & wcann$depth_m<=183, "47.5-183",
# ifelse(wcann$latitude_dd <= 47.5 & wcann$depth_m <= 549, "47.5-549",
# ifelse(wcann$latitude_dd <= 47.5 & wcann$depth_m <= 1280, "47.5-1280",
# ifelse(wcann$latitude_dd <= 47.5 & wcann$depth_m > 1280, "47.5-2000",
ifelse(wcann$latitude_dd <=50.5 & wcann$depth_m<=183, "50.5-183",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_m <= 549, "50.5-549",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_m <= 1280, "50.5-1280",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_m > 1280, "50.5-2000",NA))))))))))))))))
ifelse(wcann$latitude_dd <=50.5 & wcann$depth_hi_prec_m<=183, "50.5-183",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_hi_prec_m <= 549, "50.5-549",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_hi_prec_m <= 1280, "50.5-1280",
ifelse(wcann$latitude_dd <= 50.5 & wcann$depth_hi_prec_m > 1280, "50.5-2000",NA))))))))))))))))
wcann_strats <- wcann %>%
filter(!is.na(wtcpue)) %>%
group_by(stratum) %>%
Expand All @@ -845,7 +847,7 @@ wcann <- left_join(wcann, wcann_strats, by = "stratum")
wcann <- wcann %>%
rename(lat = latitude_dd,
lon = longitude_dd,
depth = depth_m,
depth = depth_hi_prec_m,
spp = scientific_name) %>%
# remove non-fish
filter(spp != "" &
Expand Down Expand Up @@ -899,15 +901,15 @@ if (HQ_DATA_ONLY == TRUE){
distinct() %>%
group_by(stratum) %>%
summarise(count = n()) %>%
filter(count>=19)
filter(count>=20)

# how many rows will be lost if only stratum trawled ever year are kept?
test2 <- wcann %>%
filter(stratum %in% test$stratum)
nrow(wcann) - nrow(test2)
# percent that will be lost
print((nrow(wcann) - nrow(test2))/nrow(wcann))
# 23% of rows are removed

wcann_fltr <- wcann %>%
#filter(year != 2019)%>%
filter(stratum %in% test$stratum)
Expand Down Expand Up @@ -1393,14 +1395,14 @@ if (HQ_DATA_ONLY == TRUE){
summarise(count = n())%>%
filter(count >= 46)

# how many rows will be lost if only stratum trawled ever year are kept (47 years)?
# how many rows will be lost if only stratum trawled fairly consistently (>46 years - so all but 2 of the years) are kept?
test2 <- neus_fall %>%
filter(year != 2017, year > 1973) %>%
filter(stratum %in% test$stratum)
nrow(neus_fall) - nrow(test2)
# percent that will be lost
print((nrow(neus_fall) - nrow(test2))/nrow(neus_fall))
# When bad strata are removed after bad years we only lose 38%
# When bad strata are removed after bad years we only lose 34%

neus_fall_fltr <- neus_fall %>%
filter(year != 2017, year > 1973) %>%
Expand Down Expand Up @@ -1497,15 +1499,15 @@ if (HQ_DATA_ONLY == TRUE){

# how many rows will be lost if only stratum trawled ALMOST ever year are kept?
test2 <- neus_spring %>%
filter(year != 2020,year != 2014, year != 1975, year > 1973) %>%
filter(year!= 2023, year != 2020,year != 2014, year != 1975, year > 1973) %>%
filter(stratum %in% test$stratum)
nrow(neus_spring) - nrow(test2)
# percent that will be lost
(nrow(neus_spring) - nrow(test2))/nrow(neus_spring)
# When bad strata are removed after bad years we only lose 35%

neus_spring_fltr <- neus_spring %>%
filter(year != 2020,year != 2014, year != 1975, year > 1973) %>%
filter(year!= 2023, year != 2020,year != 2014, year != 1975, year > 1973) %>%
filter(stratum %in% test$stratum)

p3 <- neus_spring_fltr %>%
Expand Down Expand Up @@ -1720,10 +1722,11 @@ seus <- left_join(seus, biomass, by = c("haulid", "stratum", "stratumarea", "yea
# double check that column numbers haven't changed by more than 2.

seus <- seus %>%
# remove non-fish
# remove non-fish and records with no species or common name
filter(
!spp %in% c('MISCELLANEOUS INVERTEBRATES','XANTHIDAE','MICROPANOPE NUTTINGI','ALGAE','DYSPANOPEUS SAYI', 'PSEUDOMEDAEUS AGASSIZII')
) %>%
filter(!is.na(spp)) %>%
# adjust spp names
mutate(
spp = ifelse(grepl("ANCHOA", spp), "ANCHOA", spp),
Expand Down Expand Up @@ -1766,7 +1769,7 @@ if (HQ_DATA_ONLY == TRUE){
distinct() %>%
group_by(stratum) %>%
summarise(count = n()) %>%
filter(count >= 29) # strata sampled in 90% of years!
filter(count >= 29) # strata sampled all but a few year!!

# how many rows will be lost if only stratum trawled ever year are kept?
test2 <- seusSPRING %>%
Expand Down Expand Up @@ -1806,64 +1809,77 @@ seusSUMMER <- seus %>%

if (HQ_DATA_ONLY == TRUE){
# look at the graph and make sure decisions to keep or eliminate data make sense
#no need to filter, but rename dataset for consistency
seusSUMMER_fltr <- seusSUMMER
p1 <- seusSUMMER_fltr %>%

p1 <- seusSUMMER %>%
select(stratum, year) %>%
ggplot(aes(x = as.factor(stratum), y = as.factor(year))) +
geom_jitter()

p2 <- seusSUMMER_fltr %>%
p2 <- seusSUMMER %>%
select(lat, lon) %>%
ggplot(aes(x = lon, y = lat)) +
geom_jitter()

#2021 was poorly sampled, so should be removed from data
seusSUMMER_fltr <- seusSUMMER %>%
filter(year!=2021)

p3 <- seusSUMMER_fltr %>%
select(stratum, year) %>%
ggplot(aes(x = as.factor(stratum), y = as.factor(year))) +
geom_jitter()

p4 <- seusSUMMER_fltr %>%
select(lat, lon) %>%
ggplot(aes(x = lon, y = lat)) +
geom_jitter()

if (HQ_PLOTS == TRUE){
temp <- grid.arrange(p1, p2, nrow = 2)
temp <- grid.arrange(p1, p2, p3, p4, nrow = 2)
ggsave(plot = temp, filename = here::here("output/plots", "seusSUM_hq_dat_removed.png"))
rm(temp)
}
rm(p1, p2)
rm(p1, p2, p3, p4)
}
# no missing data


# SEUS fall ====
seusFALL <- seus %>%
filter(SEASON == "fall") %>%
select(-SEASON) %>%
mutate(region = "Southeast US Fall")


# how many rows will be lost if only stratum trawled ever year are kept?
if (HQ_DATA_ONLY == TRUE){

p1 <- seusFALL %>%
select(stratum, year) %>%
ggplot(aes(x = as.factor(stratum), y = as.factor(year))) +
geom_jitter()

p2 <- seusFALL %>%
select(lat, lon) %>%
ggplot(aes(x = lon, y = lat)) +
geom_jitter()

test <- seusFALL %>%
filter(year != 2018, year != 2019) %>%
#filter(year != 2018, year != 2019) %>%
select(stratum, year) %>%
distinct() %>%
group_by(stratum) %>%
summarise(count = n()) %>%
filter(count >= 27)
filter(count >= 31)

test2 <- seusFALL %>%
filter(year != 2018, year != 2019) %>%
#filter(year != 2018, year != 2019) %>%
filter(stratum %in% test$stratum)
nrow(seusFALL) - nrow(test2)
# percent that will be lost
print((nrow(seusFALL) - nrow(test2))/nrow(seusFALL))
# 5.1% are removed

p1 <- seusFALL %>%
select(stratum, year) %>%
ggplot(aes(x = as.factor(stratum), y = as.factor(year))) +
geom_jitter()

p2 <- seusFALL %>%
select(lat, lon) %>%
ggplot(aes(x = lon, y = lat)) +
geom_jitter()

seusFALL_fltr <- seusFALL %>%
filter(year != 2018, year != 2019) %>%
#filter(year != 2018, year != 2019) %>%
filter(stratum %in% test$stratum)

# plot the results after editing
Expand Down Expand Up @@ -3274,7 +3290,7 @@ dat$spp<-firstup(dat$spp)
#Species Taxon checkpoint before proceeding!!
# Check if any new species are in survey data sets before proceeding....take the 'dat' file that combines the individual regions but before joined with 'spp_taxonomy' file
dat_spp <- dat %>%
select(spp) %>%
select(spp,region) %>%
distinct() %>%
mutate(spp_id = 1:nrow(.))

Expand All @@ -3287,7 +3303,7 @@ not_in_tax<- not_in_tax %>% group_by(spp) %>%

# add a case sensitive spp and common name
dat <- left_join(dat, tax, by = c("spp" = "survey_name")) %>%
select(region, haulid, year, lat, lon, stratum, stratumarea, depth, spp, accepted_name, common, rank, wtcpue) %>%
select(region, haulid, year, lat, lon, stratum, stratumarea, depth, spp, valid_name, common, rank, wtcpue) %>%
distinct()

#check for errors in name matching
Expand Down

0 comments on commit ef64a62

Please sign in to comment.