Skip to content

Commit

Permalink
implement assert_no_duplicate(by=)
Browse files Browse the repository at this point in the history
  • Loading branch information
DanChaltiel committed Mar 31, 2024
1 parent da92ec4 commit cf37aaa
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ EDCimport is a package designed to easily import data from EDC software TrialMas

- `extend_lookup()` will not fail anymore when the database has a faulty table

- `assert_no_duplicate()` now has a `by` argument to check for duplicates within groups, for example by visit.

- `find_keyword()` is more robust and reports the proportion of missing values when possible

- `read_trialmaster()` will output a readable error when no password is entered although one is needed
Expand Down
31 changes: 19 additions & 12 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,21 +136,28 @@ check_subjid = function(x, ref=getOption("edc_subjid_ref")){
#' tibble(subjid=c(1:10)) %>% assert_no_duplicate() %>% nrow()
#'
#' #with duplicate => throws an error
#' #tibble(subjid=c(1:10, 1:2)) %>% assert_no_duplicate() %>% nrow()
assert_no_duplicate = function(df, id_col=get_key_cols()){
if(is.list(id_col)) id_col = id_col$patient_id
#' tibble(subjid=c(1:10, 1:2)) %>% assert_no_duplicate() %>% nrow()
assert_no_duplicate = function(df, by=NULL, id_col=get_subjid_cols()){
env = current_env()
x = df %>% select(any_of2(id_col))
if(ncol(x)==0){
cli_abort("Cannot assert the absence of duplicates: no ID column ({.val {id_col}}).",
call=env, class="edcimport_assert_no_duplicate_no_col")
id_col_selected = id_col[tolower(id_col) %in% tolower(names(df))]

if(length(id_col_selected) == 0){
cli_abort("Cannot assert the absence of duplicates: no ID column ({.val {id_col}}) in `names(df)`.",
class="edcimport_assert_no_duplicate_no_col")
}
if(length(id_col_selected) > 1){
cli_abort("Cannot assert the absence of duplicates: too many ID column ({.val {id_col_selected}}) in `names(df)`.",
class="edcimport_assert_no_duplicate_many_col")
}

y = x %>% map(duplicated) %>% head(1) #y is scalar
if(any(unlist(y))){
dups = x[[1]][unlist(y)] %>% unique() %>% sort()
dups = head(dups, 10) #because of https://github.com/r-lib/cli/issues/617
cli_abort("Duplicate on column {.val { names(y)}} for {qty(length(dups))} value{?s} {.val {dups}}.",
x = df %>%
select(any_of2(id_col_selected), {{by}}) %>%
count(across(everything())) %>%
filter(n>1)

if(nrow(x)>0){
dups = head(x[[1]], 10) #because of https://github.com/r-lib/cli/issues/617
cli_abort("Duplicate on column {.val { names(x[1])}} for {qty(length(x[[1]]))} value{?s} {.val {dups}}.",
call=env, class="edcimport_assert_no_duplicate")
}
df
Expand Down

0 comments on commit cf37aaa

Please sign in to comment.