Merge pull request #95 from darwin-eu/develop

v1.7.0
darwin-eu · Aug 16, 2023 · d25f8f6 · d25f8f6
2 parents 797bb0f + 9930f18
commit d25f8f6
Show file tree

Hide file tree

Showing 17 changed files with 849 additions and 28 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: CodelistGenerator
 Title: Generate Code Lists for the OMOP Common Data Model
-Version: 1.6.0
+Version: 1.7.0
 Authors@R: c(
     person("Edward", "Burn", email = "[email protected]", 
     role = c("aut", "cre"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# CodelistGenerator 1.7.0
+* Added function codesFromCohort.
+
 # CodelistGenerator 1.6.0
 * Improved getICD10StandardCodes function.
 * Added function codesFromConceptSet.

diff --git a/R/codesFromConceptSet.R b/R/codesFromConceptSet.R
@@ -82,15 +82,18 @@ codesFromConceptSet <- function(path, cdm, withConceptDetails = FALSE) {
 
 #' Get concept ids from a provided path to cohort json files
 #'
-#' @param path Path to a file or folder containing JSONs of concept sets
+#' @param path Path to a file or folder containing JSONs of cohort definitions
 #' @param cdm A cdm reference created with CDMConnector
+#' @param withConceptDetails If FALSE a vector of concept IDs will be returned
+#' for each concept set. If TRUE a tibble will be returned with additional
+#' information on the identified concepts.
 #'
 #' @return Named list with concept_ids for each concept set
 #' @export
 #'
-codesFromCohort <- function(path, cdm) {
+codesFromCohort <- function(path, cdm, withConceptDetails = FALSE) {
   # initial checks
-  #checkInput(path = path, cdm = cdm)
+  checkInputs(path = path, cdm = cdm)
 
   # list jsons
   files <- listJsonFromPath(path)
@@ -112,6 +115,11 @@ codesFromCohort <- function(path, cdm) {
   # split into list
   codelist <- tibbleToList(codelistTibble)
 
+  if(isTRUE(withConceptDetails)){
+    codelist <- addDetails(conceptList = codelist,
+                                   cdm = cdm)
+  }
+
   # return
   return(codelist)
 }
@@ -146,7 +154,12 @@ extractCodes <- function(file, unknown) {
     conceptId <- NULL
     includeDescendants <- NULL
     isExcluded <- NULL
+
     for (j in seq_along(concepts)) {
+       if(!is.null(concepts[[j]][["includeMapped"]])){
+        cli::cli_abort(
+          glue::glue("Mapped as TRUE not supported (found in {name})"))
+      }
       conceptId <- c(conceptId, concepts[[j]][["concept"]][["CONCEPT_ID"]])
       exc <- concepts[[j]][["isExcluded"]]
       isExcluded <- c(
@@ -161,7 +174,8 @@ extractCodes <- function(file, unknown) {
       dplyr::union_all(dplyr::tibble(
         codelist_name = name, concept_id = conceptId,
         include_descendants = includeDescendants, is_excluded = isExcluded
-      ))
+      ) %>%
+        dplyr::mutate(filename = file))
   }
   return(codelistTibble)
 }
@@ -198,15 +212,40 @@ excludeCodes <- function(codelistTibble) {
 }
 
 tibbleToList <- function(codelistTibble) {
-  nam <- unique(codelistTibble$codelist_name)
+
+  codelistTibble <- codelistTibble %>%
+    dplyr::mutate(nam = paste0(.data$codelist_name, "; ",
+                               .data$filename))
+
+  nam <- unique(codelistTibble$nam)
   codelist <- lapply(nam, function(x) {
     codelistTibble %>%
-      dplyr::filter(.data$codelist_name == .env$x) %>%
+      dplyr::filter(.data$nam == .env$x) %>%
       dplyr::pull("concept_id") %>%
       unique()
   })
   names(codelist) <- nam
-  return(codelist)
+
+
+  # check if we have any concept sets with the same name but different definitions
+  # keep first for each name
+  cs_names <- stringr::str_extract(names(codelist), "^[^;]*")
+  cs_names_unique <- unique(cs_names)
+
+  codelist_dedup <- list()
+
+  for(i in seq_along(cs_names_unique)){
+   same_name_cs <- codelist[which(cs_names_unique[i] == cs_names)]
+   check_consistent <- all(sapply(same_name_cs, identical, same_name_cs[[1]]))
+
+   if(isFALSE(check_consistent)){
+     cli::cli_abort(message = "Different definitions for concept set {cs_names_unique[i]} found")
+   }
+   # keep first
+   codelist_dedup[[cs_names_unique[i]]] <- same_name_cs[[1]]
+  }
+
+  return(codelist_dedup)
 }
 
 addDetails <- function(conceptList, cdm){

diff --git a/R/drugCodes.R b/R/drugCodes.R
@@ -85,13 +85,14 @@ getATCCodes <- function(cdm,
   checkmate::assertTRUE(atcCheck, add = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
 
+  # to avoid potential memory problems will batch
+  if (nrow(atc_groups) > 0) {
+    atc_descendants <- fetchBatchedDescendants(cdm = cdm,
+                                  codes = atc_groups$concept_id,
+                                  batchSize = 500,
+                                  doseForm = doseForm)
+}
 
-  atc_descendants <- getDescendants(
-    cdm = cdm,
-    conceptId = atc_groups$concept_id,
-    withAncestor = TRUE,
-    doseForm = doseForm
-  )
   if (nrow(atc_descendants) > 0) {
     atc_descendants <- atc_descendants %>%
       dplyr::select("concept_id", "concept_name",
@@ -207,7 +208,7 @@ getDrugIngredientCodes <- function(cdm,
 
   # to avoid potential memory problems will batch
   if (nrow(ingredientConcepts) > 0) {
-    ingredientCodes <- fetchBatchedDrugIngredientCodes(cdm,
+    ingredientCodes <- fetchBatchedDescendants(cdm,
       codes = ingredientConcepts$concept_id,
       batchSize = 500,
       doseForm = doseForm
@@ -267,7 +268,7 @@ getDrugIngredientCodes <- function(cdm,
     return(ingredientCodes)
 }
 
-fetchBatchedDrugIngredientCodes <- function(cdm, codes, batchSize, doseForm) {
+fetchBatchedDescendants <- function(cdm, codes, batchSize, doseForm) {
   codeBatches <- split(
     codes,
     ceiling(seq_along(codes) / batchSize)
@@ -277,7 +278,7 @@ fetchBatchedDrugIngredientCodes <- function(cdm, codes, batchSize, doseForm) {
 
   cli::cli_progress_bar(
     total = length(descendants),
-    format = " -- getting descendants {cli::pb_bar} {cli::pb_current} of {cli::pb_total} ingredient groups"
+    format = " -- getting descendants {cli::pb_bar} {cli::pb_current} of {cli::pb_total} batched groups"
   )
   for (i in seq_along(descendants)) {
     cli::cli_progress_update()

diff --git a/R/mockVocabRef.R b/R/mockVocabRef.R
@@ -322,7 +322,7 @@ mockVocabRef <- function(backend = "database") {
     CDMConnector::stow(cdm, dOut)
 
     if (backend == "arrow") {
-      if(utils::packageVersion("CDMConnector")<"1.10"){
+      if(utils::packageVersion("CDMConnector")<"1.1.0"){
       cdmArrow <- CDMConnector::cdm_from_files(
         path = dOut,
         as_data_frame = FALSE
@@ -338,7 +338,7 @@ mockVocabRef <- function(backend = "database") {
     }
 
     if (backend == "data_frame") {
-      if(utils::packageVersion("CDMConnector")<"1.10"){
+      if(utils::packageVersion("CDMConnector")<"1.1.0"){
         cdmDF <- CDMConnector::cdm_from_files(
           path = dOut,
           as_data_frame = TRUE

diff --git a/R/summariseCodeUse.R b/R/summariseCodeUse.R
@@ -87,12 +87,6 @@ summariseCodeUse <- function(x,
       .data$estimate_suppressed == "TRUE",
       NA, .data$estimate))
 
-  if(!"record" %in% countBy){
-    codeCounts <- codeCounts %>%
-      dplyr::mutate(concept_name= NA,
-                    concept_id =NA)
-  }
-
   codeCounts <- codeCounts %>%
     dplyr::mutate(group_level = dplyr::if_else(.data$group_name == "By concept",
                                         paste0(.data$concept_name, " (",

diff --git a/inst/cohorts/cohorts.json b/inst/cohorts/cohorts.json
@@ -0,0 +1,88 @@
+{
+  "ConceptSets": [
+    {
+      "id": 0,
+      "name": "acetaminophen",
+      "expression": {
+        "items": [
+          {
+            "concept": {
+              "CONCEPT_CLASS_ID": "Ingredient",
+              "CONCEPT_CODE": "161",
+              "CONCEPT_ID": 1125315,
+              "CONCEPT_NAME": "acetaminophen",
+              "DOMAIN_ID": "Drug",
+              "INVALID_REASON": "V",
+              "INVALID_REASON_CAPTION": "Valid",
+              "STANDARD_CONCEPT": "S",
+              "STANDARD_CONCEPT_CAPTION": "Standard",
+              "VOCABULARY_ID": "RxNorm",
+              "VALID_START_DATE": "1970-01-01",
+              "VALID_END_DATE": "2099-12-31"
+            },
+            "includeDescendants": true
+          }
+        ]
+      }
+    },
+    {
+      "id": 1,
+      "name": "influenza",
+      "expression": {
+        "items": [
+          {
+            "concept": {
+              "CONCEPT_CLASS_ID": "Clinical Finding",
+              "CONCEPT_CODE": "6142004",
+              "CONCEPT_ID": 4266367,
+              "CONCEPT_NAME": "Influenza",
+              "DOMAIN_ID": "Condition",
+              "INVALID_REASON": "V",
+              "INVALID_REASON_CAPTION": "Valid",
+              "STANDARD_CONCEPT": "S",
+              "STANDARD_CONCEPT_CAPTION": "Standard",
+              "VOCABULARY_ID": "SNOMED",
+              "VALID_START_DATE": "2002-01-31",
+              "VALID_END_DATE": "2099-12-31"
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "PrimaryCriteria": {
+    "CriteriaList": [
+      {
+        "ConditionOccurrence": {
+          "CodesetId": 1
+        }
+      },
+      {
+        "DrugExposure": {
+          "CodesetId": 0
+        }
+      }
+    ],
+    "ObservationWindow": {
+      "PriorDays": 0,
+      "PostDays": 0
+    },
+    "PrimaryCriteriaLimit": {
+      "Type": "First"
+    }
+  },
+  "QualifiedLimit": {
+    "Type": "First"
+  },
+  "ExpressionLimit": {
+    "Type": "First"
+  },
+  "InclusionRules": [],
+  "CensoringCriteria": [],
+  "CollapseSettings": {
+    "CollapseType": "ERA",
+    "EraPad": 0
+  },
+  "CensorWindow": {},
+  "cdmVersionRange": ">=5.0.0"
+}
diff --git a/inst/cohorts/extra_file.txt b/inst/cohorts/extra_file.txt
diff --git a/inst/cohorts_for_mock/oa_desc.json b/inst/cohorts_for_mock/oa_desc.json
@@ -0,0 +1,89 @@
+{
+  "ConceptSets": [
+    {
+      "id": 0,
+      "name": "OA",
+      "expression": {
+        "items": [
+          {
+            "concept": {
+              "CONCEPT_CLASS_ID": "Clinical Finding",
+              "CONCEPT_CODE": "396275006",
+              "CONCEPT_ID": 3,
+              "CONCEPT_NAME": "Osteoarthritis",
+              "DOMAIN_ID": "Condition",
+              "INVALID_REASON": "V",
+              "INVALID_REASON_CAPTION": "Valid",
+              "STANDARD_CONCEPT": "S",
+              "STANDARD_CONCEPT_CAPTION": "Standard",
+              "VOCABULARY_ID": "SNOMED"
+            },
+            "includeDescendants": true
+          }
+        ]
+      }
+    },
+    {
+      "id": 1,
+      "name": "Other",
+      "expression": {
+        "items": [
+          {
+            "concept": {
+              "CONCEPT_CLASS_ID": "Clinical Finding",
+              "CONCEPT_CODE": "422504002",
+              "CONCEPT_ID": 5,
+              "CONCEPT_NAME": "Ischemic stroke",
+              "DOMAIN_ID": "Condition",
+              "INVALID_REASON": "V",
+              "INVALID_REASON_CAPTION": "Valid",
+              "STANDARD_CONCEPT": "S",
+              "STANDARD_CONCEPT_CAPTION": "Standard",
+              "VOCABULARY_ID": "SNOMED"
+            }
+          }
+        ]
+      }
+    }
+  ],
+  "PrimaryCriteria": {
+    "CriteriaList": [
+      {
+        "ConditionOccurrence": {
+          "CodesetId": 0
+        }
+      },
+      {
+        "Observation": {
+          "CodesetId": 0
+        }
+      },
+      {
+        "ConditionOccurrence": {
+          "CodesetId": 1
+        }
+      }
+    ],
+    "ObservationWindow": {
+      "PriorDays": 0,
+      "PostDays": 0
+    },
+    "PrimaryCriteriaLimit": {
+      "Type": "First"
+    }
+  },
+  "QualifiedLimit": {
+    "Type": "First"
+  },
+  "ExpressionLimit": {
+    "Type": "First"
+  },
+  "InclusionRules": [],
+  "CensoringCriteria": [],
+  "CollapseSettings": {
+    "CollapseType": "ERA",
+    "EraPad": 0
+  },
+  "CensorWindow": {},
+  "cdmVersionRange": ">=5.0.0"
+}