RMI-PACTA · jdhoffa · Feb 12, 2024 · Feb 5, 2024 · Feb 5, 2024 · Feb 5, 2024
@@ -8,9 +8,43 @@ isic_classification <- read_bridge(
 )
 use_data(isic_classification, overwrite = TRUE)
 
-nace_classification <- read_bridge(
+# Read the NACE bridge as stored in data-raw. The final `code` column is
+# derived from its `original_code` column in the pipeline that follows.
+nace_classification_raw <- read_bridge(
   file.path("data-raw", "nace_classification.csv")
 )
+
+# Build `code` by prepending the NACE section letter to `original_code`.
+# The section is looked up from the division, i.e. the integer part of the
+# numeric code. Codes that are themselves a section letter are kept unchanged,
+# and any code that matches no known division gets the debug prefix "Z".
+nace_classification <- nace_classification_raw %>%
+  mutate(
+    is_section = original_code %in% LETTERS,
+    # Blank out section letters before coercing, so that `as.numeric()` only
+    # warns about codes that are genuinely malformed. The division is computed
+    # once here instead of once per branch.
+    division = trunc(as.numeric(replace(original_code, is_section, NA))),
+    prepend_value = case_when(
+      is_section ~ "",
+      division %in% 1:3 ~ "A",
+      division %in% 5:9 ~ "B",
+      division %in% 10:33 ~ "C",
+      division == 35 ~ "D",
+      division %in% 36:39 ~ "E",
+      division %in% 41:43 ~ "F",
+      division %in% 45:47 ~ "G",
+      division %in% 49:53 ~ "H",
+      division %in% 55:56 ~ "I",
+      division %in% 58:63 ~ "J",
+      division %in% 64:66 ~ "K",
+      division == 68 ~ "L",
+      division %in% 69:75 ~ "M",
+      division %in% 77:82 ~ "N",
+      division == 84 ~ "O",
+      division == 85 ~ "P",
+      division %in% 86:88 ~ "Q",
+      division %in% 90:93 ~ "R",
+      division %in% 94:96 ~ "S",
+      division %in% 97:98 ~ "T",
+      division == 99 ~ "U",
+      TRUE ~ "Z" # debug value, see unit tests
+    ),
+    code = paste0(prepend_value, original_code),
+    # Drop the helper columns, leaving `code` as the only new or changed column.
+    is_section = NULL,
+    division = NULL,
+    prepend_value = NULL
+  )
+
 use_data(nace_classification, overwrite = TRUE)
 
 naics_classification <- read_bridge(

@@ -57,7 +57,7 @@ INFORMATION AND COMMUNICATION,1,1100,not in scope,FALSE
 INFORMATION AND COMMUNICATION,1,1100,not in scope,FALSE
 INFORMATION AND COMMUNICATION,1,1100,not in scope,FALSE
 INFORMATION AND COMMUNICATION,1,1100,not in scope,FALSE
-MANUFACTURING,1,1200,not in scope,FALSE
+MANUFACTURING,1,1200,steel,TRUE
 MANUFACTURING,1,1200,steel,TRUE
 MANUFACTURING,1,1200,steel,TRUE
 MANUFACTURING,1,1200,steel,TRUE

@@ -199,7 +199,6 @@ code,code_level,sector,borderline,description
 151020,3,cement,TRUE,construction materials
 151030,3,not in scope,FALSE,containers & packaging
 151040,3,steel,TRUE,metals & mining
-151040,3,coal,TRUE,metals & mining
 151050,3,not in scope,FALSE,paper & forest products
 201010,3,not in scope,FALSE,aerospace & defense
 201020,3,cement,TRUE,building products
@@ -244,7 +243,7 @@ code,code_level,sector,borderline,description
 402010,3,not in scope,FALSE,diversified financial services
 402020,3,not in scope,FALSE,consumer finance
 402030,3,not in scope,FALSE,capital markets
-402040,3,not in scope,FALSE,"mortgage real estate investment 
+402040,3,not in scope,FALSE,"mortgage real estate investment
 trusts (reits)"
 403010,3,not in scope,FALSE,insurance
 404010,3,not in scope,FALSE,real estate -- discontinued effective 04/28/2006

@@ -27,3 +27,24 @@ test_that("In classification datasets, `code` is of type 'character' (#185)", {
   types <- unlist(lapply(datasets, function(x) typeof(x[["code"]])))
   expect_equal(unique(types), "character")
 })
+
+test_that("In classification datasets, values of `code` are unique per sector and dataset (#229)", {
+  datasets <- enlist_datasets("r2dii.data", "classification")
+
+  # FIXME: The datasets dropped here are known to violate this expectation.
+  # They will be deprecated; the deprecation process is tracked in #329.
+  known_offenders <- c(
+    "cnb_classification",
+    "isic_classification",
+    "sector_classifications"
+  )
+  datasets[known_offenders] <- NULL
+
+  # A dataset passes when, after collapsing repeated (code, sector) pairs, no
+  # non-missing code is left paired with more than one sector.
+  for (i in seq_along(datasets)) {
+    code_sector_pairs <- unique(datasets[[i]][c("code", "sector")])
+    codes <- code_sector_pairs$code
+    has_unique_codes <- anyDuplicated(codes[!is.na(codes)]) == 0
+    expect_true(has_unique_codes, info = names(datasets)[i])
+  }
+})
@@ -24,3 +24,8 @@ test_that("has `sector` values that are lowercase not uppercase", {
   sectors <- sort(unique(sector_classifications$sector))
   expect_equal(sectors, tolower(sectors))
 })
+
+test_that("debug value does not appear in `nace_classification`", {
+  # The data-raw script prepends the debug prefix "Z" to `code` (never to
+  # `original_code`) when a NACE code matches no known section, so `code` is
+  # the column that has to be inspected.
+  nace_has_debug_value <- any(grepl("^Z", nace_classification$code))
+  expect_false(nace_has_debug_value)
+})