Add topics as a column (#257)

* add topics as a column (replacing concepts in most entities except Works) * make process_topics internal * document duh! * fix per June's feedback
ropensci · Jun 25, 2024 · 35ef5cb · 35ef5cb
1 parent cc8d0fe
commit 35ef5cb
Show file tree

Hide file tree

Showing 15 changed files with 216 additions and 127 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # openalexR 1.4.0
-* "topics" are now available in oa_fetch
+* "topics" are now a valid entity in oa_fetch
+* The column "topics" replaces concepts in most entities' returned dataframes
+* For Works, "topics" and "concepts" are now returned as separate columns
 
 # openalexR 1.3.1
 * solved bug in au_affiliation_raw in PR#241

diff --git a/R/oa2df.R b/R/oa2df.R
@@ -254,32 +254,10 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
       names(open_access)[[1]] <- "is_oa_anywhere"
     }
 
-    # Topics
-    process_paper_topics <- function(paper) {
-      topics <- paper$topics
-      if (is.null(topics)) {
-        return(NULL)
-      }
-      topics_ls <- lapply(seq_along(topics), function(i) {
-        topic <- topics[[i]]
-        relev <- c(
-          # Hoist fields for the topic entity
-          list(topic = topic[c("id", "display_name")]),
-          # Keep info about other entities as-is
-          topic[vapply(topic, is.list, logical(1))]
-        )
-        relev_df <- subs_na(relev, "rbind_df")[[1]]
-        relev_df <- tibble::rownames_to_column(relev_df, "name")
-        cbind(i = i, score = topic$score, relev_df)
-      })
-      topics_df <- do.call(rbind.data.frame, topics_ls)
-      list(tibble::as_tibble(topics_df))
-    }
-    topics <- process_paper_topics(paper)
-
+    topics <- process_topics(paper, "score")
     out_ls <- c(
       sim_fields, venue, open_access, paper_biblio,
-      list(author = author, ab = ab, topics = topics)
+      list(author = author, ab = ab), topics
     )
     out_ls[sapply(out_ls, is.null)] <- NULL
     list_df[[i]] <- out_ls
@@ -363,7 +341,6 @@ authors2df <- function(data, verbose = TRUE,
     "identical", "relevance_score",
     "flat", "display_name_alternatives",
     "rbind_df", "counts_by_year",
-    "rbind_df", "x_concepts",
     "flat", "ids"
   )
 
@@ -397,8 +374,8 @@ authors2df <- function(data, verbose = TRUE,
       }
       sub_affiliation$affiliations_other <- list(affiliations_other)
     }
-
-    list_df[[i]] <- c(sim_fields, sub_affiliation)
+    topics <- process_topics(item, "count")
+    list_df[[i]] <- c(sim_fields, sub_affiliation, topics)
   }
 
   col_order <- c(
@@ -407,7 +384,7 @@ authors2df <- function(data, verbose = TRUE,
     "affiliation_display_name", "affiliation_id", "affiliation_ror",
     "affiliation_country_code", "affiliation_type", "affiliation_lineage",
     "affiliations_other",
-    "x_concepts", "works_api_url"
+    "topics", "works_api_url"
   )
 
   out_df <- rbind_oa_ls(list_df)
@@ -473,7 +450,6 @@ institutions2df <- function(data, verbose = TRUE,
     "flat", "display_name_acronyms",
     "row_df", "geo",
     "rbind_df", "counts_by_year",
-    "rbind_df", "x_concepts",
     "rbind_df", "associated_institutions",
     "flat", "ids"
   )
@@ -498,7 +474,8 @@ institutions2df <- function(data, verbose = TRUE,
         )
       )
     }
-    list_df[[i]] <- c(sim_fields, interna)
+    topics <- process_topics(item, "count")
+    list_df[[i]] <- c(sim_fields, interna, topics)
   }
 
 
@@ -508,7 +485,7 @@ institutions2df <- function(data, verbose = TRUE,
     "homepage_url", "image_url", "image_thumbnail_url",
     "associated_institutions", "relevance_score", "works_count",
     "cited_by_count", "counts_by_year",
-    "works_api_url", "x_concepts", "updated_date", "created_date"
+    "works_api_url", "topics", "updated_date", "created_date"
   )
 
   out_df <- rbind_oa_ls(list_df)
@@ -740,7 +717,6 @@ sources2df <- function(data, verbose = TRUE,
     "flat", "alternate_titles",
     "identical", "abbreviated_title",
     "identical", "type",
-    "rbind_df", "x_concepts",
     "rbind_df", "counts_by_year",
     "identical", "works_api_url",
     "identical", "updated_date",
@@ -761,7 +737,8 @@ sources2df <- function(data, verbose = TRUE,
       fields$type,
       SIMPLIFY = FALSE
     )
-    list_df[[i]] <- sim_fields
+    topics <- process_topics(item, "count")
+    list_df[[i]] <- c(sim_fields, topics)
   }
 
   out_df <- rbind_oa_ls(list_df)

diff --git a/R/simplify.R b/R/simplify.R
@@ -1,4 +1,3 @@
-
 #' Simplify the OpenAlex authors result
 #'
 #' This function is mostly for the package's internal use,
@@ -32,13 +31,13 @@ show_authors <- function(x, simp_func = utils::head) {
   }
 
   x$top_concepts <- vapply(
-    x$x_concepts,
+    x$topics,
     function(y) {
       if (is.logical(y)) {
         return(NA_character_)
       }
-      op_level <- min(1, max(y$level))
-      paste(utils::head(y[y$level == op_level, "display_name"], 3),
+      top_subfields <- y[y$name == "subfield", ]
+      paste(utils::head(top_subfields, 3)$display_name,
         collapse = ", "
       )
     },
@@ -113,6 +112,9 @@ show_works <- function(x, simp_func = utils::head) {
 }
 
 get_auth_position <- function(y, position = "first") {
+  if (length(y) == 1 && is.na(y)) {
+    return(NA_character_)
+  }
   last <- y[y$author_position == position, "au_display_name"]
   if (length(last) == 0) {
     return(NA_character_)

diff --git a/R/utils.R b/R/utils.R
@@ -138,3 +138,33 @@ oa_apikey <- function() {
   }
   apikey
 }
+
+
+#' Process topics
+#'
+#' @param entity List. One single work or author to process.
+#' @param extra Character. Either "score" (work) or "count" (author).
+#'
+#' @return List. A list of one tibble with the processed topics.
+#' @keywords internal
+#'
+process_topics <- function(entity, extra) {
+  topics <- entity$topics
+  if (is.null(topics)) {
+    return(NULL)
+  }
+  topics_ls <- lapply(seq_along(topics), function(i) {
+    topic <- topics[[i]]
+    relev <- c(
+      # Hoist fields for the topic entity
+      list(topic = topic[c("id", "display_name")]),
+      # Keep info about other entities as-is
+      topic[vapply(topic, is.list, logical(1))]
+    )
+    relev_df <- subs_na(relev, "rbind_df")[[1]]
+    relev_df <- tibble::rownames_to_column(relev_df, "name")
+    cbind(i = i, topic[extra], relev_df)
+  })
+  topics_df <- do.call(rbind.data.frame, topics_ls)
+  list(topics = list(tibble::as_tibble(topics_df)))
+}
diff --git a/README.Rmd b/README.Rmd
@@ -278,12 +278,12 @@ And what do they publish on?
 # library(wordcloud)
 
 concept_cloud <- italy_insts |>
-  select(inst_id = id, x_concepts) |>
-  tidyr::unnest(x_concepts) |>
-  filter(level == 1) |>
-  select(display_name, score) |>
+  select(inst_id = id, topics) |>
+  tidyr::unnest(topics) |>
+  filter(name == "field") |>
+  select(display_name, count) |>
   group_by(display_name) |>
-  summarise(score = sum(score))
+  summarise(score = sqrt(sum(count)))
 
 pal <- c("black", scales::brewer_pal(palette = "Set1")(5))
 set.seed(1)
@@ -309,17 +309,26 @@ jours_all <- oa_fetch(
   verbose = TRUE
 )
 
+clean_journal_name <- function(x) {
+  x |>
+    gsub("\\(.*?\\)", "", x = _) |>
+    gsub("Journal of the|Journal of", "J.", x = _) |>
+    gsub("/.*", "", x = _)
+}
+
 jours <- jours_all |>
-  filter(!is.na(x_concepts), type != "ebook platform") |>
+  filter(type == "journal") |>
   slice_max(cited_by_count, n = 9) |>
   distinct(display_name, .keep_all = TRUE) |>
-  select(jour = display_name, x_concepts) |>
-  tidyr::unnest(x_concepts) |>
-  filter(level == 0) |>
+  select(jour = display_name, topics) |>
+  tidyr::unnest(topics) |>
+  filter(name == "field") |>
+  group_by(id, jour, display_name) |> 
+  summarise(score = (sum(count))^(1/3), .groups = "drop") |> 
   left_join(concept_abbrev, by = join_by(id, display_name)) |>
   mutate(
     abbreviation = gsub(" ", "<br>", abbreviation),
-    jour = gsub("Journal of|Journal of the", "J.", gsub("\\(.*?\\)", "", jour))
+    jour = clean_journal_name(jour),
   ) |>
   tidyr::complete(jour, abbreviation, fill = list(score = 0)) |>
   group_by(jour) |>
@@ -333,9 +342,9 @@ jours |>
   ggplot() +
   aes(fill = jour, y = score, x = abbreviation, group = jour) +
   facet_wrap(~jour) +
-  geom_hline(yintercept = c(45, 90), colour = "grey90", linewidth = 0.2) +
+  geom_hline(yintercept = c(25, 50), colour = "grey90", linewidth = 0.2) +
   geom_segment(
-    aes(x = abbreviation, xend = abbreviation, y = 0, yend = 100),
+    aes(x = abbreviation, xend = abbreviation, y = 0, yend = 55),
     color = "grey95"
   ) +
   geom_col(color = "grey20") +
@@ -350,7 +359,7 @@ jours |>
     axis.ticks.y = element_blank()
   ) +
   ggtext::geom_richtext(
-    aes(y = 120, label = label),
+    aes(y = 75, label = label),
     fill = NA, label.color = NA, size = 3
   ) +
   scale_fill_brewer(palette = "Set1", guide = "none") +