Skip to content

Commit

Permalink
Add topics as a column (#257)
Browse files Browse the repository at this point in the history
* add topics as a column (replacing concepts in most entities except Works)

* make process_topics internal

* document duh!

* fix per June's feedback
  • Loading branch information
trangdata authored Jun 25, 2024
1 parent cc8d0fe commit 35ef5cb
Show file tree
Hide file tree
Showing 15 changed files with 216 additions and 127 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# openalexR 1.4.0
* "topics" are now available in oa_fetch
* "topics" are now a valid entity in oa_fetch
* The column "topics" replaces concepts in most entities' returned dataframes
* For Works, "topics" and "concepts" are now returned as separate columns

# openalexR 1.3.1
* solved bug in au_affiliation_raw in PR#241
Expand Down
43 changes: 10 additions & 33 deletions R/oa2df.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,32 +254,10 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
names(open_access)[[1]] <- "is_oa_anywhere"
}

# Topics
process_paper_topics <- function(paper) {
topics <- paper$topics
if (is.null(topics)) {
return(NULL)
}
topics_ls <- lapply(seq_along(topics), function(i) {
topic <- topics[[i]]
relev <- c(
# Hoist fields for the topic entity
list(topic = topic[c("id", "display_name")]),
# Keep info about other entities as-is
topic[vapply(topic, is.list, logical(1))]
)
relev_df <- subs_na(relev, "rbind_df")[[1]]
relev_df <- tibble::rownames_to_column(relev_df, "name")
cbind(i = i, score = topic$score, relev_df)
})
topics_df <- do.call(rbind.data.frame, topics_ls)
list(tibble::as_tibble(topics_df))
}
topics <- process_paper_topics(paper)

topics <- process_topics(paper, "score")
out_ls <- c(
sim_fields, venue, open_access, paper_biblio,
list(author = author, ab = ab, topics = topics)
list(author = author, ab = ab), topics
)
out_ls[sapply(out_ls, is.null)] <- NULL
list_df[[i]] <- out_ls
Expand Down Expand Up @@ -363,7 +341,6 @@ authors2df <- function(data, verbose = TRUE,
"identical", "relevance_score",
"flat", "display_name_alternatives",
"rbind_df", "counts_by_year",
"rbind_df", "x_concepts",
"flat", "ids"
)

Expand Down Expand Up @@ -397,8 +374,8 @@ authors2df <- function(data, verbose = TRUE,
}
sub_affiliation$affiliations_other <- list(affiliations_other)
}

list_df[[i]] <- c(sim_fields, sub_affiliation)
topics <- process_topics(item, "count")
list_df[[i]] <- c(sim_fields, sub_affiliation, topics)
}

col_order <- c(
Expand All @@ -407,7 +384,7 @@ authors2df <- function(data, verbose = TRUE,
"affiliation_display_name", "affiliation_id", "affiliation_ror",
"affiliation_country_code", "affiliation_type", "affiliation_lineage",
"affiliations_other",
"x_concepts", "works_api_url"
"topics", "works_api_url"
)

out_df <- rbind_oa_ls(list_df)
Expand Down Expand Up @@ -473,7 +450,6 @@ institutions2df <- function(data, verbose = TRUE,
"flat", "display_name_acronyms",
"row_df", "geo",
"rbind_df", "counts_by_year",
"rbind_df", "x_concepts",
"rbind_df", "associated_institutions",
"flat", "ids"
)
Expand All @@ -498,7 +474,8 @@ institutions2df <- function(data, verbose = TRUE,
)
)
}
list_df[[i]] <- c(sim_fields, interna)
topics <- process_topics(item, "count")
list_df[[i]] <- c(sim_fields, interna, topics)
}


Expand All @@ -508,7 +485,7 @@ institutions2df <- function(data, verbose = TRUE,
"homepage_url", "image_url", "image_thumbnail_url",
"associated_institutions", "relevance_score", "works_count",
"cited_by_count", "counts_by_year",
"works_api_url", "x_concepts", "updated_date", "created_date"
"works_api_url", "topics", "updated_date", "created_date"
)

out_df <- rbind_oa_ls(list_df)
Expand Down Expand Up @@ -740,7 +717,6 @@ sources2df <- function(data, verbose = TRUE,
"flat", "alternate_titles",
"identical", "abbreviated_title",
"identical", "type",
"rbind_df", "x_concepts",
"rbind_df", "counts_by_year",
"identical", "works_api_url",
"identical", "updated_date",
Expand All @@ -761,7 +737,8 @@ sources2df <- function(data, verbose = TRUE,
fields$type,
SIMPLIFY = FALSE
)
list_df[[i]] <- sim_fields
topics <- process_topics(item, "count")
list_df[[i]] <- c(sim_fields, topics)
}

out_df <- rbind_oa_ls(list_df)
Expand Down
10 changes: 6 additions & 4 deletions R/simplify.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#' Simplify the OpenAlex authors result
#'
#' This function is mostly for the package's internal use,
Expand Down Expand Up @@ -32,13 +31,13 @@ show_authors <- function(x, simp_func = utils::head) {
}

x$top_concepts <- vapply(
x$x_concepts,
x$topics,
function(y) {
if (is.logical(y)) {
return(NA_character_)
}
op_level <- min(1, max(y$level))
paste(utils::head(y[y$level == op_level, "display_name"], 3),
top_subfields <- y[y$name == "subfield", ]
paste(utils::head(top_subfields, 3)$display_name,
collapse = ", "
)
},
Expand Down Expand Up @@ -113,6 +112,9 @@ show_works <- function(x, simp_func = utils::head) {
}

get_auth_position <- function(y, position = "first") {
if (length(y) == 1 && is.na(y)) {
return(NA_character_)
}
last <- y[y$author_position == position, "au_display_name"]
if (length(last) == 0) {
return(NA_character_)
Expand Down
30 changes: 30 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,33 @@ oa_apikey <- function() {
}
apikey
}


#' Process topics
#'
#' @param entity List. One single work or author to process.
#' @param extra Character. Either "score" (work) or "count" (author).
#'
#' @return List. A list of one tibble with the processed topics.
#' @keywords internal
#'
process_topics <- function(entity, extra) {
topics <- entity$topics
if (is.null(topics)) {
return(NULL)
}
topics_ls <- lapply(seq_along(topics), function(i) {
topic <- topics[[i]]
relev <- c(
# Hoist fields for the topic entity
list(topic = topic[c("id", "display_name")]),
# Keep info about other entities as-is
topic[vapply(topic, is.list, logical(1))]
)
relev_df <- subs_na(relev, "rbind_df")[[1]]
relev_df <- tibble::rownames_to_column(relev_df, "name")
cbind(i = i, topic[extra], relev_df)
})
topics_df <- do.call(rbind.data.frame, topics_ls)
list(topics = list(tibble::as_tibble(topics_df)))
}
35 changes: 22 additions & 13 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,12 @@ And what do they publish on?
# library(wordcloud)
concept_cloud <- italy_insts |>
select(inst_id = id, x_concepts) |>
tidyr::unnest(x_concepts) |>
filter(level == 1) |>
select(display_name, score) |>
select(inst_id = id, topics) |>
tidyr::unnest(topics) |>
filter(name == "field") |>
select(display_name, count) |>
group_by(display_name) |>
summarise(score = sum(score))
summarise(score = sqrt(sum(count)))
pal <- c("black", scales::brewer_pal(palette = "Set1")(5))
set.seed(1)
Expand All @@ -309,17 +309,26 @@ jours_all <- oa_fetch(
verbose = TRUE
)
clean_journal_name <- function(x) {
x |>
gsub("\\(.*?\\)", "", x = _) |>
gsub("Journal of the|Journal of", "J.", x = _) |>
gsub("/.*", "", x = _)
}
jours <- jours_all |>
filter(!is.na(x_concepts), type != "ebook platform") |>
filter(type == "journal") |>
slice_max(cited_by_count, n = 9) |>
distinct(display_name, .keep_all = TRUE) |>
select(jour = display_name, x_concepts) |>
tidyr::unnest(x_concepts) |>
filter(level == 0) |>
select(jour = display_name, topics) |>
tidyr::unnest(topics) |>
filter(name == "field") |>
group_by(id, jour, display_name) |>
summarise(score = (sum(count))^(1/3), .groups = "drop") |>
left_join(concept_abbrev, by = join_by(id, display_name)) |>
mutate(
abbreviation = gsub(" ", "<br>", abbreviation),
jour = gsub("Journal of|Journal of the", "J.", gsub("\\(.*?\\)", "", jour))
jour = clean_journal_name(jour),
) |>
tidyr::complete(jour, abbreviation, fill = list(score = 0)) |>
group_by(jour) |>
Expand All @@ -333,9 +342,9 @@ jours |>
ggplot() +
aes(fill = jour, y = score, x = abbreviation, group = jour) +
facet_wrap(~jour) +
geom_hline(yintercept = c(45, 90), colour = "grey90", linewidth = 0.2) +
geom_hline(yintercept = c(25, 50), colour = "grey90", linewidth = 0.2) +
geom_segment(
aes(x = abbreviation, xend = abbreviation, y = 0, yend = 100),
aes(x = abbreviation, xend = abbreviation, y = 0, yend = 55),
color = "grey95"
) +
geom_col(color = "grey20") +
Expand All @@ -350,7 +359,7 @@ jours |>
axis.ticks.y = element_blank()
) +
ggtext::geom_richtext(
aes(y = 120, label = label),
aes(y = 75, label = label),
fill = NA, label.color = NA, size = 3
) +
scale_fill_brewer(palette = "Set1", guide = "none") +
Expand Down
Loading

0 comments on commit 35ef5cb

Please sign in to comment.