Updates to journal network viewer
cboulanger committed Nov 19, 2023
1 parent df1a8b6 commit 4d6716a
Showing 12 changed files with 4,838 additions and 3,893 deletions.
Binary file modified docs/jls-journal-network-communities-graph-openalex.png
Binary file modified docs/jls-journal-network-communities-graph-wos.png
1,062 changes: 586 additions & 476 deletions docs/jls-journal-network-communities-table-openalex.html

885 changes: 447 additions & 438 deletions docs/jls-journal-network-communities-table-wos.html

2,848 changes: 1,628 additions & 1,220 deletions docs/jls-journal-network-journals-openalex.html

2,022 changes: 1,156 additions & 866 deletions docs/jls-journal-network-journals-wos.html

387 changes: 176 additions & 211 deletions docs/jls-journal-network-openalex.html

386 changes: 176 additions & 210 deletions docs/jls-journal-network-wos.html

299 changes: 22 additions & 277 deletions jls-network.R
@@ -1,283 +1,28 @@
 # see also https://www.r-bloggers.com/2019/06/interactive-network-visualization-with-r/
 
-library(igraph)
-library(tidyr)
-library(ggplot2)
-library(ggExtra)
-library(data.table)
-library(dotenv)
-library(visNetwork)
-library(igraph)
-library(stringr)
-library(gridExtra)
-library(htmlTable)
-library(htmltools)
-library(dplyr)
-library(htmltools)
-
-source('lib/query-open-ai-api.R')
-
-# configuration
-min_year <- 0
-min_all_years <- 5
-louvain_cluster_resolution <- 1.1
-ignore_journals <- c('sustainability',
-                     'choice reviews online',
-                     'repec: research papers in economics',
-                     "doaj (doaj: directory of open access journals)")
-
-# data source
-journal_id <- "jls"
-data_vendor <- "wos"
-data_vendor <- "openalex"
-data_file <- paste0("data/", journal_id, "-journal-network-", data_vendor, ".csv")
-result_file <- paste0("docs/", journal_id, "-journal-network-", data_vendor, ".html")
-
-title_network <- paste(journal_id, "journal network with cluster resolution of", louvain_cluster_resolution, "(Source:", paste0(data_vendor, ")"))
-
-# dataframe with columns source_title1, source_title2, count_citations
-df <- read.csv(data_file, encoding = "UTF-8") |>
-  # remove journals that have a high citation rate but are not relevant for our question
-  filter(!(str_to_lower(source_title_citing) %in% ignore_journals) &
-           !(str_to_lower(source_title_cited) %in% ignore_journals)) |>
-  # remove self-citations
-  filter(source_title_citing != source_title_cited) |>
-  # remove journals which do not meet a minimum of citations per year
-  filter(count_citations >= min_year)
-
-# Create list of nodes with id and label
-all_nodes <- data.frame(label = unique(c(df$source_title_citing, df$source_title_cited))) |>
-  mutate(id = seq_along(label))
-
-# add a column with abbreviated labels
-stop_words <- c("&", "and", "the", "of", "in", "a", "der", "die", "das")
-abbreviate_label <- function(string) {
-  words <- strsplit(string, "\\s+")[[1]]
-  words <- words[!tolower(words) %in% stop_words]
-  initials <- substr(words, 1, 1)
-  tolower(paste0(initials, collapse = ""))
+source('lib/generate-journal-network-viewer.R')
+
+run <- function(journal_id, data_vendor) {
+  generate_journal_network_viewer(
+    journal_id = journal_id,
+    data_vendor = data_vendor,
+    data_file = paste0("data/", journal_id, "-journal-network-", data_vendor, ".csv"),
+    result_file = paste0("docs/", journal_id, "-journal-network-", data_vendor, ".html"),
+    min_year = 0,
+    min_all_years = 10,
+    louvain_cluster_resolution = 1.1,
+    label_groups_with_chatgpt = FALSE,
+    ignore_journals = c('sustainability',
+                        'choice reviews online',
+                        'repec: research papers in economics',
+                        "doaj (doaj: directory of open access journals)")
+
+  )
+  print(paste("Finished generating the", journal_id, "network using data from", data_vendor))
 }
-all_nodes <- all_nodes |>
-  mutate(abbr = sapply(label, abbreviate_label))
-
-# add the node ids to the citation data
-df <- df |>
-  inner_join(all_nodes, by = c("source_title_citing" = "label")) |>
-  rename(from = id) |>
-  inner_join(all_nodes, by = c("source_title_cited" = "label")) |>
-  rename(to = id)
-
-# Calculate total citations made by each journal per year and compute normalized weight
-citing_totals_per_year <- df |>
-  group_by(from, citation_year) |>
-  summarise(total_citing_year = sum(count_citations)) |>
-  ungroup()
-df <- df |>
-  left_join(citing_totals_per_year, by = c("from", "citation_year")) |>
-  mutate(normalized_weight = count_citations / total_citing_year)
-
-# Calculate total_citing
-total_citing_df <- all_nodes |>
-  left_join(df, by = c("id" = "from")) |>
-  group_by(id) |>
-  summarize(total_citing = sum(count_citations, na.rm = TRUE)) |>
-  ungroup()
-
-# Calculate total_cited
-total_cited_df <- all_nodes |>
-  left_join(df, by = c("id" = "to")) |>
-  group_by(id) |>
-  summarize(total_cited = sum(count_citations, na.rm = TRUE)) |>
-  ungroup()
-
-# Combine everything
-all_nodes <- all_nodes |>
-  left_join(total_citing_df, by = "id") |>
-  left_join(total_cited_df, by = "id") |>
-  select(id, label, abbr, total_citing, total_cited) |>
-  arrange(id)
-
-# save as a html table
-file_journal_table <- paste0(journal_id, "-journal-network-journals-", data_vendor, ".html")
-all_nodes |>
-  select(label, total_citing, total_cited) |>
-  htmlTable::addHtmlTableStyle(align = "lrr") |>
-  htmlTable::htmlTable(caption = paste0("<h1>Journals in ", journal_id, " network (Source: ", data_vendor, ")</h1>")) |>
-  htmltools::save_html(paste0("docs/", file_journal_table))
-
-# Create list of edges
-directed_edges <- df |>
-  select(from, to, count_citations, normalized_weight) |>
-  group_by(from, to) |>
-  summarise(count = sum(count_citations), median_weight = median(normalized_weight)) |>
-  ungroup() |>
-  filter(count > min_all_years) |>
-  mutate(from = as.integer(from),
-         to = as.integer(to),
-         width = pmin(median_weight * 3, 10),
-         label = as.character(count),
-         weight = median_weight)
-
-# Convert into undirected graph combining citations in both directions
-edges <- directed_edges |>
-  # Calculate count_from_to and count_to_from
-  mutate(
-    count_from_to = as.integer(ifelse(from < to, count, 0)),
-    count_to_from = as.integer(ifelse(from > to, count, 0))
-  ) |>
-  # create a unique edge_id for both directions
-  rowwise() |>
-  mutate(edge_id = paste(pmin(from, to), pmax(from, to), sep = "-")) |>
-  ungroup() |>
-  # summarize the values of the directed edges
-  group_by(edge_id) |>
-  summarise(
-    from = first(from),
-    to = first(to),
-    total_count = sum(count),
-    combined_weight = median(weight, na.rm = TRUE),
-    count_from_to = sum(count_from_to),
-    count_to_from = sum(count_to_from)
-  ) |>
-  ungroup() |>
-  left_join(directed_edges, by = c("from", "to")) |>
-  group_by(edge_id) |>
-  # keep one row per undirected edge after the join
-  summarise(
-    from = first(from),
-    to = first(to),
-    total_count = first(total_count),
-    combined_weight = first(combined_weight),
-    count_from_to = first(count_from_to),
-    count_to_from = first(count_to_from)
-  ) |>
-  ungroup() |>
-  # add journal names back as acronyms
-  inner_join(all_nodes |> select(id, abbr), by = c("from" = "id")) |>
-  rename(abbr_from = abbr) |>
-  inner_join(all_nodes |> select(id, abbr), by = c("to" = "id")) |>
-  rename(abbr_to = abbr) |>
-  # Final mutation to create label and size
-  mutate(
-    title = paste(abbr_from, "->", abbr_to, ":", count_from_to, "<br>",
-                  abbr_to, "->", abbr_from, ":", count_to_from),
-    weight = combined_weight,
-    size = combined_weight * 10,
-    value = total_count
-  ) |>
-  select(from, to, title, weight, size, value)
-
-# Filter out nodes without edges
-edges_node_ids <- unique(c(edges$from, edges$to)) |> sort()
-filtered_nodes <- all_nodes |>
-  filter(id %in% edges_node_ids) |>
-  select(id, label, abbr, total_citing, total_cited)
-
-# Create undirected graph
-graph <- graph_from_data_frame(edges, vertices = filtered_nodes, directed = F)
-
-# Remove subgraphs that are unconnected to the main graph
-comps <- components(graph)
-largest_comp_id <- which.max(comps$csize)
-graph <- induced_subgraph(graph, which(comps$membership == largest_comp_id))
-
-# Louvain community detection
-cluster <- cluster_louvain(graph, resolution = louvain_cluster_resolution)
-cluster_groups <- as.integer(membership(cluster))
-cluster_names <- as.integer(cluster$names)
-cluster_df <- tibble(id = cluster_names, group = cluster_groups)
-
-# create a purely illustrative plot of the communities:
-file_communities <- paste0("jls-journal-network-communities-graph-", data_vendor, ".png")
-png(paste0("docs/", file_communities), width = 800, height = 600)
-plot(cluster, graph, vertex.label = NA, vertex.size = 5, edge.color = NA, edge.label = NA)
-dev.off()
-
-# create a dataframe containing the journal names
-journal_communities <- communities(cluster) |>
-  lapply(\(ids) filtered_nodes$label[as.integer(ids)]) |>
-  lapply(\(x) iconv(x, to = "UTF-8"))
-
-# Determine the size of the largest community
-max_length <- max(lengths(journal_communities))
-
-# create a dataframe table for viewing
-df_communities <- journal_communities |>
-  lapply(\(community) c(community, rep(NA, max_length - length(community)))) |>
-  as.data.frame(stringsAsFactors = FALSE)
-
-# if we have an OpenAI key, rename the groups using GPT
-if (Sys.getenv("OPENAI_API_KEY") != "") {
-  df_communities <- rename_headers(df_communities)
-}
-
-# groups info
-groups_headings <- colnames(df_communities)
-groups_df <- data.frame(id = seq_along(groups_headings), group_name = groups_headings)
-
-# save as a html table
-file_group_table <- paste0(journal_id, "-journal-network-communities-table-", data_vendor, ".html")
-htmlTable::htmlTable(df_communities, caption = paste0("<h1>", title_network, "</h1>")) |>
-  htmltools::save_html(paste0("docs/", file_group_table))
-
-# re-convert igraph to vis network data
-graph_data <- toVisNetworkData(graph)
-
-# extract nodes from igraph and add community
-vis_nodes <- graph_data$nodes |>
-  mutate(id = as.integer(id)) |>
-  select(id) |>
-  inner_join(filtered_nodes, by = "id") |>
-  inner_join(cluster_df, by = "id") |>
-  inner_join(groups_df, by = c("group" = "id")) |>
-  mutate(label = paste0(label, "\n(", abbr, ")"),
-         title = paste0(paste0("<div class='node-title-label'>", label, "</div>"),
-                        "Group: ", group_name, "<br>",
-                        total_citing, " total citing references", "<br>",
-                        total_cited, " total cited references")) |>
-  select(id, label, group, title)
-
-# Extract edges
-vis_edges <- graph_data$edges
+
+# main
+run("jls","openalex")
+run("jls", "wos")
 
-# create visualization
-vn <- visNetwork(vis_nodes, vis_edges, width = "100%", height = "1000px") |>
-  visIgraphLayout() |>
-  visNodes(
-    shape = "dot",
-    size = 30,
-    color = list(
-      background = "#0085AF",
-      border = "#013848",
-      highlight = "#FF8000"
-    ),
-    shadow = list(enabled = TRUE, size = 10)
-  ) |>
-  visEdges(
-    shadow = FALSE,
-    smooth = list(type = "continuous", roundness = 0.7),
-    font = list(align = "middle"),
-    color = list(color = "#0085AF", highlight = "#C62F4B")
-  ) |>
-  visLayout(randomSeed = 11) |>
-  visPhysics(solver = "forceAtlas2Based",
-             forceAtlas2Based = list(gravitationalConstant = -5000)) |>
-  visInteraction(hideEdgesOnZoom = T, hideNodesOnDrag = T) |>
-  visOptions(highlightNearest = list(enabled = T, degree = 1, hover = T)) |>
-  visSave('cache/last-vis-graph.html', selfcontained = T)
-
-# add searchbox code to HTML output
-html <- readLines('cache/last-vis-graph.html') |> paste(collapse = "\n")
-searchbox_html <- readLines("lib/vis-network-searchbox.html") |> paste(collapse = "\n")
-html <- gsub("(<body.*?>)", paste0("\\1\n", searchbox_html), html)
-
-# update variables placeholders
-groups_json <- toJSON(setNames(journal_communities, groups_headings), pretty = TRUE)
-html <- html |>
-  str_replace("\\$title", title_network) |>
-  str_replace("\\$groups", groups_json) |>
-  str_replace("<title>visNetwork</title>", paste0("<title>", title_network, "</title>")) |>
-  str_replace("\\$file_journal_table", file_journal_table) |>
-  str_replace("\\$file_group_table", file_group_table) |>
-  str_replace("\\$file_communities", file_communities)
-
-writeLines(html, result_file)
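
Note: the new lib/generate-journal-network-viewer.R is not among the rendered diffs above, so the following is only a hypothetical sketch of what it presumably contains. The signature is taken from the call inside run(); the body condenses the pipeline the deleted top-level code performed (read and filter the citation CSV, build the undirected journal graph, run Louvain community detection, write the visNetwork viewer). The real implementation may differ, and the table/PNG outputs and GPT-based group labelling are omitted here.

# Hypothetical skeleton of lib/generate-journal-network-viewer.R (assumption, not shown in this commit)
library(dplyr)
library(stringr)
library(igraph)
library(visNetwork)

generate_journal_network_viewer <- function(journal_id, data_vendor,
                                            data_file, result_file,
                                            min_year = 0, min_all_years = 5,
                                            louvain_cluster_resolution = 1.1,
                                            label_groups_with_chatgpt = FALSE,
                                            ignore_journals = character(0)) {
  # read the citation edge list and apply the same filters as the old script
  df <- read.csv(data_file, encoding = "UTF-8") |>
    filter(!(str_to_lower(source_title_citing) %in% ignore_journals),
           !(str_to_lower(source_title_cited) %in% ignore_journals),
           source_title_citing != source_title_cited,
           count_citations >= min_year)

  # aggregate directed citations per journal pair and drop weak edges
  edges <- df |>
    group_by(from = source_title_citing, to = source_title_cited) |>
    summarise(value = sum(count_citations), .groups = "drop") |>
    filter(value > min_all_years)

  # build the undirected graph; merge the two directions of each pair
  # (the old script did this with an explicit edge_id summarise)
  graph <- graph_from_data_frame(edges, directed = FALSE) |>
    simplify(edge.attr.comb = list(value = "sum"))

  # keep only the largest connected component
  comps <- components(graph)
  graph <- induced_subgraph(graph, which(comps$membership == which.max(comps$csize)))

  # Louvain community detection (GPT-based labelling of groups omitted in this sketch)
  cluster <- cluster_louvain(graph, resolution = louvain_cluster_resolution)

  # render the interactive viewer and write it to result_file
  data <- toVisNetworkData(graph)
  data$nodes$group <- as.integer(membership(cluster)[data$nodes$id])
  visNetwork(data$nodes, data$edges, width = "100%", height = "1000px") |>
    visIgraphLayout() |>
    visOptions(highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE)) |>
    visSave(result_file, selfcontained = TRUE)
}

Under these assumptions, the two run() calls in the diff above would regenerate docs/jls-journal-network-openalex.html and docs/jls-journal-network-wos.html, matching the files touched by this commit.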