Skip to content

Commit

Permalink
Merge pull request #183 from ropensci/generalize-paging
Browse files Browse the repository at this point in the history
improve paging control
  • Loading branch information
trangdata authored Oct 22, 2023
2 parents 5134044 + 6cc1f6c commit b0a2770
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 13 deletions.
38 changes: 28 additions & 10 deletions R/oa_fetch.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten
abstract = TRUE,
endpoint = "https://api.openalex.org",
per_page = 200,
paging = NULL,
pages = NULL,
count_only = FALSE,
mailto = oa_email(),
api_key = oa_apikey(),
Expand Down Expand Up @@ -96,7 +98,9 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten

if (!is.null(options$sample) && (options$sample > per_page)) {
paging <- "page"
} else {
} else if (!is.null(options$page)){
paging <- "page"
} else if (is.null(paging)){
paging <- "cursor"
}

Expand All @@ -122,6 +126,7 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten
),
per_page = per_page,
paging = paging,
pages = pages,
count_only = count_only,
mailto = mailto,
api_key = api_key,
Expand All @@ -130,7 +135,6 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten
}

if (length(final_res[[1]]) == 0) { # || is.null(final_res[[1]][[1]]$id)
warning("No collection found!")
return(NULL)
}

Expand Down Expand Up @@ -161,8 +165,12 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten
#' Defaults to 200.
#' @param paging Character.
#' Either "cursor" for cursor paging or "page" for basic paging.
#' When used with options$sample, please set `paging = "page"`
#' to avoid duplicates.
#' When used with `options$sample` and/or `pages`,
#' paging is automatically set to basic paging (`paging = "page"`)
#' to avoid duplicates and to return the requested pages.
#' See https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging.
#' @param pages Integer vector.
#' The range of pages to return. If NULL, return all pages.
#' @param count_only Logical.
#' If TRUE, the function returns only the number of items matching the query.
#' Defaults to FALSE.
Expand Down Expand Up @@ -303,6 +311,7 @@ oa_fetch <- function(entity = if (is.null(identifier)) NULL else id_type(shorten
oa_request <- function(query_url,
per_page = 200,
paging = "cursor",
pages = NULL,
count_only = FALSE,
mailto = oa_email(),
api_key = oa_apikey(),
Expand Down Expand Up @@ -337,13 +346,22 @@ oa_request <- function(query_url,
} else {
return(res)
}
n_items <- res$meta$count
n_pages <- ceiling(n_items / per_page)

## number of pages
n_items <- res$meta$count
n_pages <- ceiling(res$meta$count / per_page)
pages <- seq.int(n_pages)
if (is.null(pages)){
pages <- seq.int(n_pages)
} else {
pages <- pages[pages <= n_pages]
n_pages <- length(pages)
n_items <- min(n_items - per_page * (utils::tail(pages, 1) - n_pages), per_page * n_pages)
message("Using basic paging...")
paging <- "page"
}

if (n_items <= 0) {
if (n_items <= 0 || n_pages <= 0) {
warning("No records found!")
return(list())
}

Expand All @@ -362,14 +380,14 @@ oa_request <- function(query_url,
query_ls[["per-page"]] <- per_page

# Activation of cursor pagination
next_page <- get_next_page(paging, 1)
data <- vector("list", length = n_pages)
res <- NULL
for (i in pages) {
if (verbose) pb$tick()
Sys.sleep(1 / 100)
next_page <- get_next_page(paging, i, res)
query_ls[[paging]] <- next_page
res <- api_request(query_url, ua, query = query_ls)
next_page <- get_next_page(paging, i + 1, res)
if (!is.null(res$results)) data[[i]] <- res$results
}

Expand Down
2 changes: 1 addition & 1 deletion R/oa_snowball.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ oa_snowball <- function(identifier = NULL,
citing$oa_input <- FALSE
cited$oa_input <- FALSE
paper$oa_input <- TRUE
nodes <- rbind(paper, citing, cited)
nodes <- rbind_oa_ls(list(paper, citing, cited))
nodes <- nodes[!duplicated(nodes$id), ]

# relationships/edges
Expand Down
12 changes: 12 additions & 0 deletions man/oa_fetch.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions man/oa_request.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 51 additions & 0 deletions tests/testthat/test-oa_fetch.R
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,54 @@ test_that("oa_fetch for identifiers works with options", {
expect_equal(dim(i), c(1, 2))
expect_equal(dim(a), c(1, 3))
})

test_that("different paging methods yield the same result", {
  # Reference result: the same query fetched without the `pages` argument,
  # i.e. every matching record.
  w0 <- oa_fetch(
    entity = "works",
    title.search = c("bibliometric analysis", "science mapping"),
    cited_by_count = ">50",
    options = list(select = "id"),
    from_publication_date = "2021-01-01",
    to_publication_date = "2021-12-31",
    verbose = TRUE
  )

  # Same query, restricted to pages 2, 4 and 5 with 10 records per page.
  page_size <- 10
  wanted_pages <- c(2, 4:5)
  w24 <- oa_fetch(
    entity = "works",
    title.search = c("bibliometric analysis", "science mapping"),
    cited_by_count = ">50",
    from_publication_date = "2021-01-01",
    to_publication_date = "2021-12-31",
    options = list(select = "id"),
    pages = wanted_pages,
    per_page = page_size,
    verbose = TRUE
  )

  # Page p covers rows (p - 1) * page_size + 1 through p * page_size of the
  # full result. Rows past the end of `w0` are dropped so that a short result
  # set never produces a descending or out-of-range index (the previous
  # `31:min(50, nrow(w0))` counted downwards when fewer than 31 rows matched).
  expected_rows <- unlist(lapply(
    wanted_pages,
    function(p) seq.int((p - 1) * page_size + 1, p * page_size)
  ))
  expected_rows <- expected_rows[expected_rows <= nrow(w0)]

  expect_equal(w0[expected_rows, ], w24)
})

test_that("pages works", {
  # Records 11-20 of page 1 when per_page = 20 should be the same records
  # as the whole of page 2 when per_page = 10.
  fetch_page <- function(page, size) {
    oa_fetch(
      search = "transformative change",
      options = list(select = c("id", "display_name", "publication_date")),
      pages = page,
      per_page = size,
      verbose = TRUE
    )
  }

  first_page_of_20 <- fetch_page(1, 20)
  second_page_of_10 <- fetch_page(2, 10)

  expect_equal(first_page_of_20[11:20, ], second_page_of_10)
})

0 comments on commit b0a2770

Please sign in to comment.