-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
885 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ Encoding: UTF-8 | |
Imports: | ||
dplyr, | ||
httr, | ||
xml2, | ||
jsonlite, | ||
ggplot2, | ||
readr, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
#' Fetch All Records from Finna API
#'
#' Fetches records from the Finna API page by page by calling
#' `search_finna()` repeatedly with at most `limit_per_query` records per
#' call. After each page, an `id:[<last id> TO *]` filter is appended so the
#' next call continues from the largest id seen so far. Fetching stops when a
#' page is empty, a page is shorter than `limit_per_query`, `total_limit` is
#' reached, or a page cannot be fetched after three attempts.
#'
#' @param base_query A string specifying the base query. Defaults to "*".
#' @param base_filters A character vector of filters to apply to the query.
#'   Defaults to `c('collection:"FEN"')`.
#' @param sort A string defining the sort order of the results. Default is
#'   "main_date_str asc".
#' @param limit_per_query An integer specifying the number of records to
#'   fetch per query. Defaults to 100000.
#' @param total_limit An integer specifying the maximum number of records to
#'   fetch. Defaults to `Inf`. The returned tibble never has more rows than
#'   this.
#' @return A tibble containing all fetched records. If a page fails after all
#'   retry attempts, a warning is raised and the records fetched so far are
#'   returned.
#' @examples
#' \dontrun{
#' results <- fetch_all_records(
#'   base_query = "*",
#'   base_filters = c('collection:"FEN"'),
#'   sort = "main_date_str asc",
#'   limit_per_query = 100000,
#'   total_limit = Inf
#' )
#' print(results)
#' }
#' @export
fetch_all_records <- function(base_query = "*",
                              base_filters = c('collection:"FEN"'),
                              sort = "main_date_str asc",
                              limit_per_query = 100000,
                              total_limit = Inf) {
  max_attempts <- 3
  retry_delay_secs <- 5

  # Initialize storage for results
  all_results <- list()
  total_fetched <- 0
  last_fetched_id <- NULL

  repeat {
    # Construct filters for the current subquery
    filters <- base_filters
    if (!is.null(last_fetched_id)) {
      # NOTE(review): pages are delimited by id while results are ordered by
      # `sort`; presumably this only walks the full result set when the sort
      # order agrees with id order -- confirm against the API behaviour.
      filters <- c(filters, paste0("id:[", last_fetched_id, " TO *]"))
    }

    # Inform user about the current query
    message(sprintf(
      "Fetching records starting from id: %s",
      if (is.null(last_fetched_id)) "beginning" else last_fetched_id
    ))

    # Retry logic. The attempt counter is driven by the loop itself: an
    # assignment inside the error handler would only modify a variable local
    # to the handler function and the loop would never terminate.
    results <- NULL
    for (attempt in seq_len(max_attempts)) {
      results <- tryCatch(
        search_finna(
          query = base_query,
          filters = filters,
          sort = sort,
          limit = limit_per_query
        ),
        error = function(e) {
          warning(
            sprintf("Attempt %d failed: %s", attempt, conditionMessage(e)),
            call. = FALSE
          )
          NULL
        }
      )
      if (!is.null(results)) {
        break
      }
      if (attempt < max_attempts) {
        Sys.sleep(retry_delay_secs) # Delay before retrying
      }
    }

    # If results are still NULL after retries, stop fetching
    if (is.null(results)) {
      warning(
        "Failed to fetch records after multiple attempts. Stopping.",
        call. = FALSE
      )
      break
    }

    # Size of the page as returned by the API; used for the end-of-data test
    # below, before any boundary record is dropped.
    num_returned <- nrow(results)
    if (num_returned == 0) {
      message("No more records found.")
      break
    }

    page_max_id <- max(results$id, na.rm = TRUE)

    # The range filter is inclusive, so the record that set the previous
    # boundary can be returned again; drop it to avoid duplicate rows.
    if (!is.null(last_fetched_id) && "id" %in% names(results)) {
      is_boundary <- !is.na(results$id) & results$id == last_fetched_id
      results <- results[!is_boundary, , drop = FALSE]
    }

    # Nothing new on this page: stop rather than requesting it again forever
    num_fetched <- nrow(results)
    if (num_fetched == 0) {
      message("No more records found.")
      break
    }

    # Add the fetched records to the main storage and update the count
    all_results[[length(all_results) + 1L]] <- results
    total_fetched <- total_fetched + num_fetched

    # Stop if we've hit the total limit
    if (total_fetched >= total_limit) {
      message("Total limit reached.")
      break
    }

    # Update the last fetched id for the next query
    last_fetched_id <- page_max_id

    # Stop if fewer than limit_per_query records were returned
    if (num_returned < limit_per_query) {
      message("No more records to fetch.")
      break
    }
  }

  # Combine all results into a single tibble
  combined_results <- dplyr::bind_rows(all_results)

  # The last page may overshoot total_limit; trim to the documented maximum
  if (is.finite(total_limit) && nrow(combined_results) > total_limit) {
    keep_rows <- seq_len(max(0, floor(total_limit)))
    combined_results <- combined_results[keep_rows, , drop = FALSE]
  }

  combined_results
}
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# fennica_all_records <- function(base_query = "*", | ||
# base_filters = c('collection:"FEN"'), | ||
# sort = "main_date_str asc", | ||
# limit_per_query = 100000, | ||
# total_limit = Inf) { | ||
# | ||
# all_results <- list() # List to store results | ||
# total_fetched <- 0 # Total records fetched so far | ||
# continue_fetching <- TRUE | ||
# last_fetched_date <- NULL | ||
# | ||
# while (continue_fetching) { | ||
# # Construct filters for subquery | ||
# filters <- base_filters | ||
# | ||
# # Add a date range filter if this isn't the first query | ||
# if (!is.null(last_fetched_date)) { | ||
# filters <- c(filters, paste0('search_daterange_mv:"[', last_fetched_date, ' TO 9999]"')) | ||
# } | ||
# | ||
# # Fetch records using the subquery | ||
# message(sprintf("Fetching records starting from date: %s", | ||
# ifelse(is.null(last_fetched_date), "beginning", last_fetched_date))) | ||
# results <- search_finna(query = base_query, filters = filters, sort = sort, limit = limit_per_query) | ||
# | ||
# # Add fetched results to the main list | ||
# all_results <- c(all_results, list(results)) | ||
# | ||
# # Update total fetched count | ||
# num_fetched <- nrow(results) | ||
# total_fetched <- total_fetched + num_fetched | ||
# | ||
# # Check if we've fetched all results or hit the total limit | ||
# if (num_fetched < limit_per_query || total_fetched >= total_limit) { | ||
# continue_fetching <- FALSE | ||
# } else { | ||
# # Update the last fetched date for the next query | ||
# last_fetched_date <- max(results$Year, na.rm = TRUE) # Update to the latest year fetched | ||
# } | ||
# } | ||
# | ||
# # Combine all results into a single tibble | ||
# combined_results <- dplyr::bind_rows(all_results) | ||
# return(combined_results) | ||
# } | ||
# | ||
# # Example usage | ||
# final_results <- fennica_all_records( | ||
# base_query = "*", | ||
# base_filters = c('collection:"FEN"'), | ||
# sort = "main_date_str asc", | ||
# limit_per_query = 100000, | ||
# total_limit = Inf | ||
# ) | ||
# | ||
# # View results | ||
# print(final_results) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#' Fetch Finna Collection Data with Flexible Query
#'
#' Sends a search request to the Finna API (`https://api.finna.fi/v1/search`)
#' asking for one facet, and returns that facet's entries as a tidy tibble.
#' The total result count and the elapsed time are reported via `message()`.
#'
#' @param query The query string for filtering results. Defaults to NULL,
#'   which fetches data without a specific search term.
#' @param limit Maximum number of results to fetch. Defaults to 0.
#' @param facets Facet to retrieve, defaults to "building". The returned
#'   tibble is built from this facet's entries in the response.
#' @param lng Language for results, defaults to "fi".
#' @param prettyPrint Logical, whether to pretty-print JSON responses.
#' @return A tibble with columns `value`, `translated`, `count` and `href`,
#'   one row per facet entry. Fields missing from an entry are `NA`. An empty
#'   tibble is returned when the response contains no data for the facet.
#' @examples
#' \dontrun{
#' fetch_finna_collection(query = "record_format:ead", limit = 0)
#' fetch_finna_collection() # Fetches data with no specific query
#' }
#' @export
fetch_finna_collection <- function(query = NULL, limit = 0,
                                   facets = "building", lng = "fi",
                                   prettyPrint = TRUE) {
  # Fail with a clear error instead of installing packages as a side effect
  for (pkg in c("httr", "jsonlite", "tibble")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(
        "Package '", pkg, "' is required but is not installed.",
        call. = FALSE
      )
    }
  }

  # Start the timer
  start_time <- Sys.time()

  base_url <- "https://api.finna.fi/v1/search"

  # Build query parameters
  query_params <- list(
    type = "AllFields",
    limit = limit,
    `facet[]` = facets,
    lng = lng,
    prettyPrint = prettyPrint
  )

  # Add query parameter only if it is not NULL
  if (!is.null(query)) {
    query_params$lookfor <- query
  }

  # Perform the GET request
  response <- httr::GET(base_url, query = query_params)

  # Check response status
  if (httr::status_code(response) != 200) {
    stop("API request failed. Status code: ", httr::status_code(response))
  }

  # Parse the response JSON
  json_data <- httr::content(response, as = "parsed")

  # Extract the result count; keep the summary message well-formed even when
  # the field is absent from the response
  result_count <- json_data$resultCount
  if (is.null(result_count)) {
    result_count <- NA_integer_
  }

  # Read the entries of the facet that was actually requested, rather than
  # always looking under "building"
  facet_data <- if (length(facets) > 0) {
    json_data$facets[[facets[[1]]]]
  } else {
    NULL
  }

  # Type-stable field extraction: a missing field becomes NA
  facet_field <- function(field, convert, template) {
    vapply(
      facet_data,
      function(entry) {
        value <- entry[[field]]
        if (is.null(value)) template else convert(value)[[1]]
      },
      template,
      USE.NAMES = FALSE
    )
  }

  buildings_df <- if (!is.null(facet_data)) {
    tibble::tibble(
      value = facet_field("value", as.character, NA_character_),
      translated = facet_field("translated", as.character, NA_character_),
      count = facet_field("count", as.integer, NA_integer_),
      href = facet_field("href", as.character, NA_character_)
    )
  } else {
    tibble::tibble()
  }

  # End the timer
  end_time <- Sys.time()
  time_taken <- as.numeric(difftime(end_time, start_time, units = "secs"))

  # Print summary messages
  message(sprintf("Total results found: %d", result_count))
  message(sprintf("Data fetching completed in %.2f seconds.", time_taken))

  # Return the tibble
  buildings_df
}
Oops, something went wrong.