Skip to content

Commit

Permalink
adding new functionalities
Browse files Browse the repository at this point in the history
  • Loading branch information
ake123 committed Nov 27, 2024
1 parent 2c377e7 commit b7754e3
Show file tree
Hide file tree
Showing 11 changed files with 885 additions and 1 deletion.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Encoding: UTF-8
Imports:
dplyr,
httr,
xml2,
jsonlite,
ggplot2,
readr,
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ export(analyze_metadata)
export(analyze_trends_over_time_binned)
export(check_api_access)
export(enrich_author_name)
export(fetch_all_records)
export(fetch_finna_collection)
export(finna_cite)
export(finna_interactive)
export(get_finna_records)
export(harvest_oai_pmh)
export(load_offline_data)
export(refine_metadata)
export(save_for_offline)
Expand All @@ -26,8 +29,11 @@ export(visualize_year_distribution)
export(visualize_year_distribution_line)
import(dplyr)
import(ggplot2)
import(httr)
import(progress)
import(rlang)
import(stringr)
import(tibble)
import(tidyr)
import(tm)
import(wordcloud2)
Expand Down
190 changes: 190 additions & 0 deletions R/fennica_all_records.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
#' Fetch All Records from Finna API
#'
#' Repeatedly calls `search_finna()` in pages of `limit_per_query` records.
#' After each page, the largest `id` on that page is used as the lower bound
#' (`id:[<id> TO *]`) of the next page. Fetching stops when a page is empty or
#' shorter than `limit_per_query`, when `total_limit` records have been
#' collected, or when a page cannot be fetched after three attempts.
#'
#' @param base_query A string specifying the base query. Defaults to "*".
#' @param base_filters A character vector of filters to apply to the query.
#'   Defaults to `c('collection:"FEN"')`.
#' @param sort A string defining the sort order of the results. Default is
#'   "main_date_str asc".
#' @param limit_per_query An integer specifying the number of records to
#'   request per query. Defaults to 100000.
#' @param total_limit An integer specifying the maximum number of records to
#'   return. Defaults to `Inf`.
#' @return A tibble containing all fetched records, with at most `total_limit`
#'   rows. If a page cannot be fetched, the records collected so far are
#'   returned and a warning is raised.
#' @examples
#' \dontrun{
#' results <- fetch_all_records(
#'   base_query = "*",
#'   base_filters = c('collection:"FEN"'),
#'   sort = "main_date_str asc",
#'   limit_per_query = 100000,
#'   total_limit = Inf
#' )
#' print(results)
#' }
#' @export
fetch_all_records <- function(base_query = "*",
                              base_filters = c('collection:"FEN"'),
                              sort = "main_date_str asc",
                              limit_per_query = 100000,
                              total_limit = Inf) {
  max_attempts <- 3
  retry_delay_secs <- 5

  # Work with a whole number of records so "remaining" below is always >= 1
  # while the loop is running (floor(Inf) stays Inf).
  total_limit <- floor(total_limit)

  all_results <- list()
  total_fetched <- 0
  last_fetched_id <- NULL

  while (total_fetched < total_limit) {
    # Construct filters for the current subquery
    filters <- base_filters
    if (!is.null(last_fetched_id)) {
      filters <- c(filters, paste0("id:[", last_fetched_id, " TO *]"))
    }

    message(sprintf(
      "Fetching records starting from id: %s",
      if (is.null(last_fetched_id)) "beginning" else last_fetched_id
    ))

    # Retry loop. The attempt counter is the loop variable itself: assigning
    # to it inside the error handler would only modify a copy local to the
    # handler and the loop would never terminate on a persistent error.
    results <- NULL
    for (attempt in seq_len(max_attempts)) {
      results <- tryCatch(
        search_finna(
          query = base_query,
          filters = filters,
          sort = sort,
          limit = limit_per_query
        ),
        error = function(e) {
          warning(
            sprintf("Attempt %d failed: %s", attempt, conditionMessage(e)),
            call. = FALSE
          )
          NULL
        }
      )
      if (!is.null(results)) {
        break
      }
      if (attempt < max_attempts) {
        Sys.sleep(retry_delay_secs) # Delay before retrying
      }
    }

    # Give up on this and all later pages, but keep what was collected
    if (is.null(results)) {
      warning(
        "Failed to fetch records after multiple attempts. Stopping.",
        call. = FALSE
      )
      break
    }

    # Size of the page as returned by the API, before any local trimming;
    # this is what tells us whether another page may exist.
    num_returned <- nrow(results)
    if (num_returned == 0) {
      message("No more records found.")
      break
    }

    has_id <- "id" %in% names(results)
    if (has_id) {
      page_max_id <- max(results$id, na.rm = TRUE)
      # The range filter above is written as inclusive, so the record whose
      # id equals the previous page's maximum would be returned again.
      if (!is.null(last_fetched_id)) {
        results <- results[!(results$id %in% last_fetched_id), , drop = FALSE]
      }
      if (nrow(results) == 0) {
        message("No more records found.")
        break
      }
    }

    # Never return more than total_limit records
    remaining <- total_limit - total_fetched
    if (nrow(results) > remaining) {
      results <- results[seq_len(remaining), , drop = FALSE]
    }

    all_results[[length(all_results) + 1L]] <- results
    total_fetched <- total_fetched + nrow(results)

    if (total_fetched >= total_limit) {
      message("Total limit reached.")
      break
    }

    if (!has_id) {
      warning(
        "Results have no 'id' column; cannot request further pages.",
        call. = FALSE
      )
      break
    }

    # NOTE(review): paging by max(id) presumably assumes pages arrive in
    # ascending id order; with a different `sort` (including the default
    # "main_date_str asc") records with a smaller id that were not on this
    # page would be excluded from later pages -- confirm against
    # search_finna() / the Finna API.
    last_fetched_id <- page_max_id

    # A short page means the server has nothing more to give
    if (num_returned < limit_per_query) {
      message("No more records to fetch.")
      break
    }
  }

  # Combine all pages into a single tibble
  dplyr::bind_rows(all_results)
}


























# fennica_all_records <- function(base_query = "*",
# base_filters = c('collection:"FEN"'),
# sort = "main_date_str asc",
# limit_per_query = 100000,
# total_limit = Inf) {
#
# all_results <- list() # List to store results
# total_fetched <- 0 # Total records fetched so far
# continue_fetching <- TRUE
# last_fetched_date <- NULL
#
# while (continue_fetching) {
# # Construct filters for subquery
# filters <- base_filters
#
# # Add a date range filter if this isn't the first query
# if (!is.null(last_fetched_date)) {
# filters <- c(filters, paste0('search_daterange_mv:"[', last_fetched_date, ' TO 9999]"'))
# }
#
# # Fetch records using the subquery
# message(sprintf("Fetching records starting from date: %s",
# ifelse(is.null(last_fetched_date), "beginning", last_fetched_date)))
# results <- search_finna(query = base_query, filters = filters, sort = sort, limit = limit_per_query)
#
# # Add fetched results to the main list
# all_results <- c(all_results, list(results))
#
# # Update total fetched count
# num_fetched <- nrow(results)
# total_fetched <- total_fetched + num_fetched
#
# # Check if we've fetched all results or hit the total limit
# if (num_fetched < limit_per_query || total_fetched >= total_limit) {
# continue_fetching <- FALSE
# } else {
# # Update the last fetched date for the next query
# last_fetched_date <- max(results$Year, na.rm = TRUE) # Update to the latest year fetched
# }
# }
#
# # Combine all results into a single tibble
# combined_results <- dplyr::bind_rows(all_results)
# return(combined_results)
# }
#
# # Example usage
# final_results <- fennica_all_records(
# base_query = "*",
# base_filters = c('collection:"FEN"'),
# sort = "main_date_str asc",
# limit_per_query = 100000,
# total_limit = Inf
# )
#
# # View results
# print(final_results)
77 changes: 77 additions & 0 deletions R/fetch_finna_collection.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#' Fetch Finna Collection Data with Flexible Query
#'
#' Sends a search request to the Finna API (`https://api.finna.fi/v1/search`)
#' and returns the values of the requested facet as a tidy tibble. The total
#' result count and the elapsed time are reported via `message()`.
#'
#' @param query The query string for filtering results. Defaults to NULL,
#'   which fetches data without a specific search term.
#' @param limit Maximum number of results to fetch. Defaults to 0.
#' @param facets Facet to retrieve, defaults to "building". Only the first
#'   element is used when extracting facet data from the response.
#' @param lng Language for results, defaults to "fi".
#' @param prettyPrint Logical, whether to pretty-print JSON responses.
#' @return A tibble with one row per facet value and the columns `value`,
#'   `translated`, `count` and `href`. An empty tibble is returned when the
#'   response contains no data for the requested facet.
#' @examples
#' \dontrun{
#' fetch_finna_collection(query = "record_format:ead", limit = 0)
#' fetch_finna_collection() # Fetches data with no specific query
#' }
#' @export
fetch_finna_collection <- function(query = NULL,
                                   limit = 0,
                                   facets = "building",
                                   lng = "fi",
                                   prettyPrint = TRUE) {
  # Fail with a clear message instead of installing packages behind the
  # user's back; installing at runtime modifies the user's library.
  for (pkg in c("httr", "jsonlite", "tibble")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required but not installed.", call. = FALSE)
    }
  }

  # Type-stable field extractors: a missing field becomes NA rather than
  # turning the whole column into a list.
  field_chr <- function(entries, field) {
    vapply(entries, function(entry) {
      value <- entry[[field]]
      if (is.null(value)) NA_character_ else as.character(value)[1]
    }, character(1), USE.NAMES = FALSE)
  }
  field_int <- function(entries, field) {
    vapply(entries, function(entry) {
      value <- entry[[field]]
      if (is.null(value)) NA_integer_ else as.integer(value)[1]
    }, integer(1), USE.NAMES = FALSE)
  }

  # Start the timer
  start_time <- Sys.time()

  base_url <- "https://api.finna.fi/v1/search"

  # Build query parameters
  query_params <- list(
    type = "AllFields",
    limit = limit,
    `facet[]` = facets,
    lng = lng,
    prettyPrint = prettyPrint
  )

  # Add query parameter only if it is not NULL
  if (!is.null(query)) {
    query_params$lookfor <- query
  }

  # Perform the GET request
  response <- httr::GET(base_url, query = query_params)

  # Check response status
  if (httr::status_code(response) != 200) {
    stop(
      "API request failed. Status code: ", httr::status_code(response),
      call. = FALSE
    )
  }

  # Parse the response JSON
  json_data <- httr::content(response, as = "parsed")

  # Extract the result count
  result_count <- json_data$resultCount

  # Read the facet that was actually requested rather than always "building"
  facet_name <- if (length(facets) > 0) facets[[1]] else NULL
  facet_data <- if (!is.null(facet_name)) json_data$facets[[facet_name]] else NULL

  facet_df <- if (!is.null(facet_data)) {
    tibble::tibble(
      value = field_chr(facet_data, "value"),
      translated = field_chr(facet_data, "translated"),
      count = field_int(facet_data, "count"),
      href = field_chr(facet_data, "href")
    )
  } else {
    tibble::tibble()
  }

  # End the timer
  end_time <- Sys.time()
  time_taken <- as.numeric(difftime(end_time, start_time, units = "secs"))

  # Print summary messages (skip the count if the response did not carry one)
  if (!is.null(result_count)) {
    message(sprintf("Total results found: %d", result_count))
  }
  message(sprintf("Data fetching completed in %.2f seconds.", time_taken))

  # Return the tibble
  facet_df
}
Loading

0 comments on commit b7754e3

Please sign in to comment.