Skip to content

Commit

Permalink
rewrite of scrapenames for updated API
Browse files Browse the repository at this point in the history
  • Loading branch information
Zachary Foster committed Oct 29, 2024
1 parent 5d2b66c commit a935063
Showing 1 changed file with 95 additions and 70 deletions.
165 changes: 95 additions & 70 deletions R/scrapenames.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,61 @@
#' that this function is extremely buggy.
#'
#' @export
#' @param url Defunct. Use the `text` input for URLs as well as text strings.
#' @param file When using multipart/form-data as the content-type, a file may be
#' sent. This should be a path to your file on your machine.
#' @param text A text (or URL pointing to a text) for name detection.
#' @param engine (optional) (integer) Defunct. The API used no longer supports
#' this option.
#' @param unique Defunct. See the `unique_names` option.
#' @param unique_names (optional) (logical) If `TRUE` (the default), the output
#' returns unique names, instead of all name occurrences, without position
#' information of a name in the text.
#' @param verbatim (optional) Defunct. The API used no longer supports this
#' option.
#' @param detect_language (optional) Defunct. See the `language` option.
#' @param language The language of the text. Language value is used for
#' @param url (character) If text parameter is empty, and url is given, GNfinder will
#' process the URL and will find names in the content of its body.
#' @param text (character) Contains the text which will be checked for scientific names. If
#' this parameter is not empty, the url parameter is ignored.
#' @param format (character) Sets the output format. It can be set to: "csv" (the default),
#' "tsv", or "json".
#' @param bytes_offset (logical) This changes how the position of a detected name in text
#' is calculated. Normally a name's start and end positions are given as the
#' number of UTF-8 characters from the beginning of the text. If bytesOffset
#' flag is true, the start and end offsets are recalculated in the number of
#' bytes.
#' @param return_content (logical) If this is `TRUE`, the text used for the name
#' detection is returned back. This is especially useful if the input was not
#' a plain UTF-8 text and had to be prepared for name-finding. Then the
#' returned content can be used together with start and end fields of detected
#' name-strings to locate the strings in the text.
#' @param unique_names (logical) If this is `TRUE`, the output returns a list of
#' unique names, instead of a list of all name occurrences. Unique list of
#' names does not provide position information of a name in the text.
#' @param ambiguous_names (logical) If this is `TRUE`, strings which are simultaneously
#' scientific names and "normal" words are not filtered out from the results.
#' For example generic names like America, Cancer, Cafeteria will be returned
#' in the results.
#' @param no_bayes (logical) If this is `TRUE`, only heuristic algorithms are used for
#' name detection.
#' @param odds_details (logical) If `TRUE`, the result will contain odds of all features
#' used for calculation of NaiveBayes odds. Odds describe probability of a
#' name to be 'real'. The higher the odds, the higher the probability that a
#' detected name is not a false positive. Odds are calculated by
#' multiplication of the odds of separate features. Odds details explain how
#' the final odds value is calculated.
#' @param language (character) The language of the text. Language value is used for
#' calculation of Bayesian odds. If this parameter is not given, eng is used
#' by default. Currently only English and German languages are supported.
#' Valid values are: `eng`, `deu`, `detect`.
#' @param all_data_sources (optional) Defunct. The API used no longer supports
#' this option.
#' @param data_source_ids (optional) Defunct. See the `sources` option.
#' @param words_around (integer) Allows to see the context surrounding a name-string. This
#' sets the number of words located immediately before or after a detected
#' name. These words are then returned in the output. Default is 0, maximum
#' value is 5.
#' @param verification (logical) When this is `TRUE`, there is an additional
#' verification step for detected names. This step requires internet
#' connection and uses https://verifier.globalnames.org/api/v1 for
#' verification queries.
#' @param sources Pipe separated list of data source ids to resolve found names
#' against. See list of Data Sources
#' http://resolver.globalnames.org/data_sources
#' @param return_content (logical) return OCR'ed text. returns text string in
#' `x$meta$content` slot. Default: `FALSE`
#' @param ... Further args passed to [crul::verb-GET]
#' @param detect_language Defunct. See the `language` option.
#' @param all_data_sources Defunct. The API used no longer supports this option.
#' @param data_source_ids Defunct. See the `sources` option.
#' @param file Defunct. If you feel this is important functionality submit an
#' issue at "https://github.com/ropensci/taxize"
#' @param unique Defunct. See the `unique_names` option.
#' @param engine Defunct. The API used no longer supports this option.
#' @param verbatim Defunct. The API used no longer supports this option.
#' @author Scott Chamberlain
#' @return A list of length two, first is metadata, second is the data as a
#' data.frame.
Expand Down Expand Up @@ -74,25 +103,29 @@
#' }
scrapenames <- function(
  url = NULL,
  text = NULL,
  format = 'csv',
  bytes_offset = FALSE,
  return_content = FALSE,
  unique_names = TRUE,
  ambiguous_names = FALSE,
  no_bayes = FALSE,
  odds_details = FALSE,
  language = 'detect',
  words_around = 0,
  verification = TRUE,
  sources = NULL,
  all_matches = FALSE,
  ...,
  detect_language = NULL,
  all_data_sources = NULL,
  data_source_ids = NULL,
  file = NULL,
  unique = NULL,
  engine = NULL,
  verbatim = NULL
) {

  # Error if defunct parameters are used.
  if (!is.null(unique)) {
    stop(call. = FALSE, 'The `unique` option is defunct. See the `unique_names` option.')
  }
  if (!is.null(detect_language)) {
    stop(call. = FALSE, 'The `detect_language` option is defunct. See the `language` option.')
  }
  if (!is.null(all_data_sources)) {
    stop(call. = FALSE, 'The `all_data_sources` option is defunct. The API used no longer supports this option.')
  }
  if (!is.null(data_source_ids)) {
    stop(call. = FALSE, 'The `data_source_ids` option is defunct. See the `sources` option.')
  }
  if (!is.null(engine)) {
    stop(call. = FALSE, 'The `engine` option is defunct. The API used no longer supports this option.')
  }
  if (!is.null(verbatim)) {
    stop(call. = FALSE, 'The `verbatim` option is defunct. The API used no longer supports this option.')
  }
  if (!is.null(file)) {
    stop(call. = FALSE, 'This function can no longer submit files. If you feel this is important functionality submit an issue at "https://github.com/ropensci/taxize".')
  }

  # Validate `format` up front so the user gets a clear local error instead of
  # a confusing server-side failure.
  if (!format %in% c('csv', 'tsv', 'json')) {
    stop(call. = FALSE, 'The `format` option must be "csv", "tsv", or "json". "',
      format, '" was the value given')
  }

  # Build the query for the GNfinder `find` endpoint. Argument names are
  # converted to the camelCase names the API expects.
  base <- "http://gnrd.globalnames.org/api/v1/find"
  args <- list(
    text = text,
    url = url,
    format = format,
    bytesOffset = bytes_offset,
    returnContent = return_content,
    uniqueNames = unique_names,
    ambiguousNames = ambiguous_names,
    noBayes = no_bayes,
    oddsDetails = odds_details,
    language = language,
    wordsAround = words_around,
    verification = verification,
    sources = sources,
    allMatches = all_matches
  )
  cli <- crul::HttpClient$new(base, headers = tx_ual, opts = list(...))
  response <- cli$post(body = args, encode = "multipart")
  # Fail loudly on HTTP errors rather than parsing an error body as data.
  response$raise_for_status()

  # Parse and return results. NOTE: switch()'s default branch must be the
  # *unnamed* final element; a named `other =` element would never match.
  # `format` was validated above, so the default is unreachable in practice.
  txt <- response$parse("UTF-8")
  switch(format,
    csv = read.csv(text = txt),
    tsv = read.csv(text = txt, sep = '\t'),
    json = jsonlite::fromJSON(txt),
    stop("Invalid 'format' option.")
  )
}

0 comments on commit a935063

Please sign in to comment.