From 602f6be3b5a2ca15bc2ba16515e2c15d3e059ce2 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 27 Oct 2022 11:32:57 -0500 Subject: [PATCH 01/13] upgrade to Ruby 3.0.4 --- .ruby-version | 2 +- Gemfile.lock | 9 +++++++-- datura.gemspec | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.ruby-version b/.ruby-version index 860487ca1..b0f2dcb32 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.7.1 +3.0.4 diff --git a/Gemfile.lock b/Gemfile.lock index d5512b234..fd39ff559 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -19,9 +19,13 @@ GEM mime-types (3.4.1) mime-types-data (~> 3.2015) mime-types-data (3.2022.0105) + mini_portile2 (2.8.0) minitest (5.16.3) netrc (0.11.0) - nokogiri (1.13.8-x86_64-darwin) + nokogiri (1.13.9) + mini_portile2 (~> 2.8.0) + racc (~> 1.4) + nokogiri (1.13.9-x86_64-darwin) racc (~> 1.4) racc (1.6.0) rake (13.0.6) @@ -35,6 +39,7 @@ GEM unf_ext (0.0.8.2) PLATFORMS + ruby x86_64-darwin-20 DEPENDENCIES @@ -45,4 +50,4 @@ DEPENDENCIES rake (~> 13.0) BUNDLED WITH - 2.2.26 + 2.2.33 diff --git a/datura.gemspec b/datura.gemspec index 316a5e2a0..b673b8a1f 100644 --- a/datura.gemspec +++ b/datura.gemspec @@ -53,7 +53,7 @@ Gem::Specification.new do |spec| ] spec.require_paths = ["lib"] - spec.required_ruby_version = "~> 2.5" + spec.required_ruby_version = "~> 3.0" spec.add_runtime_dependency "colorize", "~> 0.8.1" spec.add_runtime_dependency "nokogiri", "~> 1.10" spec.add_runtime_dependency "rest-client", "~> 2.1" From c6e687de9263de01b9a1b7f45975260935cb8790 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 27 Oct 2022 11:40:51 -0500 Subject: [PATCH 02/13] make keyword arguments compatible with Ruby 3 --- lib/datura/file_types/file_csv.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/datura/file_types/file_csv.rb b/lib/datura/file_types/file_csv.rb index cd8a4e381..92a1cbff6 100644 --- a/lib/datura/file_types/file_csv.rb +++ b/lib/datura/file_types/file_csv.rb @@ -33,7 +33,7 @@ def 
present?(item) # override to change encoding def read_csv(file_location, encoding="utf-8") - CSV.read(file_location, { + CSV.read(file_location, **{ encoding: encoding, headers: true, return_headers: true From dbe6aaa87cde4c99457b0c132ab922bb19c3101a Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 27 Oct 2022 12:55:37 -0500 Subject: [PATCH 03/13] go up to ruby 3.1.2 --- .ruby-version | 2 +- datura.gemspec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ruby-version b/.ruby-version index b0f2dcb32..ef538c281 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -3.0.4 +3.1.2 diff --git a/datura.gemspec b/datura.gemspec index b673b8a1f..3d6f91b56 100644 --- a/datura.gemspec +++ b/datura.gemspec @@ -53,7 +53,7 @@ Gem::Specification.new do |spec| ] spec.require_paths = ["lib"] - spec.required_ruby_version = "~> 3.0" + spec.required_ruby_version = "~> 3.1" spec.add_runtime_dependency "colorize", "~> 0.8.1" spec.add_runtime_dependency "nokogiri", "~> 1.10" spec.add_runtime_dependency "rest-client", "~> 2.1" From 6b1e46841e941a70ef2401dd4312914dfae25e69 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Tue, 8 Nov 2022 15:42:16 -0600 Subject: [PATCH 04/13] add output if nested field is invalid --- lib/datura/elasticsearch/index.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/datura/elasticsearch/index.rb b/lib/datura/elasticsearch/index.rb index cb8da9b84..8124bad79 100644 --- a/lib/datura/elasticsearch/index.rb +++ b/lib/datura/elasticsearch/index.rb @@ -137,6 +137,7 @@ def valid_document?(doc) next else # if one of the nested hashes fails, it is invalid + puts "Nested field '#{field}' is invalid" return false end end From de02891ca9ab9a6fe28102faaa7dc7ab3b668846 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Tue, 8 Nov 2022 15:47:55 -0600 Subject: [PATCH 05/13] don't use array method on person to avoid errors --- lib/datura/to_es/es_request.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lib/datura/to_es/es_request.rb b/lib/datura/to_es/es_request.rb index 693a37e07..6b7f42382 100644 --- a/lib/datura/to_es/es_request.rb +++ b/lib/datura/to_es/es_request.rb @@ -206,7 +206,7 @@ def assemble_relations_2 def assemble_additional_2 @json["spatial"] = spatial @json["places"] = places - @json["person"] = Array(person) + @json["person"] = person @json["event"] = event @json["rdf"] = rdf end From ba60f4c6ef93205b05733f22968ebc152ea8f5eb Mon Sep 17 00:00:00 2001 From: William Dewey Date: Wed, 9 Nov 2022 16:15:03 -0600 Subject: [PATCH 06/13] update changelog for new version --- CHANGELOG.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b90e11bf8..ceaf30818 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,21 +25,46 @@ Versioning](https://semver.org/spec/v2.0.0.html). ### Security --> -## [Unreleased](https://github.com/CDRH/datura/compare/v0.2.0-beta...dev) +## [1.0.0](https://github.com/CDRH/datura/compare/v0.2.0-beta...dev) ### Added - minor test for Datura::Helpers.date_standardize - documentation for web scraping - documentation for CsvToEs (transforming CSV files and posting to elasticsearch) +- documentation for adding new ingest formats to Datura +- byebug gem for debugging - instructions for installing Javascript Runtime files for Saxon +- API schema can either be 1.0 or 2.0 (which includes nested fields); 1.0 will be run by default unless 2.0 is specified. 
Add the following to `public.yml` or `private.yml` in the data repo: +``` +api_version: '2.0' +``` +- schema validation with API version 2.0, invalidly constructed documents will not post +- authentication with Elasticesarch 8.5; add the following to `public.yml` or `private.yml` in the data repo: +``` + es_user: username + es_password: ******** +``` +- field overrides for new fields in the new API schema +- Functionality to transform EAD files and post them to elasticsearch ### Changed +- update ruby to 3.1.2 - date_standardize now relies on strftime instead of manual zero padding for month, day - minor corrections to documentation - XPath: "text" is now ingested as an array and will be displayed delimitted by spaces +- refactored command line methods into elasticsearch library +- refactored and moved date_standardize and date_display helper methods +- Nokogiri methods `get_text` and `get_list` on TEI now return nil rather than empty strings or arrays if there are no matches ### Migration - check to make sure "text" xpath is doing desired behavior +- use Elasticsearch 8.5 or higher and add authentication as described above if security is enabled +- upgrade data repos to Ruby 3.1.2 +- add api version to config as described above +- make sure fields are consistent with the api schema, many have been renamed or changed in format +- add nil checks with get_text and get_list methods +- add EadToES overrides if ingesting EAD files +- if overriding the `read_csv` method in `lib/datura/file_types/file_csv.rb`, the hash must be prefixed with ** (`**{}`).
## [v0.2.0-beta](https://github.com/CDRH/datura/compare/v0.1.6...v0.2.0-beta) - 2020-08-17 - Altering field and xpath behavior, adds get_elements From 0ddcc33686c5c28b144d3e1c56fce646902fbf86 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Wed, 9 Nov 2022 16:39:58 -0600 Subject: [PATCH 07/13] update reference to ruby version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5997622ee..55baa376f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Looking for information about how to post documents? Check out the ## Install / Set Up Data Repo -Check that Ruby is installed, preferably 2.7.x or up. If you are using RVM, see the RVM section below. +Check that Ruby is installed, preferably 3.1.2 or up. If you are using RVM, see the RVM section below. If your project already has a Gemfile, add the `gem "datura"` line. If not, create a new directory and add a file named `Gemfile` (no extension). From a614c6dd2b7409b98dd4e23a224a3f1e91c011fe Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 10 Nov 2022 12:00:34 -0600 Subject: [PATCH 08/13] make changes related to ES and API upgrade --- docs/1_setup/config.md | 5 +++++ docs/1_setup/prepare_index.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/1_setup/config.md b/docs/1_setup/config.md index fe4e7b29f..e58ccd5b5 100644 --- a/docs/1_setup/config.md +++ b/docs/1_setup/config.md @@ -9,7 +9,10 @@ default: collection: es_index es_path + es_user + es_password ``` +(The options es_user and es_password are needed if you are using a secured Elasticsearch index.) If there are any settings which must be different based on the local environment (your computer vs the server), place these in `config/private.yml`. 
@@ -118,6 +121,8 @@ Some stuff commonly in `private.yml`: - `threads: 5` (5 recommended for PC, 50 for powerful servers) - `es_path: http://localhost:9200` - `es_index: some_index` +- `es_user: elastic` (if you want to use security on your local elasticsearch instance) +- `es_password: ******` - `solr_path: http://localhost:8983/solr` - `solr_core: collection_name` diff --git a/docs/1_setup/prepare_index.md b/docs/1_setup/prepare_index.md index 944f9a719..fa79e7013 100644 --- a/docs/1_setup/prepare_index.md +++ b/docs/1_setup/prepare_index.md @@ -13,7 +13,7 @@ You will need to make sure that somewhere, the following are being set in your p ### Step 2: Prepare Elasticsearch Index -Make sure elasticsearch is installed and running in the location you wish to push to. If there is already an index you will be using, take note of its name and skip this step. If you want to add an index, run this command with a specified environment: +Make sure elasticsearch is installed and running in the location you wish to push to. If there is already an index you will be using, take note of its name and skip this step. (Note that each index must be dedicated to data on one version of the API schema) If you want to add an index, run this command with a specified environment: ``` admin_es_create_index -e development From fbc0897df0ae58e14e78ba88f1ac6a2bc390f957 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 10 Nov 2022 13:36:32 -0600 Subject: [PATCH 09/13] add links to more detailed documentation --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ceaf30818..8861fe0f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ Versioning](https://semver.org/spec/v2.0.0.html). 
``` api_version: '2.0' ``` +See new schema (2.0) documentation [here](docs/schema_v2.md) - schema validation with API version 2.0, invalidly constructed documents will not post - authentication with Elasticesarch 8.5; add the following to `public.yml` or `private.yml` in the data repo: ``` @@ -45,7 +46,7 @@ api_version: '2.0' es_password: ******** ``` - field overrides for new fields in the new API schema -- Functionality to transform EAD files and post them to elasticsearch +- functionality to transform EAD files and post them to elasticsearch ### Changed - update ruby to 3.1.2 @@ -58,7 +59,7 @@ api_version: '2.0' ### Migration - check to make sure "text" xpath is doing desired behavior -- use Elasticsearch 8.5 or higher and add authentication as described above if security is enabled +- use Elasticsearch 8.5 or higher and add authentication as described above if security is enabled. See [dev docs instructions](https://github.com/CDRH/cdrh_dev_docs/blob/update_elasticsearch_documentation/publishing/2_basic_requirements.md#downloading-elasticsearch). - upgrade data repos to Ruby 3.1.2 - add api version to config as described above - make sure fields are consistent with the api schema, many have been renamed or changed in format From 3cf22376135695eda8e064c48c319dc39e46cd0c Mon Sep 17 00:00:00 2001 From: William Dewey Date: Thu, 10 Nov 2022 15:02:19 -0600 Subject: [PATCH 10/13] add link to elasticsearch documentation --- docs/4_developers/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/4_developers/installation.md b/docs/4_developers/installation.md index 37eb521c2..0e0171bda 100644 --- a/docs/4_developers/installation.md +++ b/docs/4_developers/installation.md @@ -6,7 +6,7 @@ TODO ### Elasticsearch -TODO +See installation instructions [here](https://github.com/CDRH/cdrh_dev_docs/blob/update_elasticsearch_documentation/publishing/2_basic_requirements.md#downloading-elasticsearch).
### Apache Permissions From 3cadd26ee90874064cbdb73c93cf18fd73e2195e Mon Sep 17 00:00:00 2001 From: William Dewey Date: Fri, 18 Nov 2022 10:03:31 -0600 Subject: [PATCH 11/13] add conditional to creator for nil checking --- lib/datura/to_es/vra_to_es/fields.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/datura/to_es/vra_to_es/fields.rb b/lib/datura/to_es/vra_to_es/fields.rb index c65dda8db..e8ecb1fb2 100644 --- a/lib/datura/to_es/vra_to_es/fields.rb +++ b/lib/datura/to_es/vra_to_es/fields.rb @@ -20,8 +20,10 @@ def category # nested field def creator - creators = get_list(@xpaths["creators"]) - creators.map { |c| { "name" => Datura::Helpers.normalize_space(c) } } + creators = get_list(@xpaths["creator"]) + if creators + creators.map { |c| { "name" => Datura::Helpers.normalize_space(c) } } + end end def collection From 3498ebee701dd26652424f5abcdc24a48d97084b Mon Sep 17 00:00:00 2001 From: Karin Dalziel Date: Thu, 10 Nov 2022 09:29:26 -0600 Subject: [PATCH 12/13] Create schema_v2.md --- docs/schema_v2.md | 152 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 docs/schema_v2.md diff --git a/docs/schema_v2.md b/docs/schema_v2.md new file mode 100644 index 000000000..e5985a32a --- /dev/null +++ b/docs/schema_v2.md @@ -0,0 +1,152 @@ +## CDRH Schema, version 2 + +| NEW FIELD NAME | likely facet field? 
| Metadata Equivalent | ORIGINAL FIELD NAME | DESCRIPTION | FIELD TYPE | | EXAMPLE | +| ----------------------------------------------------------------------------------------- | ------------------- | --------------------------- | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Resourse identification, website display | +| identifier | | | identifier | Unique identifier of the resource. | keyword | | oscys.case.0001.001 | +| collection | y | | collection | User friendly and URL valid name of project. Typically consists of directory under specified web domain. | keyword | | oscys, quillsandfeathers | +| collection\_desc | | | collection\_desc | Full CDRH name of the project. (e.g. “The William F. Cody Archive”) | keyword | | O Say Can You See: Early Washington, D.C., Law & Family | +| uri | | | uri | Full URI of resource. 
(Actual site not API site) | keyword | | [http://earlywashingtondc.org/doc/oscys.case.0001.001](http://earlywashingtondc.org/doc/oscys.case.0001.001) | +| uri\_data | | | uri\_data | Full URL to XML of data, when available | keyword | | [http://earlywashingtondc.org/files/oscys/tei/oscys.case.0001.001.xml](http://earlywashingtondc.org/files/oscys/tei/oscys.case.0001.001.xml) | +| uri\_html | | | uri\_html | Full URL to HTML snippet of data | keyword | | [http://earlywashingtondc.org/files/oscys/html-generated/oscys.case.0001.001.txt](http://earlywashingtondc.org/files/oscys/html-generated/oscys.case.0001.001.txt) | +| data\_type | | | data\_type | Format the data was originally stored in at CDRH. | keyword | | tei
| +| fig\_location | | | fig\_location | URI to location of figure. | keyword | | [http://earlywashingtondc.org/figures/](http://earlywashingtondc.org/figures/) | +| cover\_image | | | image\_id | Unambiguous reference to the image when the image id does not match the file id. | keyword | | oscys.case.0001.001.001.jpg | +| title | | dcterms:title | title | Name given to the resource. | keyword, copied into text | text? | The Once and Future King | +| title\_sort | | | title\_sort | Name given to the resource lowercased with articles removed | keyword | | once and future king | +| alternative | | dcterms:alternative | alternative | Alternative name for the resource. | keyword, copied into text | text? | Petition for Habeas Corpus | +| date\_updated | m | | NEW | | date | | | +| category | y | | category | Category on web page where resource occurs. Category fields are meant to be hierarchical and exclusive, for other types of organization look to subjects, keywords, etc

Each site will have a controlled vocabulary of its own | keyword | | works | +| category2 | y | | subcategory | | keyword | | works | novels | +| category3 | y | | NEW | 3rd level category | keyword | | works | novels | historical fiction | +| category4 | y | | NEW | 4th level category | keyword | | works | novels | historical fiction | civil war | +| category5 | y | | NEW | 5th level category | keyword | | etc | +| notes | | | NEW | | keyword | | | +| Metadata: Digital Item | +| contributor | | dcterms:contributor | contributor | CONTAINER FIELD
"Entity responsible for making contributions
to the resource." | | | | +| contributor.name | | | [contributor.name](http://contributor.name) | Entity responsible for making contributions
to the resource. | keyword | | \[Allison, Dee Ann\]
\[Walter, Katherine\] | +| [contributor.id](http://contributor.id/) | | | [contributor.id](http://contributor.id) | ID of the contributor | keyword | | \[https://orcid.org/0000-0002-4671-061X\]
(leave blank for no id) | +| contributor.role | | | contributor.role | | keyword | | \[researcher\]
\[Principal Investigator\]
\[encoder\] | +| Metadata: Original Item | +| creator | | dcterms:creator | creator | CONTAINER FIELD
An entity primarily responsible for making the resource.
Examples of a Creator include a person, an organization, or a service. | | | Use person field with role instead | +| creator.name | y | | [creator.name](http://creator.name) | Creator field name | keyword | copied into text | Use person field with role instead | +| creator.id | y | | [creator.id](http://creator.id) | Creator field ID (if available) | keyword | | Use person field with role instead | +| citation | | | | | | | | +| citation.role | | | NEW | | keyword | | | +| [citation.id](http://citation.id/) | | bibo:identifier | NEW | an identifier of the original item | keyword | | | +| citation.title | | dcterms:title | NEW | Used to describe the title of a bibliographic resource | keyword | text? | | +| citation.publisher | y | bibo:producer | publisher | Entity responsible for making the resource available. | keyword | | University of Nebraska Press, Lincoln & London, 1992 | +| citation.date | | dcterms:date | NEW | Date the resource was originally created. | date | | 1900-01-01 | +| citation.issue | | bibo:issue | NEW | An issue number | keyword | | | +| citation.page\_start | | bibo:pageStart | NEW | Starting page number within a continuous page range. | keyword (some pages are roman numerals) | | 4 | +| citation.page\_end | | bibo:pageEnd | NEW | Ending page number within a continuous page range. | keyword | | 5 (if applicable) | +| citation.section | | bibo:section | NEW | A section number | keyword | | | +| citation.volume | | bibo:volume | NEW | A volume number | keyword | | | +| citation.place | | juso:name | NEW | This property indicates the name of the spatial thing. | keyword | | | +| citation.title\_a | | tei title level a | NEW | typically an article | keyword | text? | | +| citation.title\_m | | tei title level m | NEW | typically a monograph | keyword | text? | | +| citation.title\_j | y | tei title level j | NEW | typically a journal name | keyword | text?
| | +| date | y | dcterms:date | | the date that will be used to sort and run date queries on item | date | | | +| date\_display | | | date\_display | Date in whatever display format is used on the site | keyword | text? | January, 1900 | +| date\_not\_before | | | date\_not\_before | Inclusive beginning date of resource. | date | | 1900-01-01 | +| date\_not\_after | | | date\_not\_after | Inclusive ending date of resource. | date | | 1900-01-31 | +| format | y | dcterms:format | format | File format, physical medium, or dimensions of the resource. | keyword | copied into text? | Film: 16mm Safety Film | +| medium | y | dcterms:medium | medium | Material or physical carrier of the resource. | keyword | copied into text? | Film | +| extent | | dcterms:extent | extent | Size or duration of the resource. | keyword | | 4:03 | +| language | y | dcterms:language | language | Primary / original language of the resource | keyword | | en | +| rights\_holder | y | dcterms:rightsHolder | rights\_holder | A person or organization owning or managing rights over the resource. | keyword | copied into text? | Huntington Library | +| rights | | dcterms:rights | rights | Information about the rights held in and over the resource. | keyword | copied into text? | All Rights Reserved. Contact Rights Holder for Permissions Information.
or
Covered by a CC-By License https://creativecommons.org/licenses/by/2.0/ | +| rights\_uri | | | rights\_uri | URI to rights holder information. | keyword | | [http://www.huntington.org/](http://www.huntington.org/) | +| container\_box | | ead container type = box | NEW | box an item is kept in, as in an archive | keyword | | | +| container\_folder | | ead container type = folder | NEW | folder an item is kept in, as in an archive | keyword | | | +| Metadata: Interpretive | +| subjects | y | dcterms:subject | subjects | Topic of the content of the resource. | keyword | copied into text? | \[Horror in art\]
\[Poisonous spiders--Venom\] | +| abstract | | dcterms:abstract | abstract | Abstract of the resource. | keyword? (for display or searching?)
| text? | The poem is not one of DGR's great sonnets, and it pales before the majestic painting it was written to accompany. Nevertheless, it is quite an interesting and important text. | +| description | | dcterms:description | description | Short description of the resource. | text | text? | A Poem by Dante Gabriel Rossetti | +| type | y | dcterms:type | type | Nature or genre of the resource. | keyword | copied into text? | Video | +| topics | y | | topics | Topics of content of resource. | keyword | copied into text? | | +| keywords | y | | keywords | Keywords used for resource. | keyword | copied into text? | | +| keywords2 | y | | NEW | Another set of keywords, used in sites to create another way to browse | keyword | copied into text? | decade | +| keywords3 | y | | NEW | Another set of keywords, used in sites to create another way to browse | keyword | copied into text? | | +| keywords4 | y | | NEW | Another set of keywords, used in sites to create another way to browse | keyword | copied into text? | | +| Relation to other items | +| relation | | dcterms:relation | relation | A related resource that is substantially the same as the described resource, but in another format. 
| keyword | | oscys.case.0001.001-B | +| source | | dcterms:source | source | A related resource from which the described resource is derived | keyword | | oscys.case.0001.001-A | +| has\_part | | dcterms:hasPart | NEW | parts of the resource, for example items pasted into a scrapbook | | | | +| has\_part.role | | | | | | | | +| has\_part.id | | | | | keyword | | cdrh.0001 | +| has\_part.title | | | | | keyword | | Resource title | +| has\_part.order | | | | | whole number | | 1 | +| is\_part\_of | | dcterms:isPartOf | NEW | the containing resource, for example the scrapbook the individual items are in | | | | +| is\_part\_of.role | | | | | | | | +| is\_part\_of.id | | | | | keyword | | cdrh.0001 | +| is\_part\_of.title | | | | | keyword | | Resource title | +| is\_part\_of.order | | | | | whole number | | 1 | +| previous\_item | | | NEW | previous item in a series. role can be used to create multiple nexts - for instance, previous letter in a mailing sequence, previous letter by date | | | | +| previous\_item.role | | | | | | | | +| [previous\_item.id](http://previous_item.id/) | | | | | keyword | | cdrh.0001 | +| previous\_item.title | | | | | keyword | | Resource title | +| previous\_item.order | | | | | whole number | | 1 | +| next\_item | | | NEW | next item in a series. role can be used to create multiple nexts - for instance, next letter in a mailing sequence, next letter by date | | | | +| next\_item.role | | | | | | | | +| [next\_item.id](http://next_item.id/) | | | | | keyword | | cdrh.0001 | +| next\_item.title | | | | | keyword | | Resource title | +| next\_item.order | | | | | whole number | | 1 | +| Additional Data types | +| spatial | | | spatial | CONTAINER FIELD | | | | +| spatial.role | | | | | keyword | | | +| spatial.name | y | juso:name | spatial.title | Title / display name of location | keyword | copied into text? | Display name for this location, typically built from other fields, but potentially not.
| +| spatial.description | | | spatial.description | Description | text | text? | | +| spatial.type | y | | spatial.type | | keyword | copied into text? | "origin" or "destination" used to distinguish multiple spatial records for one item (for example, for an item of correspondence) | +| spatial.short\_name | y | juso:short\_name | spatial.place\_name | Specific name of location in question, such as the army camp name, business, event title, etc | keyword, copied into text | | Camp Hollowell, Kimball Recital Hall, The Coffeehouse, Lancaster County Fairgrounds | +| spatial.coordinates | y | juso:geometry | spatial.coordinates | | geopoint | | \[-96.6669600, 40.8000000\] | +| spatial.id | | | [spatial.id](http://coverage.spatial.id/) | | keyword | | ????
| +| spatial.city | y | juso:city | spatial.city | | keyword | copied into text? | | +| spatial.township | | juso:Township | NEW | | | copied into text? | | +| spatial.county | | juso:county | spatial.county | | keyword | copied into text? | | +| spatial.country | y | juso:country | spatial.country | | keyword | copied into text? | | +| spatial.region | y | juso:within | NEW? | | keyword | copied into text? | | +| spatial.state | | juso:state | spatial.state | | keyword | copied into text? | | +| spatial.street | | juso:street | spatial.street | | keyword | | | +| spatial.postal\_code | | juso:postal\_code | spatial.postal\_code | | keyword | | | +| spatial.note | | | | | | | | +| deprecate and replace with place with role of "placename" and only place\_name filled out | | | places | Place names mentioned in the resource. | keyword | | | +| person | | foaf:Person | person | any people other than contributors associated with resource | | | | +| person.name | y | foaf:name | [person.name](http://person.name) | Name as we wish it to appear | keyword | copied into text? | \[Cody, William F.\] | +| [person.id](http://person.id/) | y | | [person.id](http://person.id) | Optional, if exists, may be from VIAF or similar. | keyword | | \[http://viaf.org/viaf/100252467\] | +| person.role | y | | person.role | Role of person. Common examples are recipient and sender, less common examples are attorney and defendant | keyword | copied into text? | \[sender\]
\[recipient\]
\[creator\]
\[editor\] | +| person.note | | | NEW | | keyword | | | +| person.order | | | NEW | | keyword | | | +| person.birth\_date | | foaf:birthday | NEW | | date | | \[1899-03-04\] | +| person.death\_date | | | NEW | | date | | | +| person.age\_category | y | | NEW | used when resources are categorizing the age of the participant at the time of the event. For instance, a minor in a court case | keyword | | \[minor\]
\[adult\] | +| person.name\_last | | foaf:lastName | NEW | | keyword | | | +| person.name\_given | | foaf:givenName | NEW | | keyword | | | +| person.name\_alternate | | | NEW | | keyword | | | +| person.name\_previous | | | | | | | | +| person.race | y | | NEW | | keyword | | | +| person.sex | y | | NEW | | keyword | | | +| person.gender | y | foaf:gender | NEW | | keyword | | | +| person.nationality | y | | NEW | | keyword | | | +| person.trait1 | y | | NEW | | keyword | | | +| person.trait2 | y | | NEW | | keyword | | | +| event | | | | | | | | +| event.type | y | | NEW | | keyword | | | +| event.agent | y | event:agent | NEW | Relates an event to an active agent (a person, a computer, ... :-) ) | keyword | | | +| event.factor | | event:factor | NEW | Relates an event to a passive factor (a tool, an instrument, an abstract cause...) | keyword | | points of law cited in a case | +| event.product | y | event:product | NEW | | keyword | | case outcome | +| event.date\_begin | | event\_date\_begin | NEW | | date | | | +| event.date\_end | | event\_date\_end | NEW | | date | | | +| event.trait1 | | | NEW | | keyword | | can be used for case keywords, i.e. 
civil, criminal | +| event.trait2 | | | NEW | | keyword | | | +| event.notes | | | NEW | | keyword | | | +| RDF | | | | The RDF field can be used to record any other data that needs to be associated with the record, for instance relationships | | | | +| rdf.type | | | NEW | | keyword | | \[relationship\] | +| rdf.subject | y | rdf:subject | NEW | | keyword | | \[Smith, John\] | +| rdf.predicate | y | rdf:predicate | NEW | | keyword | | \[is married to\] | +| rdf.object | y | rdf:object | NEW | | keyword | | \[Smith, Mary\] | +| rdf.source | | | NEW | | keyword | | item.0001 | +| rdf.note | | | NEW | | keyword | | | +| Text search | +| annotations\_text | | | annotations\_text | Place for annotations text, so we can search annotations separately from the main text | text | | | +| text | | | text | Combined text of all the above fields for key word searching. | text
| | | From 5b0f54f1d81af819513740441ec8e7d1a9ee2986 Mon Sep 17 00:00:00 2001 From: William Dewey Date: Wed, 25 Jan 2023 12:44:34 -0600 Subject: [PATCH 13/13] make sure webs_to_es fields can handle nil values --- lib/datura/to_es/webs_to_es/fields.rb | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/datura/to_es/webs_to_es/fields.rb b/lib/datura/to_es/webs_to_es/fields.rb index 3163706be..1ef633126 100644 --- a/lib/datura/to_es/webs_to_es/fields.rb +++ b/lib/datura/to_es/webs_to_es/fields.rb @@ -39,7 +39,11 @@ def data_type end def date(before=true) - datestr = get_list(@xpaths["date"]).first + if get_list(@xpaths["date"]) + datestr = get_list(@xpaths["date"]).first + else + datestr = nil + end if datestr Datura::Helpers.date_standardize(datestr, true) end @@ -80,7 +84,9 @@ def format end def image_id - get_list(@xpaths["image_id"]).first + if get_list(@xpaths["image_id"]) + get_list(@xpaths["image_id"]).first + end end def keywords @@ -218,7 +224,9 @@ def works # new/moved fields for API 2.0 def cover_image - get_list(@xpaths["image_id"]).first + if get_list(@xpaths["image_id"]) + get_list(@xpaths["image_id"]).first + end end def date_updated