From a2a6a3eda1670817d8fa636975e0b675e8eb41fc Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 23 Sep 2024 19:31:56 +0200 Subject: [PATCH] feat: Process complete GeoNames * Download all countries and simplify download script. * Switch from RMLMapper to CARML, which is streaming. We need a custom CARML build that supports the `grel:string_split` function. --- Dockerfile | 2 +- config/{geonames.rml => geonames.ttl} | 76 +++++++-------------------- geonames-download.sh | 22 +++----- map.sh | 6 +-- 4 files changed, 27 insertions(+), 79 deletions(-) rename config/{geonames.rml => geonames.ttl} (72%) diff --git a/Dockerfile b/Dockerfile index c0d6f1b..8645098 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ FROM eclipse-temurin:17 LABEL org.opencontainers.image.source = "https://github.com/netwerk-digitaal-erfgoed/geonames-harvester" WORKDIR /app RUN mkdir bin -RUN curl -L https://github.com/RMLio/rmlmapper-java/releases/download/v5.0.0/rmlmapper-5.0.0-r362-all.jar -o bin/rmlmapper.jar +RUN curl -L https://github.com/carml/carml-jar/raw/nde/carml-jar-jena-1.4.0-SNAPSHOT-0.4.10.jar -o bin/carml.jar RUN apt-get update && apt-get install zip -y && rm -rf /var/lib/apt/lists/* COPY . . diff --git a/config/geonames.rml b/config/geonames.ttl similarity index 72% rename from config/geonames.rml rename to config/geonames.ttl index c8b76c7..2f8d5ed 100644 --- a/config/geonames.rml +++ b/config/geonames.ttl @@ -1,7 +1,6 @@ @prefix rml: . @prefix rr: . @prefix ql: . -@prefix csvw: . @prefix rdf: . @prefix rdfs: . @prefix : . @@ -12,26 +11,16 @@ @prefix grel: . @prefix idlab-fn: . -:GeonamesSource +:GeonamesSource a rml:LogicalSource ; - rml:source [ - rdf:type csvw:Table; - csvw:url "geonamesplus.txt"; - csvw:dialect [ - rdf:type csvw:Dialect; - csvw:delimiter "\t"; - csvw:encoding "UTF-8"; - csvw:trim true; - csvw:null ""; # this doesn't seem to have any effect, sadly! - ] - ]; + rml:source "geonamesplus.txt"; rml:referenceFormulation ql:CSV . -:GeonamesMap +:GeonamesMap a rr:TriplesMap; rml:logicalSource :GeonamesSource ; rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameid}/" ]; - rr:predicateObjectMap + rr:predicateObjectMap [ rr:predicate gn:name; rr:objectMap [ rml:reference "name" ] @@ -46,13 +35,13 @@ ], [ rr:predicate gn:featureClass; - rr:objectMap [ + rr:objectMap [ rr:template "https://www.geonames.org/ontology#{feature class}" ] ], [ rr:predicate gn:featureCode; - rr:objectMap [ + rr:objectMap [ rr:template "https://www.geonames.org/ontology#{feature class}.{feature code}" ] ], @@ -92,10 +81,11 @@ :AlternateNamesSplit + rr:termType rr:Literal; fnml:functionValue [ rml:logicalSource :LogicalSource; rr:predicateObjectMap [ - rr:predicate fno:executes; + rr:predicate fno:executes; rr:objectMap [ rr:constant grel:string_split ]; ]; rr:predicateObjectMap [ @@ -108,64 +98,36 @@ ]; ]. -:Admin1codeSource +:Admin1codeSource a rml:LogicalSource ; - rml:source [ - rdf:type csvw:Table; - csvw:url "admin1-codes.txt"; - csvw:dialect [ - rdf:type csvw:Dialect; - csvw:delimiter "\t"; - csvw:encoding "UTF-8"; - csvw:trim true; - ]; - ]; + rml:source "admin1-codes.txt"; rml:referenceFormulation ql:CSV . - -:Adm1Map +:Adm1Map a rr:TriplesMap; rml:logicalSource :Admin1codeSource ; rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameId}/" ]. -:Admin2codeSource +:Admin2codeSource a rml:LogicalSource ; - rml:source [ - rdf:type csvw:Table; - csvw:url "admin2-codes.txt"; - csvw:dialect [ - rdf:type csvw:Dialect; - csvw:delimiter "\t"; - csvw:encoding "UTF-8"; - csvw:trim true; - ]; - ]; + rml:source "admin2-codes.txt"; rml:referenceFormulation ql:CSV . -:Adm2Map +:Adm2Map a rr:TriplesMap; rml:logicalSource :Admin2codeSource ; rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameId}/" ]. - :FeatureCodeSource +:FeatureCodeSource a rml:LogicalSource ; - rml:source [ - rdf:type csvw:Table; - csvw:url "feature-codes.txt"; - csvw:dialect [ - rdf:type csvw:Dialect; - csvw:delimiter "\t"; - csvw:encoding "UTF-8"; - csvw:trim true; - ] - ]; + rml:source "feature-codes.txt"; rml:referenceFormulation ql:CSV . -:FeatureCodeTriplesMap +:FeatureCodeTriplesMap a rr:TriplesMap; rml:logicalSource :FeatureCodeSource ; rr:subjectMap [ rr:template "https://www.geonames.org/ontology#{featureCode}" ]; - rr:predicateObjectMap + rr:predicateObjectMap [ rr:predicate gn:name; rr:objectMap [ rml:reference "name" ] @@ -173,4 +135,4 @@ [ rr:predicate rdfs:comment; rr:objectMap [ rml:reference "description" ] - ]. \ No newline at end of file + ]. diff --git a/geonames-download.sh b/geonames-download.sh index b93f6e9..06c76f5 100755 --- a/geonames-download.sh +++ b/geonames-download.sh @@ -1,12 +1,11 @@ #!/bin/bash CONFIG_DIR="$PWD/config" DATA_DIR="./data" -if [ ! -d "$DATA_DIR" ]; then - mkdir $DATA_DIR -fi +mkdir -p $DATA_DIR # specify countries to download -country_files="NL BE DE" +#country_files="NL BE DE " +country_files="allCountries" cp $CONFIG_DIR/headers-gn.txt $DATA_DIR/geonames.txt for cfile in $country_files; do mkdir temp @@ -25,19 +24,10 @@ rm $DATA_DIR/geonames.txt # download latest version of generic files cp $CONFIG_DIR/headers-feature-codes.txt $DATA_DIR/feature-codes.txt -curl -O "https://download.geonames.org/export/dump/featureCodes_en.txt" -cat featureCodes_en.txt >> $DATA_DIR/feature-codes.txt -rm featureCodes_en.txt +curl "https://download.geonames.org/export/dump/featureCodes_en.txt" >> $DATA_DIR/feature-codes.txt cp $CONFIG_DIR/headers-admin1-codes.txt $DATA_DIR/admin1-codes.txt -curl -O "https://download.geonames.org/export/dump/admin1CodesASCII.txt" -cat admin1CodesASCII.txt >> $DATA_DIR/admin1-codes.txt -rm admin1CodesASCII.txt +curl "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> $DATA_DIR/admin1-codes.txt cp $CONFIG_DIR/headers-admin2-codes.txt $DATA_DIR/admin2-codes.txt -curl -O "https://download.geonames.org/export/dump/admin2Codes.txt" -# remove double quotes that seem to appear within some strings -# to prevent RML from crashing -sed -i 's/"//g' admin2Codes.txt -cat admin2Codes.txt >> $DATA_DIR/admin2-codes.txt -rm admin2Codes.txt +curl "https://download.geonames.org/export/dump/admin2Codes.txt" >> $DATA_DIR/admin2-codes.txt diff --git a/map.sh b/map.sh index 216bc8d..400f300 100755 --- a/map.sh +++ b/map.sh @@ -5,8 +5,4 @@ CONFIG_DIR="$PWD/config" OUTPUT_DIR="$DATA_DIR" cd $DATA_DIR echo "Running the RMLmapper to convert geonames data to RDF, be patient..." -java -jar $BIN_DIR/rmlmapper.jar -m $CONFIG_DIR/geonames.rml -o $OUTPUT_DIR/geonames.nt -cd .. -# This RML mapping creates triples with empty object values. -# The following sed command remove every line with a "". pattern. -sed -i '/\"\"./d' $OUTPUT_DIR/geonames.nt +java -jar $BIN_DIR/carml.jar map -m $CONFIG_DIR/geonames.ttl -r . -o $OUTPUT_DIR/geonames.nt