feat: Process complete GeoNames

* Use SPARQL Anything for SPARQL-based mappings. * Process admin codes first, preloading the output in the main mapping process. * Download all countries and simplify download script.
netwerk-digitaal-erfgoed · Nov 18, 2024 · 3f1d93a · 3f1d93a
1 parent a2a6a3e
commit 3f1d93a
Show file tree

Hide file tree

Showing 13 changed files with 160 additions and 183 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 FROM eclipse-temurin:17
-LABEL org.opencontainers.image.source = "https://github.com/netwerk-digitaal-erfgoed/geonames-harvester"
+LABEL org.opencontainers.image.source="https://github.com/netwerk-digitaal-erfgoed/geonames-harvester"
 WORKDIR /app
 RUN mkdir bin
-RUN curl -L https://github.com/carml/carml-jar/raw/nde/carml-jar-jena-1.4.0-SNAPSHOT-0.4.10.jar -o bin/carml.jar
+RUN curl -L https://github.com/SPARQL-Anything/sparql.anything/releases/download/v1.0-DEV.6/sparql-anything-v1.0-DEV.6.jar -o bin/sparql-anything-v1.0-DEV.6.jar
 RUN apt-get update && apt-get install zip -y && rm -rf /var/lib/apt/lists/*
 COPY . .
diff --git a/README.md b/README.md
@@ -6,7 +6,9 @@ Simple download and transform scripts to download and convert geonames dumps int
 
 ### RML
 
-The text to RDF transformation is done using the RML mapper. This requires a java runtime environment (openjdk or other). The RMLmapper can be downloaded from the the [RML repo](https://github.com/RMLio/rmlmapper-java). The scripts assume that the `rmlmapper.jar` is available in the `./bin` directory.
+The text to RDF transformation is done using [SPARQL Anything](https://github.com/SPARQL-Anything/sparql.anything).
+This requires a Java runtime environment.
+You can download the SPARQL Anything JAR [here](https://github.com/SPARQL-Anything/sparql.anything/releases).
 
 ### Fuseki
 
@@ -27,10 +29,10 @@ See the [Geonames download website](https://download.geonames.org/export/dump/)
 Run the scripts in the following order:
 
 1. **Download**
-Run the `geonames-download.sh` to download the data. Currently only the NL and BE country data is downloaded. After downloading some basic cleaning is done to prevent problems in the mapping proces. The download files are place in the `./data` directory.
+Run `download.sh` to download the data. By default the data for all countries (`allCountries`) is downloaded; edit `country_files` in the script to restrict the download to specific countries. After downloading, some basic processing is done to prevent problems in the mapping process. The downloaded files are placed in the `./data` directory.
 
 2. **Mapping**
-   Run the `map.sh` to convert the textfiles to RDF. The resulting ntriples files is placed in `./fuseki/databases/`. The mapping can take some time to finish, be patient!
+   Run the `map.sh` to convert the text files to RDF. This produces a `data/geonames.ttl` file. 
 
 3. **Expose the data**
    Run the `server.sh` to start the server and expose the SPARQL-endpoint on <http://localhost:3030/geonames/sparql>.
diff --git a/config/admin-codes.rq b/config/admin-codes.rq
@@ -0,0 +1,40 @@
+# Maps the GeoNames admin code tables to lookup triples that link the GeoNames
+# URI of each administrative division to its code:
+#
+#   <https://sws.geonames.org/{geonameId}/> xyz:admin1Code "{admin1code}" .
+#   <https://sws.geonames.org/{geonameId}/> xyz:admin2Code "{admin2code}" .
+#
+# Input: data/admin1-codes.csv and data/admin2-codes.csv, tab-separated files
+# whose first row is the header row that download.sh prepends. The column names
+# used below (admin1code, admin2code, geonameId) are expected to match those
+# header rows.
+# The file locations are relative, so run the query from the repository root.
+PREFIX apf: <http://jena.apache.org/ARQ/property#>
+PREFIX fx: <http://sparql.xyz/facade-x/ns/>
+PREFIX gn: <https://www.geonames.org/ontology#>
+PREFIX xyz: <http://sparql.xyz/facade-x/data/>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+CONSTRUCT {
+    ?adm1Uri xyz:admin1Code ?adm1 .
+    ?adm2Uri xyz:admin2Code ?adm2 .
+}
+WHERE {
+    # Each UNION branch binds only its own variables, so every solution
+    # instantiates exactly one of the two template triples above.
+    {
+        SERVICE <x-sparql-anything:> {
+            fx:properties fx:location "data/admin1-codes.csv" ;
+                fx:csv.delimiter "\t" ;
+                fx:csv.headers true ;
+            .
+            # One blank node per CSV row.
+            [
+                xyz:admin1code ?adm1 ;
+                xyz:geonameId ?adm1Id ;
+            ]
+            BIND(URI(CONCAT("https://sws.geonames.org/", ?adm1Id, "/")) as ?adm1Uri)
+        }
+    }
+    UNION {
+        SERVICE <x-sparql-anything:> {
+            fx:properties fx:location "data/admin2-codes.csv" ;
+                fx:csv.delimiter "\t" ;
+                fx:csv.headers true ;
+            .
+            # One blank node per CSV row.
+            [
+                xyz:admin2code ?adm2 ;
+                xyz:geonameId ?adm2Id ;
+            ]
+            BIND(URI(CONCAT("https://sws.geonames.org/", ?adm2Id, "/")) as ?adm2Uri)
+        }
+    }
+}
diff --git a/config/geonames.ttl b/config/geonames.ttl
diff --git a/config/headers-admin1-codes.txt → config/headers-admin1-codes.csv b/config/headers-admin1-codes.txt → config/headers-admin1-codes.csv
diff --git a/config/headers-admin2-codes.txt → config/headers-admin2-codes.csv b/config/headers-admin2-codes.txt → config/headers-admin2-codes.csv
diff --git a/config/headers-feature-codes.txt b/config/headers-feature-codes.txt
diff --git a/config/headers-gn.txt → config/headers-gn.csv b/config/headers-gn.txt → config/headers-gn.csv
diff --git a/config/places.rq b/config/places.rq
@@ -0,0 +1,67 @@
+# Maps the GeoNames place records to gn:Feature resources.
+#
+# Input: data/geonamesplus.csv, the tab-separated file written by download.sh
+# (the GeoNames dump with a header row plus the extra 'adm1' and 'adm2' join-key
+# columns). The location is relative, so run the query from the repository root.
+# The admin code triples produced by admin-codes.rq must be side-loaded
+# (map.sh passes them with -l); they resolve gn:parentADM1 / gn:parentADM2.
+PREFIX apf: <http://jena.apache.org/ARQ/property#>
+PREFIX fx: <http://sparql.xyz/facade-x/ns/>
+PREFIX gn: <https://www.geonames.org/ontology#>
+PREFIX xyz: <http://sparql.xyz/facade-x/data/>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+# Measurements (informal, taken during development)
+#
+# 10K rows            195 s; 3 s without the adm2 join
+# 1M rows             72 s, ~8 GB RAM without fx:ondisk (possibly just the default max heap)
+# 13M rows            out of memory without fx:ondisk
+# 1M rows, ondisk     3m36.655s
+# 13M rows, ondisk    ~400 MB RAM
+#
+# It was important to nest SERVICE clauses.
+CONSTRUCT {
+    ?uri
+        a gn:Feature ;
+        gn:name ?name ;
+        gn:alternateName ?alternateName ;
+        gn:countryCode ?countryCode ;
+        gn:featureClass ?featureClass ;
+        gn:featureCode ?featureCode ;
+        gn:parentADM1 ?parentAdm1 ;
+        gn:parentADM2 ?parentAdm2 ;
+        wgs84_pos:lat ?latitudeFloat ;
+        wgs84_pos:long ?longitudeFloat ;
+}
+WHERE {
+    SERVICE <x-sparql-anything:> {
+        fx:properties
+            # Full data set as produced by download.sh. For quick experiments,
+            # point this at a smaller sample with the same header row.
+            fx:location "data/geonamesplus.csv" ;
+            # Keep the triplified input on disk instead of in memory; see the measurements above.
+            fx:ondisk "/tmp/sparql-anything-geonames3" ;
+            fx:null-string "" ; # Skip empty alternate names.
+            fx:csv.delimiter "\t" ;
+            fx:csv.headers true ;
+        .
+        ?s xyz:geonameid ?id ;
+            xyz:name ?name ;
+            xyz:country%20code ?countryCode ;
+            xyz:feature%20class ?featureClassString ;
+            xyz:feature%20code ?featureCodeString ;
+            xyz:adm1 ?adm1 ;
+            xyz:adm2 ?adm2 ; # Not all places have a parentAdm2, but we use a special ‘NONE’ value to prevent OPTIONAL joins.
+            xyz:latitude ?latitude ;
+            xyz:longitude ?longitude ;
+        .
+
+        # alternatenames is a single comma-separated string; emit one value per name.
+        OPTIONAL {
+            ?s xyz:alternatenames ?alternateNameString .
+            ?alternateName apf:strSplit (?alternateNameString ",") .
+        }
+
+        BIND(URI(CONCAT("https://sws.geonames.org/", ?id, "/")) as ?uri)
+        BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString)) as ?featureClass)
+        BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString, ".", ?featureCodeString)) as ?featureCode)
+        BIND(xsd:float(?latitude) AS ?latitudeFloat)
+        BIND(xsd:float(?longitude) AS ?longitudeFloat)
+    }
+
+    # Resolve the join keys against the side-loaded admin code triples.
+    # A key without a match (e.g. adm2 = 'NONE') leaves the variable unbound,
+    # so that triple is simply omitted from the output.
+    OPTIONAL { ?parentAdm1 xyz:admin1Code ?adm1 }
+    OPTIONAL { ?parentAdm2 xyz:admin2Code ?adm2 }
+}
diff --git a/download.sh b/download.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+CONFIG_DIR="$PWD/config"
+DATA_DIR="./data"
+mkdir -p $DATA_DIR
+
+# specify countries to download
+#country_files="NL BE DE "
+country_files="allCountries"
+cp $CONFIG_DIR/headers-gn.csv $DATA_DIR/geonames.csv
+for cfile in $country_files; do
+    mkdir temp
+    cd temp
+    curl -O "https://download.geonames.org/export/dump/$cfile.zip"
+    unzip "$cfile.zip"
+    cd ..
+    cat "temp/$cfile.txt" >> $DATA_DIR/geonames.csv
+    rm -rf temp
+done
+
+# create foreign keys 'adm1' and 'adm2' for the admin1code and admin2code tables
+# $9=country code, $11=admin1 code, $12=admin2 code
+# Explicit NONE so we don't need OPTIONAL joins, which speeds up the mapping process.
+awk 'BEGIN{FS=OFS="\t"} {print $0, (NR > 1 ? $9"."$11 : "adm1"), (NR > 1 ? ($12 != "" ? $9"."$11"."$12 : "NONE") : "adm2")}' $DATA_DIR/geonames.csv > $DATA_DIR/geonamesplus.csv
+rm $DATA_DIR/geonames.csv
+
+# download latest version of generic files
+cp $CONFIG_DIR/headers-admin1-codes.csv $DATA_DIR/admin1-codes.csv
+curl "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> $DATA_DIR/admin1-codes.csv
+
+cp $CONFIG_DIR/headers-admin2-codes.csv $DATA_DIR/admin2-codes.csv
+curl "https://download.geonames.org/export/dump/admin2Codes.txt" >> $DATA_DIR/admin2-codes.csv
diff --git a/geonames-download.sh b/geonames-download.sh
diff --git a/map.sh b/map.sh
@@ -2,7 +2,16 @@
 DATA_DIR="$PWD/data"
 BIN_DIR="$PWD/bin"
 CONFIG_DIR="$PWD/config"
-OUTPUT_DIR="$DATA_DIR"
-cd $DATA_DIR
-echo "Running the RMLmapper to convert geonames data to RDF, be patient..."
-java -jar $BIN_DIR/carml.jar map -m $CONFIG_DIR/geonames.ttl -r . -o $OUTPUT_DIR/geonames.nt
+SPARQL_ANYTHING_VERSION="v1.0-DEV.6"
+JAR="sparql-anything-${SPARQL_ANYTHING_VERSION}.jar"
+
+# Download SPARQL Anything CLI.
+echo "https://github.com/SPARQL-Anything/sparql.anything/releases/download/$SPARQL_ANYTHING_VERSION/$JAR"
+
+curl --skip-existing -L "https://github.com/SPARQL-Anything/sparql.anything/releases/download/$SPARQL_ANYTHING_VERSION/$JAR" -o $BIN_DIR/$JAR
+
+# Map admin codes.
+java -jar $BIN_DIR/$JAR -q $CONFIG_DIR/admin-codes.rq > $DATA_DIR/admin-codes.ttl
+
+# Map places, side-loading admin codes.
+java -jar $BIN_DIR/$JAR -q $CONFIG_DIR/places.rq -l $DATA_DIR/admin-codes.ttl > $DATA_DIR/geonames.ttl
diff --git a/server.sh b/server.sh
@@ -1,4 +1,4 @@
 #! /bin/bash
 cd fuseki
-docker-compose run --rm --name gnserver --service-ports fuseki --file=/fuseki/databases/geonames.nt /geonames
-cd ..
+docker-compose run --rm --name gnserver --service-ports fuseki --file=/fuseki/databases/geonames.ttl /geonames
+cd ..