feat: Process complete GeoNames

* Use SPARQL Anything for SPARQL-based mappings. * Process admin codes first, preloading the output in the main mapping process. * Download all countries and simplify download script.
netwerk-digitaal-erfgoed · Nov 18, 2024 · f9bb4e0 · f9bb4e0
1 parent b92643f
commit f9bb4e0
Show file tree

Hide file tree

Showing 13 changed files with 155 additions and 235 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 FROM eclipse-temurin:17
-LABEL org.opencontainers.image.source = "https://github.com/netwerk-digitaal-erfgoed/geonames-harvester"
+LABEL org.opencontainers.image.source="https://github.com/netwerk-digitaal-erfgoed/geonames-harvester"
 WORKDIR /app
 RUN mkdir bin
-RUN curl -L https://github.com/RMLio/rmlmapper-java/releases/download/v5.0.0/rmlmapper-5.0.0-r362-all.jar -o bin/rmlmapper.jar
+RUN curl -L https://github.com/SPARQL-Anything/sparql.anything/releases/download/v1.0-DEV.6/sparql-anything-v1.0-DEV.6.jar -o bin/sparql-anything-v1.0-DEV.6.jar
 RUN apt-get update && apt-get install zip -y && rm -rf /var/lib/apt/lists/*
 COPY . .
diff --git a/README.md b/README.md
@@ -6,7 +6,9 @@ Simple download and transform scripts to download and convert geonames dumps int
 
 ### RML
 
-The text to RDF transformation is done using the RML mapper. This requires a java runtime environment (openjdk or other). The RMLmapper can be downloaded from the the [RML repo](https://github.com/RMLio/rmlmapper-java). The scripts assume that the `rmlmapper.jar` is available in the `./bin` directory.
+The text to RDF transformation is done using [SPARQL Anything](https://github.com/SPARQL-Anything/sparql.anything).
+This requires a Java runtime environment.
+You can download the SPARQL Anything JAR [here](https://github.com/SPARQL-Anything/sparql.anything/releases).
 
 ### Fuseki
 
@@ -27,10 +29,10 @@ See the [Geonames download website](https://download.geonames.org/export/dump/)
 Run the scripts in the following order:
 
 1. **Download**
-Run the `geonames-download.sh` to download the data. Currently only the NL and BE country data is downloaded. After downloading some basic cleaning is done to prevent problems in the mapping proces. The download files are place in the `./data` directory.
+Run the `download.sh` to download the data. By default the data for all countries (`allCountries`) is downloaded. After downloading, some basic preprocessing is done to prevent problems in the mapping process. The downloaded files are placed in the `./data` directory.
 
 2. **Mapping**
-   Run the `map.sh` to convert the textfiles to RDF. The resulting ntriples files is placed in `./fuseki/databases/`. The mapping can take some time to finish, be patient!
+   Run the `map.sh` to convert the text files to RDF. This produces a `data/geonames.ttl` file. 
 
 3. **Expose the data**
    Run the `server.sh` to start the server and expose the SPARQL-endpoint on <http://localhost:3030/geonames/sparql>.
diff --git a/config/admin-codes.rq b/config/admin-codes.rq
@@ -0,0 +1,51 @@
+# Builds the admin-code lookup graph. For every row of the tab-separated files
+# data/admin1-codes.csv and data/admin2-codes.csv, one triple is constructed
+# that links the feature URI https://sws.geonames.org/<geonameId>/ to its code.
+#
+# The output predicates xyz:admin1Code / xyz:admin2Code (capital C) are the ones
+# config/places.rq matches outside its SERVICE clause. The lowercase
+# xyz:admin1code / xyz:admin2code in the WHERE clause are input column names.
+# NOTE(review): presumably defined by the header rows copied from
+# config/headers-admin1-codes.csv and config/headers-admin2-codes.csv — confirm.
+PREFIX apf: <http://jena.apache.org/ARQ/property#>
+PREFIX fx: <http://sparql.xyz/facade-x/ns/>
+PREFIX gn: <https://www.geonames.org/ontology#>
+PREFIX xyz: <http://sparql.xyz/facade-x/data/>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+CONSTRUCT {
+    ?adm1Uri xyz:admin1Code ?adm1 .
+    ?adm2Uri xyz:admin2Code ?adm2 .
+}
+WHERE {
+    {
+        # First-level codes: tab-delimited file whose first row holds the column names.
+        SERVICE <x-sparql-anything:> {
+            fx:properties fx:location "data/admin1-codes.csv" ;
+                fx:csv.delimiter "\t" ;
+                fx:csv.headers true ;
+            .
+            [
+                xyz:admin1code ?adm1 ;
+                xyz:geonameId ?adm1Id ;
+            ]
+            BIND(URI(CONCAT("https://sws.geonames.org/", ?adm1Id, "/")) as ?adm1Uri)
+        }
+    }
+    UNION {
+        # Second-level codes: same layout, read from the admin2 file.
+        SERVICE <x-sparql-anything:> {
+            fx:properties fx:location "data/admin2-codes.csv" ;
+                fx:csv.delimiter "\t" ;
+                fx:csv.headers true ;
+            .
+            [
+                xyz:admin2code ?adm2 ;
+                xyz:geonameId ?adm2Id ;
+            ]
+            BIND(URI(CONCAT("https://sws.geonames.org/", ?adm2Id, "/")) as ?adm2Uri)
+        }
+    }
+}
diff --git a/config/geonames.rml b/config/geonames.rml
diff --git a/config/headers-admin1-codes.txt → config/headers-admin1-codes.csv b/config/headers-admin1-codes.txt → config/headers-admin1-codes.csv
diff --git a/config/headers-admin2-codes.txt → config/headers-admin2-codes.csv b/config/headers-admin2-codes.txt → config/headers-admin2-codes.csv
diff --git a/config/headers-feature-codes.txt b/config/headers-feature-codes.txt
diff --git a/config/headers-gn.txt → config/headers-gn.csv b/config/headers-gn.txt → config/headers-gn.csv
diff --git a/config/places.rq b/config/places.rq
@@ -0,0 +1,75 @@
+# Maps the GeoNames place records in data/geonamesplus.csv (tab-separated, with
+# a header row) to gn:Feature resources.
+#
+# ?parentAdm1 / ?parentAdm2 are resolved outside the SERVICE clause by matching
+# the xyz:admin1Code / xyz:admin2Code triples that config/admin-codes.rq
+# constructs; that output must therefore be available in the queried dataset.
+PREFIX apf: <http://jena.apache.org/ARQ/property#>
+PREFIX fx: <http://sparql.xyz/facade-x/ns/>
+PREFIX gn: <https://www.geonames.org/ontology#>
+PREFIX xyz: <http://sparql.xyz/facade-x/data/>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+# Some measurements. On my local machine, so not super representative.
+#
+# 1M records    3m36.655s   ~400 MB RAM
+# 13M records   ?           ??
+#
+CONSTRUCT {
+    ?uri
+        a gn:Feature ;
+        gn:name ?name ;
+        gn:alternateName ?alternateName ;
+        gn:countryCode ?countryCode ;
+        gn:featureClass ?featureClass ;
+        gn:featureCode ?featureCode ;
+        gn:parentADM1 ?parentAdm1 ;
+        gn:parentADM2 ?parentAdm2 ;
+        wgs84_pos:lat ?latitudeFloat ;
+        wgs84_pos:long ?longitudeFloat ;
+}
+WHERE {
+    SERVICE <x-sparql-anything:> {
+        fx:properties
+            fx:location "data/geonamesplus.csv" ;
+            fx:ondisk "/tmp/sparql-anything-geonames" ; # Ondisk storage is slower but required to prevent OOM.
+            fx:null-string "" ; # Skip empty alternate names.
+            fx:csv.delimiter "\t" ;
+            fx:csv.headers true ;
+        .
+
+        # NOTE(review): with fx:null-string "" empty cells yield no triple, so a
+        # row with an empty value in any column required below (e.g. country
+        # code or feature code) presumably matches nothing and is dropped
+        # entirely — confirm this is intended.
+        ?s xyz:geonameid ?id ;
+            xyz:name ?name ;
+            xyz:country%20code ?countryCode ;
+            xyz:feature%20class ?featureClassString ;
+            xyz:feature%20code ?featureCodeString ;
+            xyz:adm1 ?adm1 ;
+            xyz:adm2 ?adm2 ; # Not all places have a parentAdm2, but we use a special ‘NONE’ value so we can join non-OPTIONALly.
+            xyz:latitude ?latitude ;
+            xyz:longitude ?longitude ;
+        .
+
+        OPTIONAL {
+            ?s xyz:alternatenames ?alternateNameString .
+            ?alternateName apf:strSplit (?alternateNameString ",") .
+        }
+
+        BIND(URI(CONCAT("https://sws.geonames.org/", ?id, "/")) as ?uri)
+        BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString)) as ?featureClass)
+        BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString, ".", ?featureCodeString)) as ?featureCode)
+        BIND(xsd:float(?latitude) AS ?latitudeFloat)
+        BIND(xsd:float(?longitude) AS ?longitudeFloat)
+    }
+
+    # Look up the parent admin features in the preloaded admin-codes output.
+    # NOTE(review): these joins are OPTIONAL even though the adm2 comment above
+    # says otherwise — one of the two looks stale; confirm.
+    OPTIONAL { ?parentAdm1 xyz:admin1Code ?adm1 }
+    OPTIONAL { ?parentAdm2 xyz:admin2Code ?adm2 }
+}
diff --git a/download.sh b/download.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+CONFIG_DIR="$PWD/config"
+DATA_DIR="./data"
+mkdir -p $DATA_DIR
+
+# specify countries to download
+#country_files="NL BE DE "
+country_files="allCountries"
+cp $CONFIG_DIR/headers-gn.csv $DATA_DIR/geonames.csv
+for cfile in $country_files; do
+    mkdir temp
+    cd temp
+    curl -O "https://download.geonames.org/export/dump/$cfile.zip"
+    unzip "$cfile.zip"
+    cd ..
+    cat "temp/$cfile.txt" >> $DATA_DIR/geonames.csv
+    rm -rf temp
+done
+
+# create foreign keys 'adm1' and 'adm2' for the admin1code and admin2code tables
+# $9=country code, $11=admin1 code, $12=admin2 code
+# Explicit NONE so we don't need OPTIONAL joins, which speeds up the mapping process.
+awk 'BEGIN{FS=OFS="\t"} {print $0, (NR > 1 ? $9"."$11 : "adm1"), (NR > 1 ? ($12 != "" ? $9"."$11"."$12 : "NONE") : "adm2")}' $DATA_DIR/geonames.csv > $DATA_DIR/geonamesplus.csv
+rm $DATA_DIR/geonames.csv
+
+# download latest version of generic files
+cp $CONFIG_DIR/headers-admin1-codes.csv $DATA_DIR/admin1-codes.csv
+curl "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> $DATA_DIR/admin1-codes.csv
+
+cp $CONFIG_DIR/headers-admin2-codes.csv $DATA_DIR/admin2-codes.csv
+curl "https://download.geonames.org/export/dump/admin2Codes.txt" >> $DATA_DIR/admin2-codes.csv
diff --git a/geonames-download.sh b/geonames-download.sh