-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Use SPARQL Anything for SPARQL-based mappings. * Process admin codes first, preloading the output in the main mapping process. * Download all countries and simplify download script.
- Loading branch information
Showing
13 changed files
with
160 additions
and
183 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
FROM eclipse-temurin:17 | ||
LABEL org.opencontainers.image.source = "https://github.com/netwerk-digitaal-erfgoed/geonames-harvester" | ||
LABEL org.opencontainers.image.source="https://github.com/netwerk-digitaal-erfgoed/geonames-harvester" | ||
WORKDIR /app | ||
# The mapping tool jars are fetched at build time. -f makes curl exit non-zero on an HTTP error, | ||
# so a failed download breaks the build instead of storing an error page as a .jar. | ||
RUN mkdir bin | ||
RUN curl -fL https://github.com/carml/carml-jar/raw/nde/carml-jar-jena-1.4.0-SNAPSHOT-0.4.10.jar -o bin/carml.jar | ||
RUN curl -fL https://github.com/SPARQL-Anything/sparql.anything/releases/download/v1.0-DEV.6/sparql-anything-v1.0-DEV.6.jar -o bin/sparql-anything-v1.0-DEV.6.jar | ||
# unzip is named explicitly (the download script calls it) rather than relying on it arriving | ||
# as a recommended package of zip, which --no-install-recommends would skip. | ||
RUN apt-get update && apt-get install -y --no-install-recommends unzip zip && rm -rf /var/lib/apt/lists/* | ||
COPY . . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Builds lookup triples that attach each admin-1 / admin-2 code to its GeoNames URI. | ||
# Input: the tab-separated files data/admin1-codes.csv and data/admin2-codes.csv (with header rows). | ||
# Output: ?adm1Uri xyz:admin1Code ?adm1 and ?adm2Uri xyz:admin2Code ?adm2. | ||
PREFIX apf: <http://jena.apache.org/ARQ/property#> | ||
PREFIX fx: <http://sparql.xyz/facade-x/ns/> | ||
PREFIX gn: <https://www.geonames.org/ontology#> | ||
PREFIX xyz: <http://sparql.xyz/facade-x/data/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
|
||
CONSTRUCT { | ||
?adm1Uri xyz:admin1Code ?adm1 . | ||
?adm2Uri xyz:admin2Code ?adm2 . | ||
} | ||
WHERE { | ||
{ | ||
# Admin-1 codes: one row per code, keyed by its geonameId. | ||
SERVICE <x-sparql-anything:> { | ||
fx:properties fx:location "data/admin1-codes.csv" ; | ||
fx:csv.delimiter "\t" ; | ||
fx:csv.headers true ; | ||
. | ||
[ | ||
xyz:admin1code ?adm1 ; | ||
xyz:geonameId ?adm1Id ; | ||
] | ||
BIND(URI(CONCAT("https://sws.geonames.org/", ?adm1Id, "/")) as ?adm1Uri) | ||
} | ||
} | ||
UNION { | ||
# Admin-2 codes: same shape as the admin-1 branch, read from the admin-2 file. | ||
SERVICE <x-sparql-anything:> { | ||
fx:properties fx:location "data/admin2-codes.csv" ; | ||
fx:csv.delimiter "\t" ; | ||
fx:csv.headers true ; | ||
. | ||
[ | ||
xyz:admin2code ?adm2 ; | ||
xyz:geonameId ?adm2Id ; | ||
] | ||
BIND(URI(CONCAT("https://sws.geonames.org/", ?adm2Id, "/")) as ?adm2Uri) | ||
} | ||
} | ||
} |
This file was deleted.
Oops, something went wrong.
File renamed without changes.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Maps each row of the tab-separated GeoNames CSV to a gn:Feature with names, country code, | ||
# feature class/code, WGS84 coordinates and links to its parent admin-1 / admin-2 features. | ||
PREFIX apf: <http://jena.apache.org/ARQ/property#> | ||
PREFIX fx: <http://sparql.xyz/facade-x/ns/> | ||
PREFIX gn: <https://www.geonames.org/ontology#> | ||
PREFIX xyz: <http://sparql.xyz/facade-x/data/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
|
||
# Measurements | ||
# ~8 GB RAM or is that the default max heap? without ondisk for geonames-1M.csv | ||
# | ||
# 10K 195 s 3 s (without adm2 join) | ||
# 1M 72 s ~8 GB RAM without ondisk | ||
# 13M OOM | ||
# 1M ondisk 3m36.655s | ||
# 13M ondisk ~400 MB RAM | ||
# | ||
# It was important to nest SERVICE clauses. | ||
CONSTRUCT { | ||
?uri | ||
a gn:Feature ; | ||
gn:name ?name ; | ||
gn:alternateName ?alternateName ; | ||
gn:countryCode ?countryCode ; | ||
gn:featureClass ?featureClass ; | ||
gn:featureCode ?featureCode ; | ||
gn:parentADM1 ?parentAdm1 ; | ||
gn:parentADM2 ?parentAdm2 ; | ||
wgs84_pos:lat ?latitudeFloat ; | ||
wgs84_pos:long ?longitudeFloat ; | ||
} | ||
WHERE { | ||
SERVICE <x-sparql-anything:> { | ||
# NOTE(review): geonames1M.csv looks like a benchmark sample (see the measurements above), while the | ||
# download script in this commit writes data/geonamesplus.csv; confirm which file real runs should read. | ||
fx:properties | ||
fx:location "data/geonames1M.csv" ; | ||
# fx:location "data/geonamesplus.csv" ; | ||
fx:ondisk "/tmp/sparql-anything-geonames3" ; | ||
fx:null-string "" ; # Skip empty alternate names. | ||
fx:csv.delimiter "\t" ; | ||
fx:csv.headers true ; | ||
. | ||
?s xyz:geonameid ?id ; | ||
xyz:name ?name ; | ||
xyz:country%20code ?countryCode ; | ||
xyz:feature%20class ?featureClassString ; | ||
xyz:feature%20code ?featureCodeString ; | ||
xyz:adm1 ?adm1 ; | ||
xyz:adm2 ?adm2 ; # Not all places have a parentAdm2, but we use a special ‘NONE’ value to prevent OPTIONAL joins. | ||
xyz:latitude ?latitude ; | ||
xyz:longitude ?longitude ; | ||
. | ||
|
||
# Alternate names arrive as one comma-separated string; split it into one binding per name. | ||
OPTIONAL { | ||
?s xyz:alternatenames ?alternateNameString . | ||
?alternateName apf:strSplit (?alternateNameString ",") . | ||
} | ||
|
||
BIND(URI(CONCAT("https://sws.geonames.org/", ?id, "/")) as ?uri) | ||
BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString)) as ?featureClass) | ||
BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString, ".", ?featureCodeString)) as ?featureCode) | ||
BIND(xsd:float(?latitude) AS ?latitudeFloat) | ||
BIND(xsd:float(?longitude) AS ?longitudeFloat) | ||
} | ||
|
||
# Resolve the adm1/adm2 keys to parent URIs outside the SERVICE. Per the commit message the admin-code | ||
# triples are preloaded from the admin-codes mapping; TODO confirm how they are loaded. | ||
OPTIONAL { ?parentAdm1 xyz:admin1Code ?adm1 } | ||
OPTIONAL { ?parentAdm2 xyz:admin2Code ?adm2 } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/bin/bash | ||
# Downloads the GeoNames dumps and prepares the CSV inputs in ./data: | ||
# geonamesplus.csv (places plus adm1/adm2 key columns), admin1-codes.csv and admin2-codes.csv. | ||
# Stop at the first failing command, unset variable or failed pipeline stage instead of silently continuing. | ||
set -euo pipefail | ||
CONFIG_DIR="$PWD/config" | ||
DATA_DIR="./data" | ||
mkdir -p "$DATA_DIR" | ||
|
||
# specify countries to download | ||
#country_files="NL BE DE " | ||
country_files="allCountries" | ||
cp "$CONFIG_DIR/headers-gn.csv" "$DATA_DIR/geonames.csv" | ||
# $country_files is deliberately unquoted: it is a space-separated list of dump names. | ||
for cfile in $country_files; do | ||
  # A fresh temporary directory per dump avoids clashing with an existing ./temp and needs no cd. | ||
  work_dir=$(mktemp -d) | ||
  # -f: exit non-zero on an HTTP error instead of saving the error page as the zip. | ||
  curl -f -o "$work_dir/$cfile.zip" "https://download.geonames.org/export/dump/$cfile.zip" | ||
  unzip "$work_dir/$cfile.zip" -d "$work_dir" | ||
  cat "$work_dir/$cfile.txt" >> "$DATA_DIR/geonames.csv" | ||
  rm -rf "$work_dir" | ||
done | ||
|
||
# create foreign keys 'adm1' and 'adm2' for the admin1code and admin2code tables | ||
# $9=country code, $11=admin1 code, $12=admin2 code | ||
# Explicit NONE so we don't need OPTIONAL joins, which speeds up the mapping process. | ||
awk 'BEGIN{FS=OFS="\t"} {print $0, (NR > 1 ? $9"."$11 : "adm1"), (NR > 1 ? ($12 != "" ? $9"."$11"."$12 : "NONE") : "adm2")}' "$DATA_DIR/geonames.csv" > "$DATA_DIR/geonamesplus.csv" | ||
rm "$DATA_DIR/geonames.csv" | ||
|
||
# download latest version of generic files | ||
# -f keeps an HTTP error body from being appended below the CSV header. | ||
cp "$CONFIG_DIR/headers-admin1-codes.csv" "$DATA_DIR/admin1-codes.csv" | ||
curl -f "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> "$DATA_DIR/admin1-codes.csv" | ||
|
||
cp "$CONFIG_DIR/headers-admin2-codes.csv" "$DATA_DIR/admin2-codes.csv" | ||
curl -f "https://download.geonames.org/export/dump/admin2Codes.txt" >> "$DATA_DIR/admin2-codes.csv" |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
#! /bin/bash | ||
# Runs the `fuseki` docker-compose service from the fuseki/ directory as a one-off container named | ||
# gnserver (removed on exit, service ports published), passing it a GeoNames dump file and /geonames. | ||
cd fuseki | ||
docker-compose run --rm --name gnserver --service-ports fuseki --file=/fuseki/databases/geonames.nt /geonames | ||
cd .. | ||
# NOTE(review): this fragment is a diff rendering. The .nt invocation above appears to be the removed | ||
# line and the .ttl invocation below its replacement; confirm before running the lines as shown. | ||
docker-compose run --rm --name gnserver --service-ports fuseki --file=/fuseki/databases/geonames.ttl /geonames
cd ..