-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Use SPARQL Anything for SPARQL-based mappings. * Process admin codes first, preloading the output in the main mapping process. * Download all countries and simplify download script.
- Loading branch information
Showing
13 changed files
with
155 additions
and
235 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
# Image for the GeoNames harvester: a Java 17 runtime plus the mapper jars and the archive tools the download script needs. | ||
FROM eclipse-temurin:17 | ||
# NOTE(review): this rendered hunk appears to show pre-change and post-change lines together (two LABEL lines, two jar downloads); confirm against the real Dockerfile which of each pair is current. | ||
LABEL org.opencontainers.image.source = "https://github.com/netwerk-digitaal-erfgoed/geonames-harvester" | ||
LABEL org.opencontainers.image.source="https://github.com/netwerk-digitaal-erfgoed/geonames-harvester" | ||
WORKDIR /app | ||
RUN mkdir bin | ||
# -f makes curl exit non-zero on an HTTP error so a failed download breaks the build instead of saving an error page as the jar. | ||
RUN curl -fL https://github.com/RMLio/rmlmapper-java/releases/download/v5.0.0/rmlmapper-5.0.0-r362-all.jar -o bin/rmlmapper.jar | ||
RUN curl -fL https://github.com/SPARQL-Anything/sparql.anything/releases/download/v1.0-DEV.6/sparql-anything-v1.0-DEV.6.jar -o bin/sparql-anything-v1.0-DEV.6.jar | ||
# unzip is listed explicitly because the download script invokes it; depending on it arriving as a recommended package of zip is fragile. | ||
RUN apt-get update && apt-get install -y unzip zip && rm -rf /var/lib/apt/lists/* | ||
COPY . . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Namespace prefix declarations for this query. | ||
PREFIX apf: <http://jena.apache.org/ARQ/property#> | ||
PREFIX fx: <http://sparql.xyz/facade-x/ns/> | ||
PREFIX gn: <https://www.geonames.org/ontology#> | ||
PREFIX xyz: <http://sparql.xyz/facade-x/data/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#> | ||
# rdfs: must resolve to the RDF Schema namespace; 22-rdf-syntax-ns# is the rdf: namespace. | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
|
||
# Link every GeoNames admin division URI to its admin code, one triple per CSV row. | ||
CONSTRUCT { | ||
  ?admin1Uri xyz:admin1Code ?admin1Code . | ||
  ?admin2Uri xyz:admin2Code ?admin2Code . | ||
} | ||
WHERE { | ||
  # Branch 1: rows of the tab-separated admin1 codes file (first row is the header). | ||
  { | ||
    SERVICE <x-sparql-anything:> { | ||
      fx:properties | ||
        fx:location "data/admin1-codes.csv" ; | ||
        fx:csv.delimiter "\t" ; | ||
        fx:csv.headers true ; | ||
      . | ||
      [ | ||
        xyz:admin1code ?admin1Code ; | ||
        xyz:geonameId ?admin1GeonameId ; | ||
      ] | ||
      # The division URI is derived from the row's GeoNames id. | ||
      BIND(URI(CONCAT("https://sws.geonames.org/", ?admin1GeonameId, "/")) AS ?admin1Uri) | ||
    } | ||
  } | ||
  # Branch 2: the same shape for the admin2 codes file. | ||
  UNION { | ||
    SERVICE <x-sparql-anything:> { | ||
      fx:properties | ||
        fx:location "data/admin2-codes.csv" ; | ||
        fx:csv.delimiter "\t" ; | ||
        fx:csv.headers true ; | ||
      . | ||
      [ | ||
        xyz:admin2code ?admin2Code ; | ||
        xyz:geonameId ?admin2GeonameId ; | ||
      ] | ||
      BIND(URI(CONCAT("https://sws.geonames.org/", ?admin2GeonameId, "/")) AS ?admin2Uri) | ||
    } | ||
  } | ||
} |
This file was deleted.
Oops, something went wrong.
File renamed without changes.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Namespace prefix declarations for this query. | ||
PREFIX apf: <http://jena.apache.org/ARQ/property#> | ||
PREFIX fx: <http://sparql.xyz/facade-x/ns/> | ||
PREFIX gn: <https://www.geonames.org/ontology#> | ||
PREFIX xyz: <http://sparql.xyz/facade-x/data/> | ||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
PREFIX wgs84_pos: <http://www.w3.org/2003/01/geo/wgs84_pos#> | ||
# rdfs: must resolve to the RDF Schema namespace; 22-rdf-syntax-ns# is the rdf: namespace. | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
|
||
# Some measurements. On my local machine, so not super representative. | ||
# | ||
# 1M records 3m36.655s ~400 MB RAM | ||
# 13M records ? ?? | ||
# | ||
# Describe each row of data/geonamesplus.csv as a gn:Feature with its names, codes, parent divisions and coordinates. | ||
CONSTRUCT { | ||
?uri | ||
a gn:Feature ; | ||
gn:name ?name ; | ||
gn:alternateName ?alternateName ; | ||
gn:countryCode ?countryCode ; | ||
gn:featureClass ?featureClass ; | ||
gn:featureCode ?featureCode ; | ||
gn:parentADM1 ?parentAdm1 ; | ||
gn:parentADM2 ?parentAdm2 ; | ||
wgs84_pos:lat ?latitudeFloat ; | ||
wgs84_pos:long ?longitudeFloat ; | ||
} | ||
WHERE { | ||
# The SERVICE block reads the tab-separated CSV (first row is the header) through SPARQL Anything. | ||
SERVICE <x-sparql-anything:> { | ||
fx:properties | ||
fx:location "data/geonamesplus.csv" ; | ||
fx:ondisk "/tmp/sparql-anything-geonames" ; # Ondisk storage is slower but required to prevent OOM. | ||
fx:null-string "" ; # Skip empty alternate names. | ||
fx:csv.delimiter "\t" ; | ||
fx:csv.headers true ; | ||
. | ||
|
||
# Required columns; header names containing a space are written with %20. | ||
?s xyz:geonameid ?id ; | ||
xyz:name ?name ; | ||
xyz:country%20code ?countryCode ; | ||
xyz:feature%20class ?featureClassString ; | ||
xyz:feature%20code ?featureCodeString ; | ||
xyz:adm1 ?adm1 ; | ||
xyz:adm2 ?adm2 ; # Not all places have a parentAdm2, but we use a special ‘NONE’ value so we can join non-OPTIONALly. | ||
xyz:latitude ?latitude ; | ||
xyz:longitude ?longitude ; | ||
. | ||
|
||
# Alternate names are optional; the comma-separated string is split into one ?alternateName binding per entry. | ||
OPTIONAL { | ||
?s xyz:alternatenames ?alternateNameString . | ||
?alternateName apf:strSplit (?alternateNameString ",") . | ||
} | ||
|
||
# Mint the feature URI and the feature class/code URIs, and cast the coordinates to xsd:float. | ||
BIND(URI(CONCAT("https://sws.geonames.org/", ?id, "/")) as ?uri) | ||
BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString)) as ?featureClass) | ||
BIND(URI(CONCAT("https://www.geonames.org/ontology#", ?featureClassString, ".", ?featureCodeString)) as ?featureCode) | ||
BIND(xsd:float(?latitude) AS ?latitudeFloat) | ||
BIND(xsd:float(?longitude) AS ?longitudeFloat) | ||
} | ||
|
||
# These patterns sit outside the SERVICE block, so they match the query's own dataset. | ||
# NOTE(review): presumably the admin-code triples produced beforehand are loaded there — confirm against the invoking command. | ||
OPTIONAL { ?parentAdm1 xyz:admin1Code ?adm1 } | ||
OPTIONAL { ?parentAdm2 xyz:admin2Code ?adm2 } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/bin/bash | ||
CONFIG_DIR="$PWD/config" | ||
DATA_DIR="./data" | ||
mkdir -p $DATA_DIR | ||
|
||
# specify countries to download | ||
#country_files="NL BE DE " | ||
country_files="allCountries" | ||
cp $CONFIG_DIR/headers-gn.csv $DATA_DIR/geonames.csv | ||
for cfile in $country_files; do | ||
mkdir temp | ||
cd temp | ||
curl -O "https://download.geonames.org/export/dump/$cfile.zip" | ||
unzip "$cfile.zip" | ||
cd .. | ||
cat "temp/$cfile.txt" >> $DATA_DIR/geonames.csv | ||
rm -rf temp | ||
done | ||
|
||
# create foreign keys 'adm1' and 'adm2' for the admin1code and admin2code tables | ||
# $9=country code, $11=admin1 code, $12=admin2 code | ||
# Explicit NONE so we don't need OPTIONAL joins, which speeds up the mapping process. | ||
awk 'BEGIN{FS=OFS="\t"} {print $0, (NR > 1 ? $9"."$11 : "adm1"), (NR > 1 ? ($12 != "" ? $9"."$11"."$12 : "NONE") : "adm2")}' $DATA_DIR/geonames.csv > $DATA_DIR/geonamesplus.csv | ||
rm $DATA_DIR/geonames.csv | ||
|
||
# download latest version of generic files | ||
cp $CONFIG_DIR/headers-admin1-codes.csv $DATA_DIR/admin1-codes.csv | ||
curl "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> $DATA_DIR/admin1-codes.csv | ||
|
||
cp $CONFIG_DIR/headers-admin2-codes.csv $DATA_DIR/admin2-codes.csv | ||
curl "https://download.geonames.org/export/dump/admin2Codes.txt" >> $DATA_DIR/admin2-codes.csv |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.