Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Process complete GeoNames #18

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ FROM eclipse-temurin:17
# Link the image to its source repository (OCI image annotation).
# LABEL must be written as key=value with no spaces around '='; with spaces
# the legacy "LABEL key value" form applies and the value becomes '= "https://..."'.
LABEL org.opencontainers.image.source="https://github.com/netwerk-digitaal-erfgoed/geonames-harvester"
WORKDIR /app
RUN mkdir bin
# -f makes curl exit non-zero on an HTTP error instead of saving the error page as the jar.
RUN curl -fL https://github.com/RMLio/rmlmapper-java/releases/download/v5.0.0/rmlmapper-5.0.0-r362-all.jar -o bin/rmlmapper.jar
RUN curl -fL https://github.com/carml/carml-jar/raw/nde/carml-jar-jena-1.4.0-SNAPSHOT-0.4.10.jar -o bin/carml.jar
# Install zip and drop the apt package lists in the same layer.
RUN apt-get update && apt-get install zip -y && rm -rf /var/lib/apt/lists/*
# Copy the build context into the working directory (/app).
COPY . .
76 changes: 19 additions & 57 deletions config/geonames.rml → config/geonames.ttl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix csvw: <http://www.w3.org/ns/csvw#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix : <http://example.org/rules/>.
Expand All @@ -12,26 +11,16 @@
@prefix grel: <http://users.ugent.be/~bjdmeest/function/grel.ttl#> .
@prefix idlab-fn: <http://example.com/idlab/function/> .

:GeonamesSource
:GeonamesSource
a rml:LogicalSource ;
rml:source [
rdf:type csvw:Table;
csvw:url "geonamesplus.txt";
csvw:dialect [
rdf:type csvw:Dialect;
csvw:delimiter "\t";
csvw:encoding "UTF-8";
csvw:trim true;
csvw:null ""; # this doesn't seem to have any effect, sadly!
]
];
rml:source "geonamesplus.txt";
rml:referenceFormulation ql:CSV .

:GeonamesMap
:GeonamesMap
a rr:TriplesMap;
rml:logicalSource :GeonamesSource ;
rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameid}/" ];
rr:predicateObjectMap
rr:predicateObjectMap
[
rr:predicate gn:name;
rr:objectMap [ rml:reference "name" ]
Expand All @@ -46,13 +35,13 @@
],
[
rr:predicate gn:featureClass;
rr:objectMap [
rr:objectMap [
rr:template "https://www.geonames.org/ontology#{feature class}"
]
],
[
rr:predicate gn:featureCode;
rr:objectMap [
rr:objectMap [
rr:template "https://www.geonames.org/ontology#{feature class}.{feature code}"
]
],
Expand Down Expand Up @@ -92,10 +81,11 @@


:AlternateNamesSplit
rr:termType rr:Literal;
fnml:functionValue [
rml:logicalSource :LogicalSource;
rr:predicateObjectMap [
rr:predicate fno:executes;
rr:predicate fno:executes;
rr:objectMap [ rr:constant grel:string_split ];
];
rr:predicateObjectMap [
Expand All @@ -108,69 +98,41 @@
];
].

:Admin1codeSource
:Admin1codeSource
a rml:LogicalSource ;
rml:source [
rdf:type csvw:Table;
csvw:url "admin1-codes.txt";
csvw:dialect [
rdf:type csvw:Dialect;
csvw:delimiter "\t";
csvw:encoding "UTF-8";
csvw:trim true;
];
];
rml:source "admin1-codes.txt";
rml:referenceFormulation ql:CSV .


:Adm1Map
:Adm1Map
a rr:TriplesMap;
rml:logicalSource :Admin1codeSource ;
rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameId}/" ].

:Admin2codeSource
:Admin2codeSource
a rml:LogicalSource ;
rml:source [
rdf:type csvw:Table;
csvw:url "admin2-codes.txt";
csvw:dialect [
rdf:type csvw:Dialect;
csvw:delimiter "\t";
csvw:encoding "UTF-8";
csvw:trim true;
];
];
rml:source "admin2-codes.txt";
rml:referenceFormulation ql:CSV .

:Adm2Map
:Adm2Map
a rr:TriplesMap;
rml:logicalSource :Admin2codeSource ;
rr:subjectMap [ rr:template "https://sws.geonames.org/{geonameId}/" ].

:FeatureCodeSource
:FeatureCodeSource
a rml:LogicalSource ;
rml:source [
rdf:type csvw:Table;
csvw:url "feature-codes.txt";
csvw:dialect [
rdf:type csvw:Dialect;
csvw:delimiter "\t";
csvw:encoding "UTF-8";
csvw:trim true;
]
];
rml:source "feature-codes.txt";
rml:referenceFormulation ql:CSV .

:FeatureCodeTriplesMap
:FeatureCodeTriplesMap
a rr:TriplesMap;
rml:logicalSource :FeatureCodeSource ;
rr:subjectMap [ rr:template "https://www.geonames.org/ontology#{featureCode}" ];
rr:predicateObjectMap
rr:predicateObjectMap
[
rr:predicate gn:name;
rr:objectMap [ rml:reference "name" ]
],
[
rr:predicate rdfs:comment;
rr:objectMap [ rml:reference "description" ]
].
].
22 changes: 6 additions & 16 deletions geonames-download.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
#!/bin/bash
CONFIG_DIR="$PWD/config"
DATA_DIR="./data"
if [ ! -d "$DATA_DIR" ]; then
mkdir $DATA_DIR
fi
mkdir -p $DATA_DIR

# specify countries to download
country_files="NL BE DE"
#country_files="NL BE DE "
country_files="allCountries"
cp $CONFIG_DIR/headers-gn.txt $DATA_DIR/geonames.txt
for cfile in $country_files; do
mkdir temp
Expand All @@ -25,19 +24,10 @@ rm $DATA_DIR/geonames.txt

# download latest version of generic files
cp $CONFIG_DIR/headers-feature-codes.txt $DATA_DIR/feature-codes.txt
curl -O "https://download.geonames.org/export/dump/featureCodes_en.txt"
cat featureCodes_en.txt >> $DATA_DIR/feature-codes.txt
rm featureCodes_en.txt
curl "https://download.geonames.org/export/dump/featureCodes_en.txt" >> $DATA_DIR/feature-codes.txt

cp $CONFIG_DIR/headers-admin1-codes.txt $DATA_DIR/admin1-codes.txt
curl -O "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
cat admin1CodesASCII.txt >> $DATA_DIR/admin1-codes.txt
rm admin1CodesASCII.txt
curl "https://download.geonames.org/export/dump/admin1CodesASCII.txt" >> $DATA_DIR/admin1-codes.txt

cp $CONFIG_DIR/headers-admin2-codes.txt $DATA_DIR/admin2-codes.txt
curl -O "https://download.geonames.org/export/dump/admin2Codes.txt"
# remove double quotes that seem to appear within some strings
# to prevent RML from crashing
sed -i 's/"//g' admin2Codes.txt
cat admin2Codes.txt >> $DATA_DIR/admin2-codes.txt
rm admin2Codes.txt
curl "https://download.geonames.org/export/dump/admin2Codes.txt" >> $DATA_DIR/admin2-codes.txt
6 changes: 1 addition & 5 deletions map.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,4 @@ CONFIG_DIR="$PWD/config"
OUTPUT_DIR="$DATA_DIR"
cd $DATA_DIR
echo "Running the RMLmapper to convert geonames data to RDF, be patient..."
java -jar $BIN_DIR/rmlmapper.jar -m $CONFIG_DIR/geonames.rml -o $OUTPUT_DIR/geonames.nt
cd ..
# This RML mapping creates triples with empty object values.
# The following sed command removes every line that matches the "". pattern.
sed -i '/\"\"./d' $OUTPUT_DIR/geonames.nt
java -jar $BIN_DIR/carml.jar map -m $CONFIG_DIR/geonames.ttl -r . -o $OUTPUT_DIR/geonames.nt