Skip to content

Commit

Permalink
Merge branch 'main' of github.com:knaw-huc/sd-skosmos
Browse files Browse the repository at this point in the history
  • Loading branch information
jarno-knaw committed Sep 9, 2024
2 parents 4e075b9 + 47de985 commit 771e683
Show file tree
Hide file tree
Showing 12 changed files with 171 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [v2.15-1.2.0]

### Added
- Added support for YAML configurations.
- Allow specifying external config/rdf files.
- Added support for trig files.
- Import vocabularies every hour using cron.
- Added a configuration option for the refresh interval of vocabularies.

## [v2.15-1.1.0]

Expand Down
9 changes: 8 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG SKOSMOS_TARGZ_RELEASE_URL=https://github.com/knaw-huc/Skosmos/archive/refs/t

# general server setup and locale
RUN apt-get update && \
apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip && \
apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip cron && \
for locale in en_GB en_US fi_FI fr_FR sv_SE; do \
echo "${locale}.UTF-8 UTF-8" >> /etc/locale.gen ; \
done && \
Expand Down Expand Up @@ -40,7 +40,14 @@ RUN /usr/bin/env pip install -r /var/www/requirements.txt
# Configure Skosmos
COPY skosmos-repository.ttl /var/www/
COPY entrypoint.sh /var/www/
COPY entrypoint_cron.sh /var/www/
COPY ./src /var/www/src
COPY entrypoint.py /var/www/
COPY config-docker-compose.ttl /var/www/html/

# Prepare CRON
COPY crontab /var/www/crontab
COPY entrypoint_cron.sh /var/www/
COPY crontask.sh /var/www/

ENTRYPOINT ["/var/www/entrypoint.sh"]
4 changes: 4 additions & 0 deletions crontab
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Template for the cron table. entrypoint_cron.sh passes this file through envsubst,
# which fills in the two placeholders below from the container environment before
# the table is installed.
SPARQL_ENDPOINT=$SPARQL_ENDPOINT
DATA=$DATA

# At minute 0 of every hour: run the import task, appending stdout and stderr to the cron log.
0 * * * * /var/www/crontask.sh >> /var/log/cron.log 2>&1
6 changes: 6 additions & 0 deletions crontask.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Task run by cron: refresh the vocabularies and regenerate the Skosmos configuration.

# Trace every command; the crontab entry redirects stdout and stderr to /var/log/cron.log.
set -x

# Rebuilds /config/config-docker-compose.ttl and (re)loads vocabularies where needed.
# NOTE(review): there is no `set -e`, so the envsubst step below still runs when this fails.
/var/www/entrypoint.py

# Substitute environment variables in the generated template to produce the final config.
/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl
21 changes: 21 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,29 @@ services:
- '8080:80'
volumes:
- ./data:/data
- skosmos-configuration:/config
depends_on:
- graphdb
skosmos-cron:
container_name: 'skosmos-cron'
build:
context: .
dockerfile: Dockerfile
environment:
- SPARQL_ENDPOINT=http://host.docker.internal:7200/repositories/skosmos
- DATA=${DATA:-/data}
networks:
- skosmos
volumes:
- ./data:/data
- skosmos-configuration:/config
entrypoint:
- /var/www/entrypoint_cron.sh

networks:
skosmos:
external: false

volumes:
skosmos-configuration:
external: false
45 changes: 32 additions & 13 deletions entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
import shutil
import time
from pathlib import Path
from typing import IO

from src.exceptions import InvalidConfigurationException
from src.graphdb import get_loaded_vocabs, setup_graphdb
from src.graphdb import get_loaded_vocabs, set_timestamp, setup_graphdb, update_timestamp
from src.vocabularies import get_file_from_config, get_graph, load_vocab_yaml, load_vocabulary


def append_file(source, dest):
def append_file(source: IO, dest: str):
"""
Append source to dest file.
:param source: A file pointer to a source file.
Expand All @@ -31,19 +32,21 @@ def append_file(source, dest):
df.write(line)


if __name__ == "__main__":
time.sleep(10)

def main() -> None:
"""
Main function.
:return:
"""
data = os.environ["DATA"]

if os.path.isfile(f'{data}/config.ttl'):
shutil.copy(f'{data}/config.ttl', '/tmp/config-docker-compose.ttl')
shutil.copy(f'{data}/config.ttl', '/config/config-docker-compose.ttl')
else:
shutil.copy('/var/www/html/config-docker-compose.ttl', '/tmp/config-docker-compose.ttl')
shutil.copy('/var/www/html/config-docker-compose.ttl', '/config/config-docker-compose.ttl')

if os.path.isfile(f'{data}/config-ext.ttl'):
with open(f'{data}/config-ext.ttl', 'r', encoding='utf-8') as f:
append_file(f, '/tmp/config-docker-compose.ttl')
append_file(f, '/config/config-docker-compose.ttl')

setup_graphdb()

Expand All @@ -60,17 +63,33 @@ def append_file(source, dest):
with get_file_from_config(vocab_config['config'], data) as config:
graph = get_graph(config)
print(f"Graph: {graph}")
with get_file_from_config(vocab_config['config'], data) as config:
# Reset file pointer
append_file(config, "/tmp/config-docker-compose.ttl")

always_load = vocab_config['config'].get('alwaysRefresh', False)
reload = False
if graph not in loaded_vocabs:
reload = True
elif vocab_config['config'].get('refresh', False):
interval = vocab_config['config'].get('refreshInterval', 0)
diff = (time.time() - loaded_vocabs[graph]) / 3600
reload = diff > interval

if always_load or graph not in loaded_vocabs:
if reload:
print(f"Loading vocabulary {vocab}")
load_vocabulary(vocab_config['source'], data, graph)
if graph in loaded_vocabs:
update_timestamp(graph, int(time.time()))
else:
set_timestamp(graph, int(time.time()))
print("... DONE")

# Doing this last makes sure the vocab isn't added to the config when there's a problem
with get_file_from_config(vocab_config['config'], data) as config:
append_file(config, "/config/config-docker-compose.ttl")
except InvalidConfigurationException as e:
print(f"Invalid configuration: {e}")
print(f"Skipping vocab '{vocab}'")
continue


if __name__ == "__main__":
time.sleep(10)
main()
4 changes: 3 additions & 1 deletion entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ set -x

/var/www/entrypoint.py

/usr/bin/envsubst < /tmp/config-docker-compose.ttl > /var/www/html/config.ttl
/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl

ln -s /config/config.ttl /var/www/html/config.ttl
cat /var/www/html/config.ttl

/usr/sbin/apache2ctl -D FOREGROUND
8 changes: 8 additions & 0 deletions entrypoint_cron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Entrypoint of the cron container: install the cron table, start cron and follow its log.

# Render the crontab template with the container's environment and install it from stdin
# as the current user's crontab. Nothing is written to /etc/cron.d: files in that directory
# must have a user field between the schedule and the command, which this table lacks,
# so cron would reject such a copy while the user crontab is the one that actually runs.
/usr/bin/envsubst < /var/www/crontab | crontab -

# Make sure the log file exists before tail starts following it.
touch /var/log/cron.log

# Start the cron daemon, then keep the container in the foreground by streaming the job log.
cron && tail -f /var/log/cron.log
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
PyYAML~=6.0.1
requests~=2.31.0

SPARQLWrapper~=2.0.0
89 changes: 72 additions & 17 deletions src/graphdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
This file contains functions for interacting with GraphDB
"""
import os
from typing import TextIO

import requests

from SPARQLWrapper import SPARQLWrapper, JSON, POST, DIGEST

admin_password = os.environ.get("ADMIN_PASSWORD", '')
endpoint = os.environ.get("SPARQL_ENDPOINT", '')


def setup_graphdb():
def setup_graphdb() -> None:
"""
Setup graphdb, if it isn't set up yet.
:return:
Expand All @@ -33,28 +37,79 @@ def setup_graphdb():
print(f"EXISTS GRAPHDB [{endpoint}]]")


def get_loaded_vocabs() -> dict[str, int]:
    """
    Get all loaded vocabularies from GraphDB
    :return: A dict mapping each graph IRI to the timestamp stored for it via
        ``dcterms:modified``. Timestamp triples that live inside a named graph are
        excluded by the query.
    """
    query = """
        SELECT ?graph ?timestamp
        WHERE {
            ?graph <http://purl.org/dc/terms/modified> ?timestamp .
            FILTER NOT EXISTS {
                GRAPH ?g {?graph <http://purl.org/dc/terms/modified> ?timestamp .}
            }
        }
        ORDER BY ?timestamp
    """
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(JSON)
    client.setQuery(query)
    bindings = client.queryAndConvert()['results']['bindings']
    # Rows arrive ordered by timestamp; a later row for the same graph overwrites an earlier one.
    return {row['graph']['value']: int(row['timestamp']['value']) for row in bindings}


def get_type(extension):
def set_timestamp(graph_name: str, timestamp: int) -> None:
    """
    Set a timestamp for a new graph.
    :param graph_name: IRI of the graph; it is placed between angle brackets in the update.
    :param timestamp: Value written as the object of the ``dcterms:modified`` triple.
    :return:
    """
    template = """INSERT DATA {{
        <{graph}> <http://purl.org/dc/terms/modified> {timestamp} .
    }}"""
    update = template.format(graph=graph_name, timestamp=timestamp)

    # Updates go to the repository's /statements endpoint, authenticated as admin.
    client = SPARQLWrapper(f"{endpoint}/statements")
    client.setHTTPAuth(DIGEST)
    client.setCredentials("admin", admin_password)
    client.setMethod(POST)

    # Echo the update before sending it.
    print(update)
    client.setQuery(update)
    client.query()


def update_timestamp(graph_name: str, timestamp: int) -> None:
    """
    Set a timestamp for an existing graph.
    :param graph_name: IRI of the graph; it is placed between angle brackets in the update.
    :param timestamp: Value that replaces the stored ``dcterms:modified`` object.
    :return:
    """
    # The WHERE clause matches the stored timestamp, so the update only has an effect
    # when the graph already has one.
    template = """
        DELETE {{
            <{graph}> <http://purl.org/dc/terms/modified> ?timestamp .
        }}
        INSERT {{
            <{graph}> <http://purl.org/dc/terms/modified> {timestamp} .
        }}
        WHERE {{
            <{graph}> <http://purl.org/dc/terms/modified> ?timestamp .
        }}
    """
    update = template.format(graph=graph_name, timestamp=timestamp)

    # Updates go to the repository's /statements endpoint, authenticated as admin.
    client = SPARQLWrapper(f"{endpoint}/statements")
    client.setHTTPAuth(DIGEST)
    client.setCredentials("admin", admin_password)
    client.setMethod(POST)
    client.setQuery(update)
    client.query()


def get_type(extension: str) -> str:
"""
Get the http mimetype based on the extension of a file.
:param extension:
Expand All @@ -68,7 +123,7 @@ def get_type(extension):
return "text/turtle"


def add_vocabulary(graph, graph_name, extension):
def add_vocabulary(graph: TextIO, graph_name: str, extension: str) -> None:
"""
Add a vocabulary to GraphDB
:param graph: File
Expand Down
20 changes: 11 additions & 9 deletions src/vocabularies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
import re
import urllib.request
import urllib.parse
from pathlib import Path
from typing import IO, TextIO

import yaml

from src.exceptions import InvalidConfigurationException, UnknownAuthenticationTypeException
from src.graphdb import add_vocabulary


def get_file_from_config(config_data, data_dir):
def get_file_from_config(config_data: dict, data_dir: str) -> TextIO:
"""
Get the config file from yaml data.
:param config_data: The configuration, a dict with information about the file.
Expand Down Expand Up @@ -52,7 +55,7 @@ def get_file_from_config(config_data, data_dir):
raise InvalidConfigurationException("Type must be file")


def load_vocabulary(source_data, data_dir, graph_name):
def load_vocabulary(source_data: dict, data_dir: str, graph_name: str) -> None:
"""
Load a vocabulary using the source data from the yaml.
:param source_data:
Expand All @@ -64,25 +67,24 @@ def load_vocabulary(source_data, data_dir, graph_name):
add_vocabulary(vocab_file, graph_name, get_vocab_format(source_data))


def get_graph(fp: IO) -> str:
    """
    Get the sparql graph from the given vocab
    :param fp: The vocabulary config, a file pointer. It may yield ``str`` lines (text
        mode) or ``bytes`` lines (binary mode).
    :return: The second whitespace-separated token of the first line mentioning
        ``sparqlGraph``, without surrounding angle brackets, or an empty string when
        there is no such line.
    """
    for line in fp:
        # Only bytes-like lines need decoding; str has no decode(), so calling it
        # unconditionally would raise AttributeError for text-mode files.
        if isinstance(line, (bytes, bytearray)):
            # Replace undecodable bytes instead of carrying a bytes object further.
            line = line.decode('utf-8', errors='replace')
        if "sparqlGraph" not in line:
            continue
        # Split on any whitespace so tabs or repeated spaces do not yield an empty token.
        tokens = line.split()
        if len(tokens) < 2:
            # The property is mentioned without a value on this line; keep looking.
            continue
        return tokens[1].strip("<>")
    return ""


def load_vocab_yaml(file_location):
def load_vocab_yaml(file_location: Path) -> dict:
"""
Open a yaml config file and return a dict with its contents
:param file_location:
Expand All @@ -92,7 +94,7 @@ def load_vocab_yaml(file_location):
return yaml.safe_load(fp)


def get_vocab_format(source_data):
def get_vocab_format(source_data: dict) -> str:
"""
Return the vocab format of the given data source. It is either based on the file extension,
or on an override in the yaml file.
Expand All @@ -101,4 +103,4 @@ def get_vocab_format(source_data):
"""
if 'format' in source_data:
return source_data['format']
return source_data['location'].split('.')[-1]
return source_data['location'].split('?')[0].split('.')[-1]

0 comments on commit 771e683

Please sign in to comment.