From 9969e72637db7842111d6836bb4109c66cc9f87c Mon Sep 17 00:00:00 2001 From: Jarno Bakker Date: Tue, 30 Jul 2024 17:15:21 +0200 Subject: [PATCH 1/4] added cron for running import --- CHANGELOG.md | 1 + Dockerfile | 9 ++++++++- crontab | 4 ++++ crontask.sh | 6 ++++++ docker-compose.yml | 21 +++++++++++++++++++++ entrypoint.py | 13 +++++++------ entrypoint.sh | 4 +++- entrypoint_cron.sh | 8 ++++++++ 8 files changed, 58 insertions(+), 8 deletions(-) create mode 100644 crontab create mode 100755 crontask.sh create mode 100755 entrypoint_cron.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 69b3be4..56ddd8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for YAML configurations. - Allow specifying external config/rdf files. - Added support for trig files. +- Import vocabularies every hour using cron. ## [v2.15-1.1.0] diff --git a/Dockerfile b/Dockerfile index aea67f8..bf46a0b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ ARG SKOSMOS_TARGZ_RELEASE_URL=https://github.com/knaw-huc/Skosmos/archive/refs/t # general server setup and locale RUN apt-get update && \ - apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip && \ + apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip cron && \ for locale in en_GB en_US fi_FI fr_FR sv_SE; do \ echo "${locale}.UTF-8 UTF-8" >> /etc/locale.gen ; \ done && \ @@ -40,7 +40,14 @@ RUN /usr/bin/env pip install -r /var/www/requirements.txt # Configure Skosmos COPY skosmos-repository.ttl /var/www/ COPY entrypoint.sh /var/www/ +COPY entrypoint_cron.sh /var/www/ COPY ./src /var/www/src COPY entrypoint.py /var/www/ COPY config-docker-compose.ttl /var/www/html/ + +# Prepare CRON +COPY crontab /var/www/crontab +COPY entrypoint_cron.sh /var/www/ +COPY crontask.sh /var/www/ + ENTRYPOINT ["/var/www/entrypoint.sh"] diff --git a/crontab b/crontab new file mode 
100644 index 0000000..47be358 --- /dev/null +++ b/crontab @@ -0,0 +1,4 @@ +SPARQL_ENDPOINT=$SPARQL_ENDPOINT +DATA=$DATA + +0 * * * * /var/www/crontask.sh >> /var/log/cron.log 2>&1 diff --git a/crontask.sh b/crontask.sh new file mode 100755 index 0000000..f0146ff --- /dev/null +++ b/crontask.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -x + +/var/www/entrypoint.py + +/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl diff --git a/docker-compose.yml b/docker-compose.yml index 11c2414..712a766 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,8 +34,29 @@ services: - '8080:80' volumes: - ./data:/data + - skosmos-configuration:/config depends_on: - graphdb + skosmos-cron: + container_name: 'skosmos-cron' + build: + context: . + dockerfile: Dockerfile + environment: + - SPARQL_ENDPOINT=http://host.docker.internal:7200/repositories/skosmos + - DATA=${DATA:-/data} + networks: + - skosmos + volumes: + - ./data:/data + - skosmos-configuration:/config + entrypoint: + - /var/www/entrypoint_cron.sh + networks: skosmos: external: false + +volumes: + skosmos-configuration: + external: false diff --git a/entrypoint.py b/entrypoint.py index 522e6a9..1311731 100755 --- a/entrypoint.py +++ b/entrypoint.py @@ -37,13 +37,13 @@ def append_file(source, dest): data = os.environ["DATA"] if os.path.isfile(f'{data}/config.ttl'): - shutil.copy(f'{data}/config.ttl', '/tmp/config-docker-compose.ttl') + shutil.copy(f'{data}/config.ttl', '/config/config-docker-compose.ttl') else: - shutil.copy('/var/www/html/config-docker-compose.ttl', '/tmp/config-docker-compose.ttl') + shutil.copy('/var/www/html/config-docker-compose.ttl', '/config/config-docker-compose.ttl') if os.path.isfile(f'{data}/config-ext.ttl'): with open(f'{data}/config-ext.ttl', 'r', encoding='utf-8') as f: - append_file(f, '/tmp/config-docker-compose.ttl') + append_file(f, '/config/config-docker-compose.ttl') setup_graphdb() @@ -60,9 +60,6 @@ def append_file(source, dest): with 
get_file_from_config(vocab_config['config'], data) as config: graph = get_graph(config) print(f"Graph: {graph}") - with get_file_from_config(vocab_config['config'], data) as config: - # Reset file pointer - append_file(config, "/tmp/config-docker-compose.ttl") always_load = vocab_config['config'].get('alwaysRefresh', False) @@ -70,6 +67,10 @@ def append_file(source, dest): print(f"Loading vocabulary {vocab}") load_vocabulary(vocab_config['source'], data, graph) print("... DONE") + + # Doing this last makes sure the vocab isn't added to the config when there's a problem + with get_file_from_config(vocab_config['config'], data) as config: + append_file(config, "/config/config-docker-compose.ttl") except InvalidConfigurationException as e: print(f"Invalid configuration: {e}") print(f"Skipping vocab '{vocab}'") diff --git a/entrypoint.sh b/entrypoint.sh index 8c5d168..e80278c 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -3,7 +3,9 @@ set -x /var/www/entrypoint.py -/usr/bin/envsubst < /tmp/config-docker-compose.ttl > /var/www/html/config.ttl +/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl + +ln -s /config/config.ttl /var/www/html/config.ttl cat /var/www/html/config.ttl /usr/sbin/apache2ctl -D FOREGROUND diff --git a/entrypoint_cron.sh b/entrypoint_cron.sh new file mode 100755 index 0000000..1d2c618 --- /dev/null +++ b/entrypoint_cron.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +/usr/bin/envsubst < /var/www/crontab > /etc/cron.d/crontab + +crontab /etc/cron.d/crontab +touch /var/log/cron.log + +cron && tail -f /var/log/cron.log From a2f8098d21935b478fc08b13c3422912cf22d6df Mon Sep 17 00:00:00 2001 From: Jarno Bakker Date: Thu, 1 Aug 2024 13:34:36 +0200 Subject: [PATCH 2/4] keep track of refresh timestamps --- entrypoint.py | 19 ++++++++-- requirements.txt | 2 + src/graphdb.py | 89 ++++++++++++++++++++++++++++++++++++--------- src/vocabularies.py | 20 +++++----- 4 files changed, 100 insertions(+), 30 deletions(-) diff --git a/entrypoint.py 
b/entrypoint.py index 1311731..288616c 100755 --- a/entrypoint.py +++ b/entrypoint.py @@ -9,13 +9,14 @@ import shutil import time from pathlib import Path +from typing import IO from src.exceptions import InvalidConfigurationException -from src.graphdb import get_loaded_vocabs, setup_graphdb +from src.graphdb import get_loaded_vocabs, set_timestamp, setup_graphdb, update_timestamp from src.vocabularies import get_file_from_config, get_graph, load_vocab_yaml, load_vocabulary -def append_file(source, dest): +def append_file(source: IO, dest: str): """ Append source to dest file. :param source: A file pointer to a source file. @@ -61,11 +62,21 @@ def append_file(source, dest): graph = get_graph(config) print(f"Graph: {graph}") - always_load = vocab_config['config'].get('alwaysRefresh', False) + should_reload = False + if graph not in loaded_vocabs: + should_reload = True + elif vocab_config['config'].get('refresh', False): + interval = vocab_config['config'].get('refreshInterval', 0) + diff = (time.time() - loaded_vocabs[graph]) / 3600 + should_reload = diff > interval - if always_load or graph not in loaded_vocabs: + if should_reload: print(f"Loading vocabulary {vocab}") load_vocabulary(vocab_config['source'], data, graph) + if graph in loaded_vocabs: + update_timestamp(graph, int(time.time())) + else: + set_timestamp(graph, int(time.time())) print("... 
DONE") # Doing this last makes sure the vocab isn't added to the config when there's a problem diff --git a/requirements.txt b/requirements.txt index 3c4c149..c41d163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ PyYAML~=6.0.1 requests~=2.31.0 + +SPARQLWrapper~=2.0.0 diff --git a/src/graphdb.py b/src/graphdb.py index 6a0207d..270b849 100644 --- a/src/graphdb.py +++ b/src/graphdb.py @@ -2,13 +2,17 @@ This file contains functions for interacting with GraphDB """ import os +from typing import TextIO + import requests +from SPARQLWrapper import SPARQLWrapper, JSON, POST, DIGEST + admin_password = os.environ.get("ADMIN_PASSWORD", '') endpoint = os.environ.get("SPARQL_ENDPOINT", '') -def setup_graphdb(): +def setup_graphdb() -> None: """ Setup graphdb, if it isn't set up yet. :return: @@ -33,28 +37,79 @@ def setup_graphdb(): print(f"EXISTS GRAPHDB [{endpoint}]]") -def get_loaded_vocabs(): +def get_loaded_vocabs() -> dict[str, int]: """ Get all loaded vocabularies from GraphDB :return: """ - graphs_response = requests.get( - f"{endpoint}/rdf-graphs", - headers={"Accept": "application/json"}, - timeout=60 - ) - tmp = [] - if graphs_response.status_code == 200: - body = graphs_response.json() - tmp = [] - for binding in body["results"]["bindings"]: - tmp.append(binding["contextID"]["value"]) - print("Loaded vocabs:") - print(tmp) + sparql = SPARQLWrapper(endpoint) + sparql.setReturnFormat(JSON) + q = """ + SELECT ?graph ?timestamp + WHERE { + ?graph ?timestamp . + FILTER NOT EXISTS { + GRAPH ?g {?graph ?timestamp .} + } + } + ORDER BY ?timestamp + """ + sparql.setQuery(q) + result = sparql.queryAndConvert() + result = result['results']['bindings'] + tmp = {} + for line in result: + tmp[line['graph']['value']] = int(line['timestamp']['value']) return tmp -def get_type(extension): +def set_timestamp(graph_name: str, timestamp: int) -> None: + """ + Set a timestamp for a new graph. 
+ :param graph_name: + :param timestamp: + :return: + """ + sparql = SPARQLWrapper(f"{endpoint}/statements") + sparql.setHTTPAuth(DIGEST) + sparql.setCredentials("admin", admin_password) + sparql.setMethod(POST) + q = """INSERT DATA {{ + <{graph}> {timestamp} . + }}""" + q_formatted = q.format(graph=graph_name, timestamp=timestamp) + print(q_formatted) + sparql.setQuery(q_formatted) + sparql.query() + + +def update_timestamp(graph_name: str, timestamp: int) -> None: + """ + Set a timestamp for an existing graph. + :param graph_name: + :param timestamp: + :return: + """ + sparql = SPARQLWrapper(f"{endpoint}/statements") + sparql.setHTTPAuth(DIGEST) + sparql.setCredentials("admin", admin_password) + sparql.setMethod(POST) + q = """ + DELETE {{ + <{graph}> ?timestamp . + }} + INSERT {{ + <{graph}> {timestamp} . + }} + WHERE {{ + <{graph}> ?timestamp . + }} + """ + sparql.setQuery(q.format(graph=graph_name, timestamp=timestamp)) + sparql.query() + + +def get_type(extension: str) -> str: """ Get the http mimetype based on the extension of a file. :param extension: @@ -68,7 +123,7 @@ def get_type(extension): return "text/turtle" -def add_vocabulary(graph, graph_name, extension): +def add_vocabulary(graph: TextIO, graph_name: str, extension: str) -> None: """ Add a vocabulary to GraphDB :param graph: File diff --git a/src/vocabularies.py b/src/vocabularies.py index bb7f5ba..68de659 100644 --- a/src/vocabularies.py +++ b/src/vocabularies.py @@ -5,13 +5,16 @@ import re import urllib.request import urllib.parse +from pathlib import Path +from typing import IO, TextIO + import yaml from src.exceptions import InvalidConfigurationException, UnknownAuthenticationTypeException from src.graphdb import add_vocabulary -def get_file_from_config(config_data, data_dir): +def get_file_from_config(config_data: dict, data_dir: str) -> TextIO: """ Get the config file from yaml data. :param config_data: The configuration, a dict with information about the file. 
@@ -52,7 +55,7 @@ def get_file_from_config(config_data, data_dir): raise InvalidConfigurationException("Type must be file") -def load_vocabulary(source_data, data_dir, graph_name): +def load_vocabulary(source_data: dict, data_dir: str, graph_name: str) -> None: """ Load a vocabulary using the source data from the yaml. :param source_data: @@ -64,17 +67,16 @@ def load_vocabulary(source_data, data_dir, graph_name): add_vocabulary(vocab_file, graph_name, get_vocab_format(source_data)) -def get_graph(fp): +def get_graph(fp: IO) -> str: """ Get the sparql graph from the given vocab :param fp: The vocabulary config, a file pointer :return: """ for line in fp: - # If line is a bytes-like object, we need to decode it try: - line = line.decode() - except (UnicodeDecodeError, AttributeError): + line = line.decode('utf-8') + except (UnicodeDecodeError, AttributeError): # Already decoded pass if re.search("sparqlGraph", line): @@ -82,7 +84,7 @@ def get_graph(fp): return "" -def load_vocab_yaml(file_location): +def load_vocab_yaml(file_location: Path) -> dict: """ Open a yaml config file and return a dict with its contents :param file_location: @@ -92,7 +94,7 @@ def load_vocab_yaml(file_location): return yaml.safe_load(fp) -def get_vocab_format(source_data): +def get_vocab_format(source_data: dict) -> str: """ Return the vocab format of the given data source. It is either based on the file extension, or on an override in the yaml file. 
@@ -101,4 +103,4 @@ def get_vocab_format(source_data): """ if 'format' in source_data: return source_data['format'] - return source_data['location'].split('.')[-1] + return source_data['location'].split('?')[0].split('.')[-1] From e9587217634b190288909788f46b1c73260010c1 Mon Sep 17 00:00:00 2001 From: Jarno Bakker Date: Thu, 1 Aug 2024 13:43:37 +0200 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 2 ++ entrypoint.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56ddd8e..ac3f6d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added - Added support for YAML configurations. - Allow specifying external config/rdf files. - Added support for trig files. - Import vocabularies every hour using cron. +- Added a configuration option for the refresh interval of vocabularies. ## [v2.15-1.1.0] diff --git a/entrypoint.py b/entrypoint.py index 288616c..417ac9c 100755 --- a/entrypoint.py +++ b/entrypoint.py @@ -32,9 +32,11 @@ def append_file(source: IO, dest: str): df.write(line) -if __name__ == "__main__": - time.sleep(10) - +def main() -> None: + """ + Main function. 
+ :return: + """ data = os.environ["DATA"] if os.path.isfile(f'{data}/config.ttl'): @@ -62,15 +64,15 @@ def append_file(source: IO, dest: str): graph = get_graph(config) print(f"Graph: {graph}") - should_reload = False + reload = False if graph not in loaded_vocabs: - should_reload = True + reload = True elif vocab_config['config'].get('refresh', False): interval = vocab_config['config'].get('refreshInterval', 0) diff = (time.time() - loaded_vocabs[graph]) / 3600 - should_reload = diff > interval + reload = diff > interval - if should_reload: + if reload: print(f"Loading vocabulary {vocab}") load_vocabulary(vocab_config['source'], data, graph) if graph in loaded_vocabs: @@ -86,3 +88,8 @@ def append_file(source: IO, dest: str): print(f"Invalid configuration: {e}") print(f"Skipping vocab '{vocab}'") continue + + +if __name__ == "__main__": + time.sleep(10) + main() From 6ed8a5dd86fa6147e39eda42d6ab9c9d5ba481ef Mon Sep 17 00:00:00 2001 From: Jarno Bakker Date: Thu, 1 Aug 2024 13:48:26 +0200 Subject: [PATCH 4/4] drop python 3.8 support want to use strict typing for dicts --- .github/workflows/pylint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 1987cd0..caba1c8 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.9", "3.10"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }}