Merge branch 'main' of github.com:knaw-huc/sd-skosmos

knaw-huc · Sep 9, 2024 · 771e683 · 771e683
2 parents 4e075b9 + 47de985
commit 771e683
Show file tree

Hide file tree

Showing 12 changed files with 171 additions and 42 deletions.
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10"]
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [v2.15-1.2.0]
 
+### Added
 - Added support for YAML configurations.
 - Allow specifying external config/rdf files.
 - Added support for trig files.
+- Import vocabularies every hour using cron.
+- Added a configuration option for the refresh interval of vocabularies.
 
 ## [v2.15-1.1.0]
 

diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,7 @@ ARG SKOSMOS_TARGZ_RELEASE_URL=https://github.com/knaw-huc/Skosmos/archive/refs/t
 
 # general server setup and locale
 RUN apt-get update && \
-  apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip && \
+  apt-get -y install gettext locales curl unzip vim git libicu-dev libxslt-dev python3 pip cron && \
   for locale in en_GB en_US fi_FI fr_FR sv_SE; do \
     echo "${locale}.UTF-8 UTF-8" >> /etc/locale.gen ; \
   done && \
@@ -40,7 +40,14 @@ RUN /usr/bin/env pip install -r /var/www/requirements.txt
 # Configure Skosmos
 COPY skosmos-repository.ttl /var/www/
 COPY entrypoint.sh /var/www/
+COPY entrypoint_cron.sh /var/www/
 COPY ./src /var/www/src
 COPY entrypoint.py /var/www/
 COPY config-docker-compose.ttl /var/www/html/
+
+# Prepare CRON: the crontab template and the task it schedules.
+# entrypoint_cron.sh is already copied above, together with entrypoint.sh.
+COPY crontab /var/www/crontab
+COPY crontask.sh /var/www/
+
 ENTRYPOINT ["/var/www/entrypoint.sh"]
diff --git a/crontab b/crontab
@@ -0,0 +1,4 @@
+# Template: entrypoint_cron.sh runs this file through envsubst before installing
+# it, which writes the container's values for these variables into the crontab.
+SPARQL_ENDPOINT=$SPARQL_ENDPOINT
+DATA=$DATA
+
+# At minute 0 of every hour: run the import task and append its output to the cron log.
+0 * * * * /var/www/crontask.sh >> /var/log/cron.log 2>&1
diff --git a/crontask.sh b/crontask.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Cron task: re-import the vocabularies, then regenerate the Skosmos configuration.
+set -x
+
+# Keep the previous config.ttl when the import fails, instead of rendering a
+# partially assembled /config/config-docker-compose.ttl.
+/var/www/entrypoint.py || exit 1
+
+# Render to a temporary file and rename it into place, so readers of the shared
+# /config volume never see a truncated config.ttl while envsubst is writing.
+/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl.tmp &&
+  mv /config/config.ttl.tmp /config/config.ttl
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -34,8 +34,29 @@ services:
       - '8080:80'
     volumes:
       - ./data:/data
+      - skosmos-configuration:/config
     depends_on:
       - graphdb
+  # Second container built from this repository's Dockerfile; its entrypoint is
+  # replaced by entrypoint_cron.sh, so it runs cron rather than entrypoint.sh.
+  skosmos-cron:
+    container_name: 'skosmos-cron'
+    build:
+      context: .
+      dockerfile: Dockerfile
+    environment:
+      - SPARQL_ENDPOINT=http://host.docker.internal:7200/repositories/skosmos
+      - DATA=${DATA:-/data}
+    networks:
+      - skosmos
+    volumes:
+      - ./data:/data
+      # Same named volume as mounted at /config in the service above.
+      - skosmos-configuration:/config
+    entrypoint:
+      - /var/www/entrypoint_cron.sh
+
 networks:
   skosmos:
     external: false
+
+volumes:
+  skosmos-configuration:
+    external: false
diff --git a/entrypoint.py b/entrypoint.py
@@ -9,13 +9,14 @@
 import shutil
 import time
 from pathlib import Path
+from typing import IO
 
 from src.exceptions import InvalidConfigurationException
-from src.graphdb import get_loaded_vocabs, setup_graphdb
+from src.graphdb import get_loaded_vocabs, set_timestamp, setup_graphdb, update_timestamp
 from src.vocabularies import get_file_from_config, get_graph, load_vocab_yaml, load_vocabulary
 
 
-def append_file(source, dest):
+def append_file(source: IO, dest: str):
     """
     Append source to dest file.
     :param source:  A file pointer to a source file.
@@ -31,19 +32,21 @@ def append_file(source, dest):
             df.write(line)
 
 
-if __name__ == "__main__":
-    time.sleep(10)
-
+def main() -> None:
+    """
+    Main function.
+    :return:
+    """
     data = os.environ["DATA"]
 
     if os.path.isfile(f'{data}/config.ttl'):
-        shutil.copy(f'{data}/config.ttl', '/tmp/config-docker-compose.ttl')
+        shutil.copy(f'{data}/config.ttl', '/config/config-docker-compose.ttl')
     else:
-        shutil.copy('/var/www/html/config-docker-compose.ttl', '/tmp/config-docker-compose.ttl')
+        shutil.copy('/var/www/html/config-docker-compose.ttl', '/config/config-docker-compose.ttl')
 
     if os.path.isfile(f'{data}/config-ext.ttl'):
         with open(f'{data}/config-ext.ttl', 'r', encoding='utf-8') as f:
-            append_file(f, '/tmp/config-docker-compose.ttl')
+            append_file(f, '/config/config-docker-compose.ttl')
 
     setup_graphdb()
 
@@ -60,17 +63,33 @@ def append_file(source, dest):
             with get_file_from_config(vocab_config['config'], data) as config:
                 graph = get_graph(config)
                 print(f"Graph: {graph}")
-            with get_file_from_config(vocab_config['config'], data) as config:
-                # Reset file pointer
-                append_file(config, "/tmp/config-docker-compose.ttl")
 
-            always_load = vocab_config['config'].get('alwaysRefresh', False)
+            reload = False
+            if graph not in loaded_vocabs:
+                reload = True
+            elif vocab_config['config'].get('refresh', False):
+                interval = vocab_config['config'].get('refreshInterval', 0)
+                diff = (time.time() - loaded_vocabs[graph]) / 3600
+                reload = diff > interval
 
-            if always_load or graph not in loaded_vocabs:
+            if reload:
                 print(f"Loading vocabulary {vocab}")
                 load_vocabulary(vocab_config['source'], data, graph)
+                if graph in loaded_vocabs:
+                    update_timestamp(graph, int(time.time()))
+                else:
+                    set_timestamp(graph, int(time.time()))
                 print("... DONE")
+
+            # Doing this last makes sure the vocab isn't added to the config when there's a problem
+            with get_file_from_config(vocab_config['config'], data) as config:
+                append_file(config, "/config/config-docker-compose.ttl")
         except InvalidConfigurationException as e:
             print(f"Invalid configuration: {e}")
             print(f"Skipping vocab '{vocab}'")
             continue
+
+
+if __name__ == "__main__":
+    time.sleep(10)
+    main()
diff --git a/entrypoint.sh b/entrypoint.sh
@@ -3,7 +3,9 @@ set -x
 
 /var/www/entrypoint.py
 
-/usr/bin/envsubst < /tmp/config-docker-compose.ttl > /var/www/html/config.ttl
+/usr/bin/envsubst < /config/config-docker-compose.ttl > /config/config.ttl
+
+# -f replaces a link left behind by a previous start of the same container;
+# a plain `ln -s` fails with "File exists" on every restart.
+ln -sf /config/config.ttl /var/www/html/config.ttl
 cat /var/www/html/config.ttl
 
 /usr/sbin/apache2ctl -D FOREGROUND
diff --git a/entrypoint_cron.sh b/entrypoint_cron.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Entrypoint of the cron container: install the schedule, start cron, stream its log.
+
+# Fill in the environment values. The result is a user crontab (no user column),
+# so it is rendered outside /etc/cron.d, where cron expects system-format files.
+/usr/bin/envsubst < /var/www/crontab > /var/www/crontab.rendered
+
+crontab /var/www/crontab.rendered
+touch /var/log/cron.log
+
+# Following the log after starting cron keeps this script, and with it the
+# container, running and sends the task output to the container's stdout.
+cron && tail -f /var/log/cron.log
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,4 @@
 PyYAML~=6.0.1
 requests~=2.31.0
+
+SPARQLWrapper~=2.0.0
diff --git a/src/graphdb.py b/src/graphdb.py
@@ -2,13 +2,17 @@
 This file contains functions for interacting with GraphDB
 """
 import os
+from typing import TextIO
+
 import requests
 
+from SPARQLWrapper import SPARQLWrapper, JSON, POST, DIGEST
+
 admin_password = os.environ.get("ADMIN_PASSWORD", '')
 endpoint = os.environ.get("SPARQL_ENDPOINT", '')
 
 
-def setup_graphdb():
+def setup_graphdb() -> None:
     """
     Setup graphdb, if it isn't set up yet.
     :return:
@@ -33,28 +37,79 @@ def setup_graphdb():
         print(f"EXISTS GRAPHDB [{endpoint}]]")
 
 
-def get_loaded_vocabs():
+def get_loaded_vocabs() -> dict[str, int]:
     """
-    Get all loaded vocabularies from GraphDB
-    :return:
+    Get all loaded vocabularies from GraphDB, together with their stored timestamp.
+    :return:    A dict mapping each graph IRI to the integer value of its
+                <http://purl.org/dc/terms/modified> statement.
     """
-    graphs_response = requests.get(
-        f"{endpoint}/rdf-graphs",
-        headers={"Accept": "application/json"},
-        timeout=60
-    )
-    tmp = []
-    if graphs_response.status_code == 200:
-        body = graphs_response.json()
-        tmp = []
-        for binding in body["results"]["bindings"]:
-            tmp.append(binding["contextID"]["value"])
-        print("Loaded vocabs:")
-        print(tmp)
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setReturnFormat(JSON)
+    # The FILTER drops every modified-statement that also occurs inside a named
+    # graph (presumably metadata of the vocabularies themselves - confirm), so
+    # only timestamps stored outside of any named graph are returned.
+    q = """
+        SELECT ?graph ?timestamp
+        WHERE {
+            ?graph <http://purl.org/dc/terms/modified> ?timestamp .
+            FILTER NOT EXISTS {
+                GRAPH ?g {?graph <http://purl.org/dc/terms/modified> ?timestamp .}
+            }
+        }
+        ORDER BY ?timestamp
+    """
+    sparql.setQuery(q)
+    result = sparql.queryAndConvert()
+    result = result['results']['bindings']
+    tmp = {}
+    # Rows arrive ordered by timestamp and later rows overwrite earlier ones, so
+    # a graph with several timestamps ends up with the last one in that order.
+    for line in result:
+        tmp[line['graph']['value']] = int(line['timestamp']['value'])
     return tmp
 
 
-def get_type(extension):
+def set_timestamp(graph_name: str, timestamp: int) -> None:
+    """
+    Store the timestamp of a graph that has no timestamp yet.
+    :param graph_name:  IRI of the graph, used as the subject of the statement.
+    :param timestamp:   Value stored as <http://purl.org/dc/terms/modified>.
+    :return:
+    """
+    # Updates are sent to the repository's /statements endpoint as an authenticated POST
+    client = SPARQLWrapper(f"{endpoint}/statements")
+    client.setMethod(POST)
+    client.setHTTPAuth(DIGEST)
+    client.setCredentials("admin", admin_password)
+
+    template = """INSERT DATA {{
+        <{graph}> <http://purl.org/dc/terms/modified> {timestamp} .
+    }}"""
+    update = template.format(graph=graph_name, timestamp=timestamp)
+    print(update)
+
+    client.setQuery(update)
+    client.query()
+
+
+def update_timestamp(graph_name: str, timestamp: int) -> None:
+    """
+    Replace the stored timestamp of a graph that already has one.
+    :param graph_name:  IRI of the graph whose timestamp is replaced.
+    :param timestamp:   New value for <http://purl.org/dc/terms/modified>.
+    :return:
+    """
+    # Updates are sent to the repository's /statements endpoint as an authenticated POST
+    client = SPARQLWrapper(f"{endpoint}/statements")
+    client.setMethod(POST)
+    client.setHTTPAuth(DIGEST)
+    client.setCredentials("admin", admin_password)
+
+    # The WHERE clause binds the value(s) currently stored, which DELETE then
+    # removes; a graph without a stored timestamp matches nothing.
+    template = """
+    DELETE {{
+        <{graph}> <http://purl.org/dc/terms/modified> ?timestamp .
+    }}
+    INSERT {{
+        <{graph}> <http://purl.org/dc/terms/modified> {timestamp} .
+    }}
+    WHERE {{
+        <{graph}> <http://purl.org/dc/terms/modified> ?timestamp .
+    }}
+    """
+    update = template.format(graph=graph_name, timestamp=timestamp)
+    client.setQuery(update)
+    client.query()
+
+
+def get_type(extension: str) -> str:
     """
     Get the http mimetype based on the extension of a file.
     :param extension:
@@ -68,7 +123,7 @@ def get_type(extension):
     return "text/turtle"
 
 
-def add_vocabulary(graph, graph_name, extension):
+def add_vocabulary(graph: TextIO, graph_name: str, extension: str) -> None:
     """
     Add a vocabulary to GraphDB
     :param graph:       File

diff --git a/src/vocabularies.py b/src/vocabularies.py
@@ -5,13 +5,16 @@
 import re
 import urllib.request
 import urllib.parse
+from pathlib import Path
+from typing import IO, TextIO
+
 import yaml
 
 from src.exceptions import InvalidConfigurationException, UnknownAuthenticationTypeException
 from src.graphdb import add_vocabulary
 
 
-def get_file_from_config(config_data, data_dir):
+def get_file_from_config(config_data: dict, data_dir: str) -> TextIO:
     """
     Get the config file from yaml data.
     :param config_data: The configuration, a dict with information about the file.
@@ -52,7 +55,7 @@ def get_file_from_config(config_data, data_dir):
     raise InvalidConfigurationException("Type must be file")
 
 
-def load_vocabulary(source_data, data_dir, graph_name):
+def load_vocabulary(source_data: dict, data_dir: str, graph_name: str) -> None:
     """
     Load a vocabulary using the source data from the yaml.
     :param source_data:
@@ -64,25 +67,24 @@ def load_vocabulary(source_data, data_dir, graph_name):
         add_vocabulary(vocab_file, graph_name, get_vocab_format(source_data))
 
 
-def get_graph(fp):
+def get_graph(fp: IO) -> str:
     """
     Get the sparql graph from the given vocab
-    :param fp:  The vocabulary config, a file pointer
-    :return:
+    :param fp:  The vocabulary config, a file pointer opened in text or binary mode
+    :return:    The graph IRI on the first line mentioning sparqlGraph, "" if there is none
     """
     for line in fp:
-        # If line is a bytes-like object, we need to decode it
-        try:
-            line = line.decode()
-        except (UnicodeDecodeError, AttributeError):
-            # Already decoded
-            pass
+        # Text-mode files yield str lines, which have no decode(); only bytes are
+        # decoded. Undecodable bytes are replaced so they cannot abort the search.
+        if isinstance(line, bytes):
+            line = line.decode('utf-8', errors='replace')
         if re.search("sparqlGraph", line):
             return line.strip().split(" ")[1].strip("<>")
     return ""
 
 
-def load_vocab_yaml(file_location):
+def load_vocab_yaml(file_location: Path) -> dict:
     """
     Open a yaml config file and return a dict with its contents
     :param file_location:
@@ -92,7 +94,7 @@ def load_vocab_yaml(file_location):
         return yaml.safe_load(fp)
 
 
-def get_vocab_format(source_data):
+def get_vocab_format(source_data: dict) -> str:
     """
     Return the vocab format of the given data source. It is either based on the file extension,
     or on an override in the yaml file.
@@ -101,4 +103,4 @@ def get_vocab_format(source_data):
     """
     if 'format' in source_data:
         return source_data['format']
-    return source_data['location'].split('.')[-1]
+    return source_data['location'].split('?')[0].split('.')[-1]