Skip to content

Commit

Permalink
Add an API endpoint to check codelists from study repos
Browse files Browse the repository at this point in the history
This endpoint may be called from opensafely-cli or from job-server.
It takes as POST parameters the content of a study repo's
codelists.txt file (specifying the codelists that are to be used in
the study), and a manifest (codelists.json) file, which is generated
when `opensafely codelists update` is run. It checks that the hashed
codelist data in the manifest file matches the content of the current
codelist versions, if they were to be downloaded again.

Note opensafely-cli already has a codelists check command, which checks
that the manifest file is consistent with the codelists in the study
repo. However, dm+d codelists can change due to new VMP mappings, which
get mapped into CSV downloads. Therefore a dm+d codelist that was
downloaded into a study repo may need to be updated again, even if no changes
have been made to the codelists.csv.
  • Loading branch information
rebkwok committed Oct 24, 2023
1 parent 333446e commit c19499a
Show file tree
Hide file tree
Showing 4 changed files with 341 additions and 1 deletion.
128 changes: 127 additions & 1 deletion codelists/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import re

from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods

from mappings.dmdvmpprevmap.mappers import vmp_ids_to_previous
from opencodelists.models import Organisation, User

from .actions import (
create_old_style_codelist,
Expand All @@ -14,10 +16,23 @@
create_version_with_codes,
)
from .api_decorators import require_authentication, require_permission
from .models import Codelist
from .models import Codelist, CodelistVersion, Handle
from .views.decorators import load_codelist, load_owner


CODELIST_VERSION_REGEX = re.compile(
r"""
# regex for matching codelists from study repos
(?P<user>user)?/? # may start with "user/" if it's a user codelist
(?P<owner>[\w|\d|-]+)/ # organisation slug or user name
(?P<codelist_slug>[\w|\d|-]+)/ # codelist slug
(?P<tag_or_hash>[\w|\d|-]+) # version tag or hash
/?$ # possible trailing slash
""",
flags=re.X,
)


@require_http_methods(["GET"])
def all_codelists(request):
return codelists_get(request)
Expand Down Expand Up @@ -259,3 +274,114 @@ def dmd_previous_codes_mapping(request):
"""
_, vmp_to_previous_tuples = vmp_ids_to_previous()
return JsonResponse(vmp_to_previous_tuples, safe=False)


@require_http_methods("POST")
@csrf_exempt
def codelists_check(requests):
"""
Checks that a study repo's codelist CSV files as downloaded now are consistent
with the study's current downloaded codelists.
study_codelists: the contents of a study repo's codelists.txt
manifest: the contents of a study repo's codelists.json; this file is generated
when codelists are downloaded into a study using `opensafely codelists update`
and contain a hash of the codelist CSV file data at the time of update.
Note that some of these checks are duplicated in opensafely-cli and will fail the
Github action test run (`opensafely codelists check`). However, we can't guarantee
that a user has corrected those errors, so we need to check them again.
We DO NOT check whether the actual downloaded codelists have been modified; this can
only be done in the study repo itself (either CLI or GA).
"""
study_codelists = requests.POST.get("codelists")
try:
manifest = json.loads(requests.POST.get("manifest"))
except json.decoder.JSONDecodeError:
return JsonResponse(
{"status": "error", "data": {"error": "Codelists manifest file is invalid"}}
)

codelist_download_data = {}
# Fetch codelist CSV data
for line in study_codelists.split("\n"):
line = line.strip().rstrip("/")
if not line or line.startswith("#"):
continue
matches = CODELIST_VERSION_REGEX.match(line)
if not matches:
return JsonResponse(
{
"status": "error",
"data": {
"error": f"{line} does not match expected codelist pattern"
},
}
)

line_parts = matches.groupdict()
try:
codelist_version = CodelistVersion.objects.get_by_hash(
line_parts["tag_or_hash"]
)
except (CodelistVersion.DoesNotExist, ValueError):
# it's an old version that predates hashes, find by owner/org
owner = line_parts["owner"]
codelist_slug = line_parts["codelist_slug"]
try:
if line_parts["user"]:
user = User.objects.get(username=owner)
handle = Handle.objects.get(slug=codelist_slug, user=user)
else:
organisation = Organisation.objects.get(slug=owner)
handle = Handle.objects.get(
slug=codelist_slug, organisation=organisation
)
codelist_version = CodelistVersion.objects.get(
codelist=handle.codelist, tag=line_parts["tag_or_hash"]
)
except (
User.DoesNotExist,
Handle.DoesNotExist,
Organisation.DoesNotExist,
CodelistVersion.DoesNotExist,
):
return JsonResponse(
{"status": "error", "data": {"error": f"{line} could not be found"}}
)
codelist_download_data[line] = codelist_version.csv_data_sha()

# Compare with manifest file
# The manifest file is generated when `opensafely codelists update` is run in a study repo
# It contains an entry per codelist file with a hash of the file content and represents the
# state of the last update in the study repo
# codelist_download_data is the download data corresponding to the files listed in the study
# repo's codelists.csv
# We can check whether the codelists in the study repo are up to date by comparing the
# similarly-hashed content of the data that would be prepared for a CSV download
manifest_codelist_ids = set(file["id"] for file in manifest["files"].values())
current_codelist_ids = set(codelist_download_data.keys())
# new files are files that are in the study's codelists.csv but not in the manifest file generatated
# at the last update
new_files = current_codelist_ids - manifest_codelist_ids
# removed files are codelist entries that have been removed from the study's codelists.csv since
# the last update
removed_files = manifest_codelist_ids - current_codelist_ids
changed = [
file_data["id"]
for file_data in manifest["files"].values()
if file_data["id"] in codelist_download_data
and file_data["sha"] != codelist_download_data[file_data["id"]]
]
if new_files or removed_files or changed:
return JsonResponse(
{
"status": "error",
"data": {
"added": list(new_files),
"removed": list(removed_files),
"changed": changed,
},
}
)
return JsonResponse({"status": "ok"})
1 change: 1 addition & 0 deletions codelists/api_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
api.dmd_previous_codes_mapping,
name="dmd_previous_codes_mapping",
),
path("check/", api.codelists_check, name="check_codelists"),
]

for subpath, view in [
Expand Down
9 changes: 9 additions & 0 deletions codelists/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
from collections import Counter

from django.contrib.auth.models import AnonymousUser
Expand Down Expand Up @@ -553,6 +554,14 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
table = self.table
return rows_to_csv_data(table)

def csv_data_sha(self):
"""
sha of CSV data for download with default parameters. This matches the method
used to hash the CSVs downloaded in a study repo.
"""
data_for_download = self.csv_data_for_download().encode()
return hashlib.sha1(data_for_download).hexdigest()

def table_with_fixed_headers(self, include_mapped_vmps=True):
"""
Find the code and term columns from csv data (which may be labelled with different
Expand Down
204 changes: 204 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
from datetime import datetime

import pytest

from codelists.actions import update_codelist
from mappings.dmdvmpprevmap.models import Mapping as VmpPrevMapping
from opencodelists.tests.assertions import assert_difference, assert_no_difference
Expand Down Expand Up @@ -468,3 +470,205 @@ def test_dmd_mapping_with_data(client, organisation):
["22", "01"],
["22", "11"],
]


@pytest.mark.parametrize(
"codelists,manifest,error",
[
("user/foo/codelist-foo/1234", "[foo]", "Codelists manifest file is invalid"),
(
"user/foo/codelist-foo/1234",
'{"files": {"id": "user/foo/codelist-foo/1234", "sha": "bar"}}',
"user/foo/codelist-foo/1234 could not be found",
),
(
"foo/codelist-foo/1234",
'{"files": {"id": "foo/codelist-foo/1234", "sha": "bar"}}',
"foo/codelist-foo/1234 could not be found",
),
(
"foo/bar/codelist-foo/1234/",
'{"files": {"id": "foo/bar/codelist-foo/1234", "sha": "bar"}}',
"foo/bar/codelist-foo/1234 does not match expected codelist pattern",
),
],
)
def test_codelists_check_bad_input_files(client, codelists, manifest, error):
data = {"codelists": codelists, "manifest": manifest}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "error", "data": {"error": error}}


def test_codelists_check_new_style_codelist_version(
client, user_version, new_style_version
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = f"{new_style_version.organisation.slug}/{new_style_version.codelist.slug}/{new_style_version.hash}"
org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
org_codelist_csv_id: {
"id": org_codelist_id,
"url": f"https://opencodelist.org/codelists/{org_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": new_style_version.csv_data_sha(),
},
}
}
data = {
"codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n",
"manifest": json.dumps(manifest),
}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_old_style_codelist_version(client, old_style_version):
codelist_id = f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/{old_style_version.hash}"
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": old_style_version.csv_data_sha(),
}
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_codelist_with_tag(client, old_style_version):
old_style_version.tag = "v1"
old_style_version.save()

codelist_id = (
f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/v1"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": old_style_version.csv_data_sha(),
}
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_has_added_codelists(
client, user_version, version_with_no_searches
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = (
f"{version_with_no_searches.organisation.slug}/"
f"{version_with_no_searches.codelist.slug}/"
f"{version_with_no_searches.hash}"
)
# Setup: Study repo has added a codelist (version_with_no_searches) in the codelists.txt file,
# but has not run update, so manifest only contains one codelist
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
}
}
data = {
"codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n",
"manifest": json.dumps(manifest),
}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"added": [org_codelist_id], "removed": [], "changed": []},
}


def test_codelists_check_has_removed_codelists(
client, user_version, version_with_no_searches
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = (
f"{version_with_no_searches.organisation.slug}/"
f"{version_with_no_searches.codelist.slug}/"
f"{version_with_no_searches.hash}"
)
org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv"

# Setup: Study repo has remove a codelist (version_with_no_searches) in the codelists.txt file,
# but has not run update, so manifest contains 2 codelists, codelists.txt contains only one
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
org_codelist_csv_id: {
"id": org_codelist_id,
"url": f"https://opencodelist.org/codelists/{org_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": version_with_no_searches.csv_data_sha(),
},
}
}
data = {"codelists": user_codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"added": [], "removed": [org_codelist_id], "changed": []},
}


def test_codelists_check_changes(client, dmd_version_asthma_medication):
codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

# Test the happy path for this dmd version
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": dmd_version_asthma_medication.csv_data_sha(),
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}
# Add a VMP mapping which will be added into the CSV download
VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
resp = client.post("/api/v1/check/", data)

assert resp.json() == {
"status": "error",
"data": {"added": [], "removed": [], "changed": [codelist_id]},
}

0 comments on commit c19499a

Please sign in to comment.