Skip to content

Commit

Permalink
Merge pull request #1703 from opensafely-core/api-check-endpoint
Browse files Browse the repository at this point in the history
Add an API endpoint to check codelists from study repos
  • Loading branch information
rebkwok authored Oct 25, 2023
2 parents 333446e + c19499a commit 25d12a4
Show file tree
Hide file tree
Showing 4 changed files with 341 additions and 1 deletion.
128 changes: 127 additions & 1 deletion codelists/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import re

from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods

from mappings.dmdvmpprevmap.mappers import vmp_ids_to_previous
from opencodelists.models import Organisation, User

from .actions import (
create_old_style_codelist,
Expand All @@ -14,10 +16,23 @@
create_version_with_codes,
)
from .api_decorators import require_authentication, require_permission
from .models import Codelist
from .models import Codelist, CodelistVersion, Handle
from .views.decorators import load_codelist, load_owner


CODELIST_VERSION_REGEX = re.compile(
r"""
# regex for matching codelists from study repos
(?P<user>user)?/? # may start with "user/" if it's a user codelist
(?P<owner>[\w|\d|-]+)/ # organisation slug or user name
(?P<codelist_slug>[\w|\d|-]+)/ # codelist slug
(?P<tag_or_hash>[\w|\d|-]+) # version tag or hash
/?$ # possible trailing slash
""",
flags=re.X,
)


@require_http_methods(["GET"])
def all_codelists(request):
return codelists_get(request)
Expand Down Expand Up @@ -259,3 +274,114 @@ def dmd_previous_codes_mapping(request):
"""
_, vmp_to_previous_tuples = vmp_ids_to_previous()
return JsonResponse(vmp_to_previous_tuples, safe=False)


@require_http_methods("POST")
@csrf_exempt
def codelists_check(requests):
"""
Checks that a study repo's codelist CSV files as downloaded now are consistent
with the study's current downloaded codelists.
study_codelists: the contents of a study repo's codelists.txt
manifest: the contents of a study repo's codelists.json; this file is generated
when codelists are downloaded into a study using `opensafely codelists update`
and contain a hash of the codelist CSV file data at the time of update.
Note that some of these checks are duplicated in opensafely-cli and will fail the
Github action test run (`opensafely codelists check`). However, we can't guarantee
that a user has corrected those errors, so we need to check them again.
We DO NOT check whether the actual downloaded codelists have been modified; this can
only be done in the study repo itself (either CLI or GA).
"""
study_codelists = requests.POST.get("codelists")
try:
manifest = json.loads(requests.POST.get("manifest"))
except json.decoder.JSONDecodeError:
return JsonResponse(
{"status": "error", "data": {"error": "Codelists manifest file is invalid"}}
)

codelist_download_data = {}
# Fetch codelist CSV data
for line in study_codelists.split("\n"):
line = line.strip().rstrip("/")
if not line or line.startswith("#"):
continue
matches = CODELIST_VERSION_REGEX.match(line)
if not matches:
return JsonResponse(
{
"status": "error",
"data": {
"error": f"{line} does not match expected codelist pattern"
},
}
)

line_parts = matches.groupdict()
try:
codelist_version = CodelistVersion.objects.get_by_hash(
line_parts["tag_or_hash"]
)
except (CodelistVersion.DoesNotExist, ValueError):
# it's an old version that predates hashes, find by owner/org
owner = line_parts["owner"]
codelist_slug = line_parts["codelist_slug"]
try:
if line_parts["user"]:
user = User.objects.get(username=owner)
handle = Handle.objects.get(slug=codelist_slug, user=user)
else:
organisation = Organisation.objects.get(slug=owner)
handle = Handle.objects.get(
slug=codelist_slug, organisation=organisation
)
codelist_version = CodelistVersion.objects.get(
codelist=handle.codelist, tag=line_parts["tag_or_hash"]
)
except (
User.DoesNotExist,
Handle.DoesNotExist,
Organisation.DoesNotExist,
CodelistVersion.DoesNotExist,
):
return JsonResponse(
{"status": "error", "data": {"error": f"{line} could not be found"}}
)
codelist_download_data[line] = codelist_version.csv_data_sha()

# Compare with manifest file
# The manifest file is generated when `opensafely codelists update` is run in a study repo
# It contains an entry per codelist file with a hash of the file content and represents the
# state of the last update in the study repo
# codelist_download_data is the download data corresponding to the files listed in the study
# repo's codelists.csv
# We can check whether the codelists in the study repo are up to date by comparing the
# similarly-hashed content of the data that would be prepared for a CSV download
manifest_codelist_ids = set(file["id"] for file in manifest["files"].values())
current_codelist_ids = set(codelist_download_data.keys())
# new files are files that are in the study's codelists.csv but not in the manifest file generatated
# at the last update
new_files = current_codelist_ids - manifest_codelist_ids
# removed files are codelist entries that have been removed from the study's codelists.csv since
# the last update
removed_files = manifest_codelist_ids - current_codelist_ids
changed = [
file_data["id"]
for file_data in manifest["files"].values()
if file_data["id"] in codelist_download_data
and file_data["sha"] != codelist_download_data[file_data["id"]]
]
if new_files or removed_files or changed:
return JsonResponse(
{
"status": "error",
"data": {
"added": list(new_files),
"removed": list(removed_files),
"changed": changed,
},
}
)
return JsonResponse({"status": "ok"})
1 change: 1 addition & 0 deletions codelists/api_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
api.dmd_previous_codes_mapping,
name="dmd_previous_codes_mapping",
),
path("check/", api.codelists_check, name="check_codelists"),
]

for subpath, view in [
Expand Down
9 changes: 9 additions & 0 deletions codelists/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
from collections import Counter

from django.contrib.auth.models import AnonymousUser
Expand Down Expand Up @@ -553,6 +554,14 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
table = self.table
return rows_to_csv_data(table)

def csv_data_sha(self):
"""
sha of CSV data for download with default parameters. This matches the method
used to hash the CSVs downloaded in a study repo.
"""
data_for_download = self.csv_data_for_download().encode()
return hashlib.sha1(data_for_download).hexdigest()

def table_with_fixed_headers(self, include_mapped_vmps=True):
"""
Find the code and term columns from csv data (which may be labelled with different
Expand Down
204 changes: 204 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
from datetime import datetime

import pytest

from codelists.actions import update_codelist
from mappings.dmdvmpprevmap.models import Mapping as VmpPrevMapping
from opencodelists.tests.assertions import assert_difference, assert_no_difference
Expand Down Expand Up @@ -468,3 +470,205 @@ def test_dmd_mapping_with_data(client, organisation):
["22", "01"],
["22", "11"],
]


@pytest.mark.parametrize(
"codelists,manifest,error",
[
("user/foo/codelist-foo/1234", "[foo]", "Codelists manifest file is invalid"),
(
"user/foo/codelist-foo/1234",
'{"files": {"id": "user/foo/codelist-foo/1234", "sha": "bar"}}',
"user/foo/codelist-foo/1234 could not be found",
),
(
"foo/codelist-foo/1234",
'{"files": {"id": "foo/codelist-foo/1234", "sha": "bar"}}',
"foo/codelist-foo/1234 could not be found",
),
(
"foo/bar/codelist-foo/1234/",
'{"files": {"id": "foo/bar/codelist-foo/1234", "sha": "bar"}}',
"foo/bar/codelist-foo/1234 does not match expected codelist pattern",
),
],
)
def test_codelists_check_bad_input_files(client, codelists, manifest, error):
data = {"codelists": codelists, "manifest": manifest}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "error", "data": {"error": error}}


def test_codelists_check_new_style_codelist_version(
client, user_version, new_style_version
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = f"{new_style_version.organisation.slug}/{new_style_version.codelist.slug}/{new_style_version.hash}"
org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
org_codelist_csv_id: {
"id": org_codelist_id,
"url": f"https://opencodelist.org/codelists/{org_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": new_style_version.csv_data_sha(),
},
}
}
data = {
"codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n",
"manifest": json.dumps(manifest),
}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_old_style_codelist_version(client, old_style_version):
codelist_id = f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/{old_style_version.hash}"
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": old_style_version.csv_data_sha(),
}
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_codelist_with_tag(client, old_style_version):
old_style_version.tag = "v1"
old_style_version.save()

codelist_id = (
f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/v1"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": old_style_version.csv_data_sha(),
}
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


def test_codelists_check_has_added_codelists(
client, user_version, version_with_no_searches
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = (
f"{version_with_no_searches.organisation.slug}/"
f"{version_with_no_searches.codelist.slug}/"
f"{version_with_no_searches.hash}"
)
# Setup: Study repo has added a codelist (version_with_no_searches) in the codelists.txt file,
# but has not run update, so manifest only contains one codelist
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
}
}
data = {
"codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n",
"manifest": json.dumps(manifest),
}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"added": [org_codelist_id], "removed": [], "changed": []},
}


def test_codelists_check_has_removed_codelists(
client, user_version, version_with_no_searches
):
user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}"
user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv"
org_codelist_id = (
f"{version_with_no_searches.organisation.slug}/"
f"{version_with_no_searches.codelist.slug}/"
f"{version_with_no_searches.hash}"
)
org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv"

# Setup: Study repo has remove a codelist (version_with_no_searches) in the codelists.txt file,
# but has not run update, so manifest contains 2 codelists, codelists.txt contains only one
manifest = {
"files": {
user_codelist_csv_id: {
"id": user_codelist_id,
"url": f"https://opencodelist.org/codelists/{user_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": user_version.csv_data_sha(),
},
org_codelist_csv_id: {
"id": org_codelist_id,
"url": f"https://opencodelist.org/codelists/{org_codelist_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": version_with_no_searches.csv_data_sha(),
},
}
}
data = {"codelists": user_codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"added": [], "removed": [org_codelist_id], "changed": []},
}


def test_codelists_check_changes(client, dmd_version_asthma_medication):
codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

# Test the happy path for this dmd version
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": dmd_version_asthma_medication.csv_data_sha(),
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}
# Add a VMP mapping which will be added into the CSV download
VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
resp = client.post("/api/v1/check/", data)

assert resp.json() == {
"status": "error",
"data": {"added": [], "removed": [], "changed": [codelist_id]},
}

0 comments on commit 25d12a4

Please sign in to comment.