diff --git a/codelists/api.py b/codelists/api.py index 1892b5f7..e31c3b33 100644 --- a/codelists/api.py +++ b/codelists/api.py @@ -1,10 +1,12 @@ import json +import re from django.http import JsonResponse from django.views.decorators.csrf import csrf_exempt from django.views.decorators.http import require_http_methods from mappings.dmdvmpprevmap.mappers import vmp_ids_to_previous +from opencodelists.models import Organisation, User from .actions import ( create_old_style_codelist, @@ -14,10 +16,23 @@ create_version_with_codes, ) from .api_decorators import require_authentication, require_permission -from .models import Codelist +from .models import Codelist, CodelistVersion, Handle from .views.decorators import load_codelist, load_owner +CODELIST_VERSION_REGEX = re.compile( + r""" + # regex for matching codelists from study repos + (?Puser)?/? # may start with "user/" if it's a user codelist + (?P[\w|\d|-]+)/ # organisation slug or user name + (?P[\w|\d|-]+)/ # codelist slug + (?P[\w|\d|-]+) # version tag or hash + /?$ # possible trailing slash + """, + flags=re.X, +) + + @require_http_methods(["GET"]) def all_codelists(request): return codelists_get(request) @@ -259,3 +274,114 @@ def dmd_previous_codes_mapping(request): """ _, vmp_to_previous_tuples = vmp_ids_to_previous() return JsonResponse(vmp_to_previous_tuples, safe=False) + + +@require_http_methods("POST") +@csrf_exempt +def codelists_check(requests): + """ + Checks that a study repo's codelist CSV files as downloaded now are consistent + with the study's current downloaded codelists. + study_codelists: the contents of a study repo's codelists.txt + manifest: the contents of a study repo's codelists.json; this file is generated + when codelists are downloaded into a study using `opensafely codelists update` + and contain a hash of the codelist CSV file data at the time of update. + + Note that some of these checks are duplicated in opensafely-cli and will fail the + Github action test run (`opensafely codelists check`). However, we can't guarantee + that a user has corrected those errors, so we need to check them again. + + We DO NOT check whether the actual downloaded codelists have been modified; this can + only be done in the study repo itself (either CLI or GA). + """ + study_codelists = requests.POST.get("codelists") + try: + manifest = json.loads(requests.POST.get("manifest")) + except json.decoder.JSONDecodeError: + return JsonResponse( + {"status": "error", "data": {"error": "Codelists manifest file is invalid"}} + ) + + codelist_download_data = {} + # Fetch codelist CSV data + for line in study_codelists.split("\n"): + line = line.strip().rstrip("/") + if not line or line.startswith("#"): + continue + matches = CODELIST_VERSION_REGEX.match(line) + if not matches: + return JsonResponse( + { + "status": "error", + "data": { + "error": f"{line} does not match expected codelist pattern" + }, + } + ) + + line_parts = matches.groupdict() + try: + codelist_version = CodelistVersion.objects.get_by_hash( + line_parts["tag_or_hash"] + ) + except (CodelistVersion.DoesNotExist, ValueError): + # it's an old version that predates hashes, find by owner/org + owner = line_parts["owner"] + codelist_slug = line_parts["codelist_slug"] + try: + if line_parts["user"]: + user = User.objects.get(username=owner) + handle = Handle.objects.get(slug=codelist_slug, user=user) + else: + organisation = Organisation.objects.get(slug=owner) + handle = Handle.objects.get( + slug=codelist_slug, organisation=organisation + ) + codelist_version = CodelistVersion.objects.get( + codelist=handle.codelist, tag=line_parts["tag_or_hash"] + ) + except ( + User.DoesNotExist, + Handle.DoesNotExist, + Organisation.DoesNotExist, + CodelistVersion.DoesNotExist, + ): + return JsonResponse( + {"status": "error", "data": {"error": f"{line} could not be found"}} + ) + codelist_download_data[line] = codelist_version.csv_data_sha() + + # Compare with manifest file + # The manifest file is generated when `opensafely codelists update` is run in a study repo + # It contains an entry per codelist file with a hash of the file content and represents the + # state of the last update in the study repo + # codelist_download_data is the download data corresponding to the files listed in the study + # repo's codelists.csv + # We can check whether the codelists in the study repo are up to date by comparing the + # similarly-hashed content of the data that would be prepared for a CSV download + manifest_codelist_ids = set(file["id"] for file in manifest["files"].values()) + current_codelist_ids = set(codelist_download_data.keys()) + # new files are files that are in the study's codelists.csv but not in the manifest file generatated + # at the last update + new_files = current_codelist_ids - manifest_codelist_ids + # removed files are codelist entries that have been removed from the study's codelists.csv since + # the last update + removed_files = manifest_codelist_ids - current_codelist_ids + changed = [ + file_data["id"] + for file_data in manifest["files"].values() + if file_data["id"] in codelist_download_data + and file_data["sha"] != codelist_download_data[file_data["id"]] + ] + if new_files or removed_files or changed: + return JsonResponse( + { + "status": "error", + "data": { + "added": list(new_files), + "removed": list(removed_files), + "changed": changed, + }, + } + ) + return JsonResponse({"status": "ok"}) diff --git a/codelists/api_urls.py b/codelists/api_urls.py index 94c79062..49f6e675 100644 --- a/codelists/api_urls.py +++ b/codelists/api_urls.py @@ -13,6 +13,7 @@ api.dmd_previous_codes_mapping, name="dmd_previous_codes_mapping", ), + path("check/", api.codelists_check, name="check_codelists"), ] for subpath, view in [ diff --git a/codelists/models.py b/codelists/models.py index 6ae54674..633b1cce 100644 --- a/codelists/models.py +++ b/codelists/models.py @@ -1,3 +1,4 @@ +import hashlib from collections import Counter from django.contrib.auth.models import AnonymousUser @@ -553,6 +554,14 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True): table = self.table return rows_to_csv_data(table) + def csv_data_sha(self): + """ + sha of CSV data for download with default parameters. This matches the method + used to hash the CSVs downloaded in a study repo. + """ + data_for_download = self.csv_data_for_download().encode() + return hashlib.sha1(data_for_download).hexdigest() + def table_with_fixed_headers(self, include_mapped_vmps=True): """ Find the code and term columns from csv data (which may be labelled with different diff --git a/codelists/tests/test_api.py b/codelists/tests/test_api.py index 3e98922c..47e25542 100644 --- a/codelists/tests/test_api.py +++ b/codelists/tests/test_api.py @@ -1,6 +1,8 @@ import json from datetime import datetime +import pytest + from codelists.actions import update_codelist from mappings.dmdvmpprevmap.models import Mapping as VmpPrevMapping from opencodelists.tests.assertions import assert_difference, assert_no_difference @@ -468,3 +470,205 @@ def test_dmd_mapping_with_data(client, organisation): ["22", "01"], ["22", "11"], ] + + +@pytest.mark.parametrize( + "codelists,manifest,error", + [ + ("user/foo/codelist-foo/1234", "[foo]", "Codelists manifest file is invalid"), + ( + "user/foo/codelist-foo/1234", + '{"files": {"id": "user/foo/codelist-foo/1234", "sha": "bar"}}', + "user/foo/codelist-foo/1234 could not be found", + ), + ( + "foo/codelist-foo/1234", + '{"files": {"id": "foo/codelist-foo/1234", "sha": "bar"}}', + "foo/codelist-foo/1234 could not be found", + ), + ( + "foo/bar/codelist-foo/1234/", + '{"files": {"id": "foo/bar/codelist-foo/1234", "sha": "bar"}}', + "foo/bar/codelist-foo/1234 does not match expected codelist pattern", + ), + ], +) +def test_codelists_check_bad_input_files(client, codelists, manifest, error): + data = {"codelists": codelists, "manifest": manifest} + resp = client.post("/api/v1/check/", data) + assert resp.json() == {"status": "error", "data": {"error": error}} + + +def test_codelists_check_new_style_codelist_version( + client, user_version, new_style_version +): + user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}" + user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv" + org_codelist_id = f"{new_style_version.organisation.slug}/{new_style_version.codelist.slug}/{new_style_version.hash}" + org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv" + manifest = { + "files": { + user_codelist_csv_id: { + "id": user_codelist_id, + "url": f"https://opencodelist.org/codelists/{user_codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": user_version.csv_data_sha(), + }, + org_codelist_csv_id: { + "id": org_codelist_id, + "url": f"https://opencodelist.org/codelists/{org_codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": new_style_version.csv_data_sha(), + }, + } + } + data = { + "codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n", + "manifest": json.dumps(manifest), + } + resp = client.post("/api/v1/check/", data) + assert resp.json() == {"status": "ok"} + + +def test_codelists_check_old_style_codelist_version(client, old_style_version): + codelist_id = f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/{old_style_version.hash}" + codelist_csv_id = codelist_id.replace("/", "-") + ".csv" + manifest = { + "files": { + codelist_csv_id: { + "id": codelist_id, + "url": f"https://opencodelist.org/codelists/{codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": old_style_version.csv_data_sha(), + } + } + } + data = {"codelists": codelist_id, "manifest": json.dumps(manifest)} + resp = client.post("/api/v1/check/", data) + assert resp.json() == {"status": "ok"} + + +def test_codelists_check_codelist_with_tag(client, old_style_version): + old_style_version.tag = "v1" + old_style_version.save() + + codelist_id = ( + f"{old_style_version.organisation.slug}/{old_style_version.codelist.slug}/v1" + ) + codelist_csv_id = codelist_id.replace("/", "-") + ".csv" + manifest = { + "files": { + codelist_csv_id: { + "id": codelist_id, + "url": f"https://opencodelist.org/codelists/{codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": old_style_version.csv_data_sha(), + } + } + } + data = {"codelists": codelist_id, "manifest": json.dumps(manifest)} + resp = client.post("/api/v1/check/", data) + assert resp.json() == {"status": "ok"} + + +def test_codelists_check_has_added_codelists( + client, user_version, version_with_no_searches +): + user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}" + user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv" + org_codelist_id = ( + f"{version_with_no_searches.organisation.slug}/" + f"{version_with_no_searches.codelist.slug}/" + f"{version_with_no_searches.hash}" + ) + # Setup: Study repo has added a codelist (version_with_no_searches) in the codelists.txt file, + # but has not run update, so manifest only contains one codelist + manifest = { + "files": { + user_codelist_csv_id: { + "id": user_codelist_id, + "url": f"https://opencodelist.org/codelists/{user_codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": user_version.csv_data_sha(), + }, + } + } + data = { + "codelists": f"# this is a comment line\n{user_codelist_id}\n{org_codelist_id}/\n\n", + "manifest": json.dumps(manifest), + } + resp = client.post("/api/v1/check/", data) + assert resp.json() == { + "status": "error", + "data": {"added": [org_codelist_id], "removed": [], "changed": []}, + } + + +def test_codelists_check_has_removed_codelists( + client, user_version, version_with_no_searches +): + user_codelist_id = f"user/{user_version.user.username}/{user_version.codelist.slug}/{user_version.hash}" + user_codelist_csv_id = user_codelist_id.replace("/", "-") + ".csv" + org_codelist_id = ( + f"{version_with_no_searches.organisation.slug}/" + f"{version_with_no_searches.codelist.slug}/" + f"{version_with_no_searches.hash}" + ) + org_codelist_csv_id = org_codelist_id.replace("/", "-") + ".csv" + + # Setup: Study repo has remove a codelist (version_with_no_searches) in the codelists.txt file, + # but has not run update, so manifest contains 2 codelists, codelists.txt contains only one + manifest = { + "files": { + user_codelist_csv_id: { + "id": user_codelist_id, + "url": f"https://opencodelist.org/codelists/{user_codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": user_version.csv_data_sha(), + }, + org_codelist_csv_id: { + "id": org_codelist_id, + "url": f"https://opencodelist.org/codelists/{org_codelist_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": version_with_no_searches.csv_data_sha(), + }, + } + } + data = {"codelists": user_codelist_id, "manifest": json.dumps(manifest)} + resp = client.post("/api/v1/check/", data) + assert resp.json() == { + "status": "error", + "data": {"added": [], "removed": [org_codelist_id], "changed": []}, + } + + +def test_codelists_check_changes(client, dmd_version_asthma_medication): + codelist_id = ( + f"{dmd_version_asthma_medication.organisation.slug}/" + f"{dmd_version_asthma_medication.codelist.slug}/" + f"{dmd_version_asthma_medication.hash}" + ) + codelist_csv_id = codelist_id.replace("/", "-") + ".csv" + + # Test the happy path for this dmd version + manifest = { + "files": { + codelist_csv_id: { + "id": codelist_id, + "url": f"https://opencodelist.org/codelists/{codelist_csv_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": dmd_version_asthma_medication.csv_data_sha(), + }, + } + } + data = {"codelists": codelist_id, "manifest": json.dumps(manifest)} + resp = client.post("/api/v1/check/", data) + assert resp.json() == {"status": "ok"} + # Add a VMP mapping which will be added into the CSV download + VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999") + resp = client.post("/api/v1/check/", data) + + assert resp.json() == { + "status": "error", + "data": {"added": [], "removed": [], "changed": [codelist_id]}, + }