Skip to content

Commit

Permalink
Merge pull request #1704 from opensafely-core/csv-data-sha
Browse files Browse the repository at this point in the history
Replace line endings before hashing CSV data
  • Loading branch information
rebkwok authored Oct 25, 2023
2 parents 25d12a4 + f80bb76 commit 7ab8785
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
6 changes: 4 additions & 2 deletions codelists/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,9 +558,11 @@ def csv_data_sha(self):
"""
SHA-1 hash of the CSV data for download with default parameters. This matches the
method used to hash the CSVs downloaded in a study repo.
To stop different operating systems' line endings from affecting the hash,
opensafely-cli splits the data into lines and rejoins them before hashing; we do the same here.
"""
data_for_download = self.csv_data_for_download().encode()
return hashlib.sha1(data_for_download).hexdigest()
data_for_download = "\n".join(self.csv_data_for_download().splitlines())
return hashlib.sha1(data_for_download.encode()).hexdigest()

def table_with_fixed_headers(self, include_mapped_vmps=True):
"""
Expand Down
30 changes: 30 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
import json
from datetime import datetime

Expand Down Expand Up @@ -672,3 +673,32 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
"status": "error",
"data": {"added": [], "removed": [], "changed": [codelist_id]},
}


def test_codelists_check_sha(version_with_no_searches):
    """Check that csv_data_sha hashes the CSV download with normalised line endings."""
    # The raw download terminates every row (including the last) with \r\n.
    raw_download = version_with_no_searches.csv_data_for_download()
    assert raw_download == (
        "code,term\r\n"
        "128133004,Disorder of elbow\r\n"
        "156659008,(Epicondylitis &/or tennis elbow) or (golfers' elbow)\r\n"
        "239964003,Soft tissue lesion of elbow region\r\n"
        "35185008,Enthesopathy of elbow region\r\n"
        "429554009,Arthropathy of elbow\r\n"
        "73583000,Epicondylitis\r\n"
    )

    # opensafely-cli guards against OS-specific line endings by splitting the
    # CSV into lines and rejoining them before hashing, which leaves "\n"
    # separators and no trailing newline. csv_data_sha has to produce the same
    # digest as hashing that normalised text.
    normalised_csv = (
        "code,term\n"
        "128133004,Disorder of elbow\n"
        "156659008,(Epicondylitis &/or tennis elbow) or (golfers' elbow)\n"
        "239964003,Soft tissue lesion of elbow region\n"
        "35185008,Enthesopathy of elbow region\n"
        "429554009,Arthropathy of elbow\n"
        "73583000,Epicondylitis"
    )
    actual_sha = version_with_no_searches.csv_data_sha()
    expected_sha = hashlib.sha1(normalised_csv.encode()).hexdigest()
    assert actual_sha == expected_sha

0 comments on commit 7ab8785

Please sign in to comment.