From 93f64122fadcbb11f88e5c1a79a00b31bbed741b Mon Sep 17 00:00:00 2001 From: jtmst Date: Mon, 4 Nov 2024 15:00:18 -0500 Subject: [PATCH] added last update field --- tasks/unredact.py | 53 ++++++++++++++- tests/test_unredact.py | 150 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 3 deletions(-) diff --git a/tasks/unredact.py b/tasks/unredact.py index ac2b9ad..f1af8d3 100644 --- a/tasks/unredact.py +++ b/tasks/unredact.py @@ -1,5 +1,5 @@ import json - +from datetime import datetime, timezone from invoke import task from .helpers import get_volumes_metadata, get_reporter_volumes_metadata, R2_STATIC_BUCKET, R2_UNREDACTED_BUCKET, \ @@ -80,11 +80,14 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d if volumes_to_unredact and not dry_run: volumes_metadata = json.loads(get_volumes_metadata(R2_STATIC_BUCKET)) + # Format example: 2024-01-01T00:00:00+00:00 + current_time = datetime.now(timezone.utc).isoformat() for item in volumes_to_unredact: for volume in volumes_metadata: if item["id"] == volume["id"]: volume["redacted"] = False + volume["last_updated"] = current_time r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(volumes_metadata), Key="VolumesMetadata.json", ContentType="application/json") @@ -95,6 +98,7 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d for volume in reporter_volumes_metadata: if item["id"] == volume["id"]: volume["redacted"] = False + volume["last_updated"] = current_time r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(reporter_volumes_metadata), Key=f"{reporter}/VolumesMetadata.json", ContentType="application/json") @@ -106,6 +110,7 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d for volume in reporter_volumes_metadata: if item["id"] == volume["id"]: volume["redacted"] = False + volume["last_updated"] = current_time r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, 
def _stamp_missing_last_updated(volumes, timestamp):
    """Set ``last_updated`` to *timestamp* on each volume lacking it; return the number stamped."""
    stamped = 0
    for volume in volumes:
        if "last_updated" not in volume:
            volume["last_updated"] = timestamp
            stamped += 1
    return stamped


@task
def add_last_updated_field(ctx, dry_run=False):
    """Backfill the ``last_updated`` field in every VolumesMetadata.json file.

    The main ``VolumesMetadata.json`` is processed first, then one
    ``{reporter}/VolumesMetadata.json`` per distinct ``reporter_slug`` found in
    the main file. Volumes that already carry ``last_updated`` keep their
    existing value; all volumes stamped in one run share the same timestamp.

    Args:
        ctx: Invoke context (unused, required by the ``@task`` signature).
        dry_run: When True, only print how many volumes would be updated and
            upload nothing.

    A failure while processing one reporter is printed and does not stop the
    remaining reporters. A file in which no volume needed stamping is not
    re-uploaded.
    """
    # Timezone-aware UTC timestamp, e.g. 2024-01-01T00:00:00+00:00
    current_time = datetime.now(timezone.utc).isoformat()

    # Update main VolumesMetadata.json
    volumes_metadata = json.loads(get_volumes_metadata(R2_STATIC_BUCKET))
    updated_count = _stamp_missing_last_updated(volumes_metadata, current_time)

    print(f"Would update {updated_count} volumes in main VolumesMetadata.json")

    # Skip the upload when nothing changed so a re-run is a true no-op.
    if updated_count and not dry_run:
        r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(volumes_metadata),
                                Key="VolumesMetadata.json", ContentType="application/json")
        print("Updated main VolumesMetadata.json")

    # Update reporter-specific metadata files.
    # Volumes without a reporter_slug have no reporter file to locate, so they
    # are skipped here instead of raising KeyError after the main file was
    # already written. Sorting keeps log and upload order reproducible.
    reporters = sorted({vol["reporter_slug"] for vol in volumes_metadata if vol.get("reporter_slug")})

    for reporter in reporters:
        # Deliberately broad: this is a best-effort backfill, and one broken
        # reporter file must not prevent the others from being processed.
        try:
            reporter_metadata = json.loads(get_reporter_volumes_metadata(R2_STATIC_BUCKET, reporter))
            reporter_updated_count = _stamp_missing_last_updated(reporter_metadata, current_time)

            print(f"Would update {reporter_updated_count} volumes in {reporter}/VolumesMetadata.json")

            if reporter_updated_count and not dry_run:
                r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(reporter_metadata),
                                        Key=f"{reporter}/VolumesMetadata.json", ContentType="application/json")
                print(f"Updated {reporter}/VolumesMetadata.json")

        except Exception as e:
            print(f"Error processing {reporter}: {e}")
"""Tests for the ``add_last_updated_field`` task in ``tasks.unredact``."""
import json
from datetime import datetime
from unittest.mock import ANY, Mock, patch

import pytest
from invoke import Context

from tasks.unredact import add_last_updated_field


@pytest.fixture
def mock_context():
    """Stand-in for the Invoke context passed to the task."""
    return Mock(spec=Context)


@pytest.fixture
def sample_volumes_metadata():
    """Main metadata: one volume without ``last_updated`` and one that already has it."""
    return [
        {
            "volume_number": "1",
            "id": "32044078646858",
            "redacted": False,
            "reporter_slug": "ad",
        },
        {
            "volume_number": "2",
            "id": "32044078646940",
            "redacted": False,
            "reporter_slug": "ad",
            "last_updated": "2024-01-01T00:00:00+00:00",
        },
    ]


@pytest.fixture
def sample_reporter_metadata():
    """Reporter-level metadata mirroring the main file, without ``reporter_slug``."""
    return [
        {"volume_number": "1", "id": "32044078646858", "redacted": False},
        {
            "volume_number": "2",
            "id": "32044078646940",
            "redacted": False,
            "last_updated": "2024-01-01T00:00:00+00:00",
        },
    ]


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_dry_run(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    sample_reporter_metadata,
    capsys,
):
    """A dry run reports the pending changes and uploads nothing."""
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.return_value = json.dumps(sample_reporter_metadata)

    add_last_updated_field(mock_context, dry_run=True)

    captured = capsys.readouterr()

    mock_r2_client.put_object.assert_not_called()

    assert "Would update 1 volumes in main VolumesMetadata.json" in captured.out
    assert "Would update 1 volumes in ad/VolumesMetadata.json" in captured.out


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_actual_update(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    sample_reporter_metadata,
):
    """A real run uploads both files, stamps missing fields and preserves existing data."""
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.return_value = json.dumps(sample_reporter_metadata)

    add_last_updated_field(mock_context, dry_run=False)

    mock_r2_client.put_object.assert_any_call(
        Bucket=ANY,
        Body=ANY,
        Key="VolumesMetadata.json",
        ContentType="application/json",
    )

    mock_r2_client.put_object.assert_any_call(
        Bucket=ANY,
        Body=ANY,
        Key="ad/VolumesMetadata.json",
        ContentType="application/json",
    )

    # Exactly one upload for the main file and one for the single reporter.
    assert mock_r2_client.put_object.call_count == 2

    for put_call in mock_r2_client.put_object.call_args_list:
        uploaded_key = put_call.kwargs["Key"]
        body = json.loads(put_call.kwargs["Body"])
        for volume in body:
            # Verify all volumes have last_updated
            assert "last_updated" in volume

            # Verify existing data is preserved
            if volume["id"] == "32044078646858":
                assert volume["volume_number"] == "1"
                assert volume["redacted"] is False
                # The freshly stamped value must be a parseable,
                # timezone-aware UTC ISO-8601 timestamp, not just present.
                stamped = datetime.fromisoformat(volume["last_updated"])
                assert stamped.tzinfo is not None
                assert stamped.utcoffset().total_seconds() == 0
                if uploaded_key == "VolumesMetadata.json":
                    assert volume["reporter_slug"] == "ad"
            elif volume["id"] == "32044078646940":
                assert volume["volume_number"] == "2"
                assert volume["last_updated"] == "2024-01-01T00:00:00+00:00"
                assert volume["redacted"] is False
                if uploaded_key == "VolumesMetadata.json":
                    assert volume["reporter_slug"] == "ad"


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_handles_errors(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    capsys,
):
    """A failing reporter fetch is reported and does not block the main-file upload."""
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.side_effect = Exception("Failed to get reporter metadata")

    add_last_updated_field(mock_context, dry_run=False)

    captured = capsys.readouterr()

    mock_r2_client.put_object.assert_called_once_with(
        Bucket=ANY,
        Body=ANY,
        Key="VolumesMetadata.json",
        ContentType="application/json",
    )

    assert "Error processing ad: Failed to get reporter metadata" in captured.out