Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added last update field #14

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion tasks/unredact.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json

from datetime import datetime, timezone
from invoke import task

from .helpers import get_volumes_metadata, get_reporter_volumes_metadata, R2_STATIC_BUCKET, R2_UNREDACTED_BUCKET, \
Expand Down Expand Up @@ -80,11 +80,14 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d

if volumes_to_unredact and not dry_run:
volumes_metadata = json.loads(get_volumes_metadata(R2_STATIC_BUCKET))
# Format example: 2024-01-01T00:00:00+00:00
current_time = datetime.now(timezone.utc).isoformat()

for item in volumes_to_unredact:
for volume in volumes_metadata:
if item["id"] == volume["id"]:
volume["redacted"] = False
volume["last_updated"] = current_time

r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(volumes_metadata), Key="VolumesMetadata.json",
ContentType="application/json")
Expand All @@ -95,6 +98,7 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d
for volume in reporter_volumes_metadata:
if item["id"] == volume["id"]:
volume["redacted"] = False
volume["last_updated"] = current_time

r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(reporter_volumes_metadata),
Key=f"{reporter}/VolumesMetadata.json", ContentType="application/json")
Expand All @@ -106,6 +110,7 @@ def update_redacted_field_of_volume(ctx, reporter=None, publication_year=None, d
for volume in reporter_volumes_metadata:
if item["id"] == volume["id"]:
volume["redacted"] = False
volume["last_updated"] = current_time

r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(reporter_volumes_metadata),
Key=f"{item['reporter']}/VolumesMetadata.json", ContentType="application/json")
Expand Down Expand Up @@ -251,3 +256,49 @@ def filter_for_newest_tars():
unique_items.append(newest_item)

return {f"{file['volume_id']}/{file['extension']}/": file for file in unique_items}


@task
def add_last_updated_field(ctx, dry_run=False):
    """Backfill a ``last_updated`` field onto every volume record.

    Rewrites the top-level VolumesMetadata.json and each
    ``<reporter_slug>/VolumesMetadata.json`` in the R2 static bucket,
    stamping any volume that lacks ``last_updated`` with the current UTC
    time in ISO-8601 form. Volumes that already carry the field keep
    their existing timestamp.

    :param ctx: invoke task context (unused, required by @task).
    :param dry_run: when True, only report what would change; nothing is
        written back to R2.
    """
    # One timestamp for the whole run so every backfilled volume agrees.
    # Format example: 2024-01-01T00:00:00+00:00
    current_time = datetime.now(timezone.utc).isoformat()

    # --- top-level VolumesMetadata.json ---
    volumes_metadata = json.loads(get_volumes_metadata(R2_STATIC_BUCKET))
    updated_count = 0

    for volume in volumes_metadata:
        if "last_updated" not in volume:
            volume["last_updated"] = current_time
            updated_count += 1

    if dry_run:
        print(f"Would update {updated_count} volumes in main VolumesMetadata.json")
    else:
        # Previously the "Would update" wording was printed even on real
        # runs, which was misleading; report the actual action instead.
        print(f"Updating {updated_count} volumes in main VolumesMetadata.json")
        r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(volumes_metadata), Key="VolumesMetadata.json",
                                ContentType="application/json")
        print("Updated main VolumesMetadata.json")

    # --- per-reporter metadata files ---
    # Skip records without a reporter_slug instead of raising KeyError,
    # which would abort the backfill after the main file was already
    # written and leave the metadata files inconsistent.
    reporters = {vol.get("reporter_slug") for vol in volumes_metadata} - {None}

    for reporter in reporters:
        try:
            reporter_metadata = json.loads(get_reporter_volumes_metadata(R2_STATIC_BUCKET, reporter))
            reporter_updated_count = 0

            for volume in reporter_metadata:
                if "last_updated" not in volume:
                    volume["last_updated"] = current_time
                    reporter_updated_count += 1

            if dry_run:
                print(f"Would update {reporter_updated_count} volumes in {reporter}/VolumesMetadata.json")
            else:
                print(f"Updating {reporter_updated_count} volumes in {reporter}/VolumesMetadata.json")
                r2_s3_client.put_object(Bucket=R2_STATIC_BUCKET, Body=json.dumps(reporter_metadata),
                                        Key=f"{reporter}/VolumesMetadata.json", ContentType="application/json")
                print(f"Updated {reporter}/VolumesMetadata.json")

        except Exception as e:
            # Deliberate best-effort: one bad reporter file should not
            # stop the remaining reporters from being processed.
            print(f"Error processing {reporter}: {e}")
150 changes: 148 additions & 2 deletions tests/test_unredact.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,148 @@
def test_todo():
    """Placeholder so the suite has at least one passing test."""
    placeholder = True
    assert placeholder
import json
from datetime import datetime
import pytest
from unittest.mock import Mock, patch, ANY
from invoke import Context
from tasks.unredact import add_last_updated_field


@pytest.fixture
def mock_context():
    """Invoke-context double, spec'd so unknown attribute access fails fast."""
    context_stub = Mock(spec=Context)
    return context_stub


@pytest.fixture
def sample_volumes_metadata():
    """Main-metadata fixture: one volume missing last_updated, one with it."""
    volume_without_timestamp = {
        "volume_number": "1",
        "id": "32044078646858",
        "redacted": False,
        "reporter_slug": "ad",
    }
    volume_with_timestamp = {
        "volume_number": "2",
        "id": "32044078646940",
        "redacted": False,
        "reporter_slug": "ad",
        "last_updated": "2024-01-01T00:00:00+00:00",
    }
    return [volume_without_timestamp, volume_with_timestamp]


@pytest.fixture
def sample_reporter_metadata():
    """Reporter-metadata fixture mirroring the main fixture, minus slugs."""
    volume_without_timestamp = {
        "volume_number": "1",
        "id": "32044078646858",
        "redacted": False,
    }
    volume_with_timestamp = {
        "volume_number": "2",
        "id": "32044078646940",
        "redacted": False,
        "last_updated": "2024-01-01T00:00:00+00:00",
    }
    return [volume_without_timestamp, volume_with_timestamp]


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_dry_run(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    sample_reporter_metadata,
    capsys,
):
    """A dry run reports pending changes without writing anything to R2."""
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.return_value = json.dumps(sample_reporter_metadata)

    add_last_updated_field(mock_context, dry_run=True)

    # Nothing may be written back during a dry run.
    mock_r2_client.put_object.assert_not_called()

    output = capsys.readouterr().out
    expected_messages = (
        "Would update 1 volumes in main VolumesMetadata.json",
        "Would update 1 volumes in ad/VolumesMetadata.json",
    )
    for message in expected_messages:
        assert message in output


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_actual_update(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    sample_reporter_metadata,
):
    """A real (non-dry) run writes both metadata files back to R2,
    stamps last_updated on every volume, and leaves all other fields
    and any pre-existing timestamp untouched."""
    # Setup mocks: both metadata fetches return the JSON-serialized fixtures.
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.return_value = json.dumps(sample_reporter_metadata)

    add_last_updated_field(mock_context, dry_run=False)

    # The main metadata file must have been uploaded...
    mock_r2_client.put_object.assert_any_call(
        Bucket=ANY,
        Body=ANY,
        Key="VolumesMetadata.json",
        ContentType="application/json",
    )

    # ...and so must the reporter-specific file ("ad" is the only slug
    # present in the fixture data).
    mock_r2_client.put_object.assert_any_call(
        Bucket=ANY,
        Body=ANY,
        Key="ad/VolumesMetadata.json",
        ContentType="application/json",
    )

    # Inspect every uploaded body to validate the written content.
    calls = mock_r2_client.put_object.call_args_list
    for call in calls:
        body = json.loads(call.kwargs["Body"])
        for volume in body:
            # Verify all volumes have last_updated
            assert "last_updated" in volume

            # Verify existing data is preserved
            if volume["id"] == "32044078646858":
                assert volume["volume_number"] == "1"
                assert volume["redacted"] is False
                # reporter_slug only exists in the main metadata file.
                if call.kwargs["Key"] == "VolumesMetadata.json":
                    assert volume["reporter_slug"] == "ad"
            elif volume["id"] == "32044078646940":
                assert volume["volume_number"] == "2"
                # The pre-existing timestamp must not be overwritten.
                assert volume["last_updated"] == "2024-01-01T00:00:00+00:00"
                assert volume["redacted"] is False
                if call.kwargs["Key"] == "VolumesMetadata.json":
                    assert volume["reporter_slug"] == "ad"


@patch("tasks.unredact.get_volumes_metadata")
@patch("tasks.unredact.get_reporter_volumes_metadata")
@patch("tasks.unredact.r2_s3_client")
def test_add_last_updated_field_handles_errors(
    mock_r2_client,
    mock_get_reporter,
    mock_get_volumes,
    mock_context,
    sample_volumes_metadata,
    capsys,
):
    """A failing reporter-metadata fetch is reported but does not abort the task."""
    mock_get_volumes.return_value = json.dumps(sample_volumes_metadata)
    mock_get_reporter.side_effect = Exception("Failed to get reporter metadata")

    add_last_updated_field(mock_context, dry_run=False)

    # Only the main metadata file should have been written.
    mock_r2_client.put_object.assert_called_once_with(
        Bucket=ANY,
        Body=ANY,
        Key="VolumesMetadata.json",
        ContentType="application/json",
    )

    # The per-reporter failure is surfaced to the operator.
    output = capsys.readouterr().out
    assert "Error processing ad: Failed to get reporter metadata" in output