Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add audit task for Subscriber Lists #1988

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ gem "ratelimit"
gem "redcarpet"
gem "sentry-sidekiq"
gem "sidekiq-scheduler"
gem "sitemap-parser"
gem "with_advisory_lock"

group :test do
Expand Down
8 changes: 8 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ GEM
erubi (1.12.0)
et-orbi (1.2.7)
tzinfo
ethon (0.16.0)
ffi (>= 1.15.0)
expgen (0.1.1)
parslet
factory_bot (6.4.5)
Expand Down Expand Up @@ -660,6 +662,9 @@ GEM
simplecov_json_formatter (~> 0.1)
simplecov-html (0.12.3)
simplecov_json_formatter (0.1.4)
sitemap-parser (0.5.6)
nokogiri (~> 1.6)
typhoeus (>= 0.6, < 2.0)
snaky_hash (2.0.1)
hashie
version_gem (~> 1.1, >= 1.1.1)
Expand All @@ -674,6 +679,8 @@ GEM
timeout (0.4.1)
tins (1.32.1)
sync
typhoeus (1.4.0)
ethon (>= 0.9.0)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unicode-display_width (2.5.0)
Expand Down Expand Up @@ -734,6 +741,7 @@ DEPENDENCIES
sentry-sidekiq
sidekiq-scheduler
simplecov
sitemap-parser
webmock
with_advisory_lock

Expand Down
9 changes: 9 additions & 0 deletions app/models/subscriber_list_audit.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# One row per subscriber list, counting how many audited GOV.UK pages
# would trigger that list. Populated by SubscriberListAuditWorker.
class SubscriberListAudit < ApplicationRecord
  belongs_to :subscriber_list

  # Record one more page reference for the given subscriber list.
  #
  # Uses an atomic SQL increment (update_counters) instead of
  # read-modify-write (`reference_count += 1; save!`), so concurrent
  # audit workers incrementing the same list cannot lose updates.
  def self.increment_count(subscriber_list)
    record = find_or_create_by!(subscriber_list:)
    update_counters(record.id, reference_count: 1)
  end
end
23 changes: 23 additions & 0 deletions app/workers/subscriber_list_audit_worker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
require "content_item_list_query"

# Audits a batch of GOV.UK URLs: for each URL whose content item would
# trigger an email alert, bumps the audit count of every subscriber
# list that a major change to it would match.
class SubscriberListAuditWorker < ApplicationWorker
  sidekiq_options queue: :subscriber_list_audit

  # @param url_batch [Array<String>] absolute GOV.UK URLs from the sitemap
  def perform(url_batch)
    store = GdsApi.content_store

    url_batch.each do |url|
      # URI.parse stays outside the rescue: a malformed URL is a batch
      # construction error and should surface, matching prior behavior.
      audit_path(store, URI.parse(url).path)
    end
  end

  private

  # Fetch the content item at `path` and record matching lists.
  def audit_path(store, path)
    content_item = store.content_item(path).to_hash

    return unless EmailAlertCriteria.new(content_item:).would_trigger_alert?

    ContentItemListQuery.new(path, false)
      .lists_by_content_item(content_item)
      .each { |list| SubscriberListAudit.increment_count(list) }
  rescue StandardError
    # Deliberate best-effort: a failure on one URL must not crash the batch.
  end
end
1 change: 1 addition & 0 deletions config/sidekiq.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- [email_generation_digest, 4] # To generate emails to users with daily or weekly subscriptions for content changes and messages.
- [send_email_immediate, 2] # To send emails to users with immediate subscriptions.
- [send_email_digest, 2] # To send digest emails to users with either daily or weekly subscriptions.
- [subscriber_list_audit, 1] # To crawl the GOV.UK site for audit purposes.
- [default, 1] # Miscellaneous operations e.g. initiating digest runs, monitoring, DB cleanup and recovering lost Sidekiq jobs.
:scheduler:
:schedule:
Expand Down
10 changes: 10 additions & 0 deletions db/migrate/20230823081626_create_subscriber_list_audit.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Creates the subscriber_list_audits table, used by the subscriber list
# audit rake tasks to count how many audited GOV.UK pages reference
# each subscriber list.
class CreateSubscriberListAudit < ActiveRecord::Migration[7.0]
  def change
    create_table :subscriber_list_audits do |t|
      # The list being counted (indexed, FK to subscriber_lists).
      t.references :subscriber_list, null: false, foreign_key: true
      # Number of audited pages that would trigger this list.
      t.integer :reference_count, default: 0

      t.timestamps
    end
  end
end
9 changes: 9 additions & 0 deletions db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@
t.index ["sender_message_id"], name: "index_messages_on_sender_message_id", unique: true
end

create_table "subscriber_list_audits", force: :cascade do |t|
t.bigint "subscriber_list_id", null: false
t.integer "reference_count", default: 0
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["subscriber_list_id"], name: "index_subscriber_list_audits_on_subscriber_list_id"
end

create_table "subscriber_lists", id: :serial, force: :cascade do |t|
t.text "title", null: false
t.datetime "created_at", precision: nil
Expand Down Expand Up @@ -207,6 +215,7 @@
add_foreign_key "matched_content_changes", "subscriber_lists", on_delete: :cascade
add_foreign_key "matched_messages", "messages", on_delete: :cascade
add_foreign_key "matched_messages", "subscriber_lists", on_delete: :cascade
add_foreign_key "subscriber_list_audits", "subscriber_lists"
add_foreign_key "subscription_contents", "content_changes", on_delete: :restrict
add_foreign_key "subscription_contents", "digest_run_subscribers", on_delete: :cascade
add_foreign_key "subscription_contents", "emails", on_delete: :cascade
Expand Down
52 changes: 52 additions & 0 deletions docs/subscriber_list_audit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Subscriber List Audit

## Background

At the moment it's possible to get into a state where subscriber lists exist
but cannot be triggered:
- if the keys or fields used to trigger the list are out-of-date with real
content,
- if a format is fully retired without the correct manual intervention to
clean up the lists,
- if content is unpublished and the automatic list cleanup code fails.

To determine whether untriggerable lists exist, we can run an audit over the
gov.uk website using the sitemap. We iterate over the sitemap, getting the
content item for each path and then comparing it against the EmailAlertCriteria
(to check if the path would _ever_ trigger a subscriber list), then if it passes
that test against the ContentItemListQuery, which returns a list of SubscriberLists
that would be triggered by major changes to that path. If positive, we store a
record in the SubscriberListAudit table. At the end of the process, we can compare
the list of SubscriberLists against the list in the audit table, and if any are
missing we know they can't be triggered.

## Running the audit

This is a long-running job which needs to be run asynchronously. To kick it off, use:

```bash
kubectl -n apps exec -it deploy/email-alert-api -- rake 'subscriber_list_audit:start'
```

NB: It will probably only reliably run in production, as the time taken to iterate
over the site may be more than a day, which in integration will be interrupted
by the environment sync.

To check the number of workers remaining on the job:

```bash
kubectl -n apps exec -it deploy/email-alert-api -- rake 'subscriber_list_audit:queue_size'
```

When there are no workers remaining, you can run:

```bash
kubectl -n apps exec -it deploy/email-alert-api -- rake 'subscriber_list_audit:report'
```

to get a list of all subscriber lists without matching SubscriberListAudit records.

## Notes

The audit won't automatically clear down the SubscriberListAudit table, but it will abort
if the table isn't empty when it starts the workers.
75 changes: 75 additions & 0 deletions lib/content_item_list_query.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Given a GOV.UK path (or a content item hash), finds the SubscriberLists
# that a major change to that content would trigger, by translating the
# content item's tags/links/document type into a SubscriberListQuery.
class ContentItemListQuery
  attr_reader :govuk_path, :draft

  # @param govuk_path [String] path of the content item on GOV.UK
  # @param draft [Boolean] when true, query the draft content store
  def initialize(govuk_path, draft)
    @govuk_path = govuk_path
    @draft = draft
  end

  # Fetches the content item for govuk_path and returns the matching lists.
  def lists
    lists_by_content_item(content_store_client.content_item(govuk_path).to_hash)
  end

  # @param content_item [Hash] a content item already fetched from the
  #   content store (as a plain hash)
  # @return [Array<SubscriberList>]
  def lists_by_content_item(content_item)
    SubscriberListQuery.new(
      content_id: content_item["content_id"],
      tags: tags_from_content_item(content_item),
      links: links_from_content_item(content_item),
      document_type: content_item["document_type"],
      email_document_supertype: supertypes(content_item)["email_document_supertype"],
      government_document_supertype: supertypes(content_item)["government_document_supertype"],
    ).lists
  end

  def content_store_client
    return GdsApi.content_store unless draft

    GdsApi::ContentStore.new(Plek.find("draft-content-store"), bearer_token: "")
  end

  def supertypes(content_item)
    GovukDocumentTypes.supertypes(document_type: content_item["document_type"])
  end

  def tags_from_content_item(content_item)
    details = content_item["details"]
    # Bug fix: this previously used Hash#key("tags") — a reverse *value*
    # lookup — instead of Hash#key?, so tags were effectively never merged.
    return {} unless details&.key?("tags")

    details["tags"].merge(additional_items(content_item))
  end

  def links_from_content_item(content_item)
    compressed_links(content_item).merge(additional_items(content_item).merge("taxon_tree" => taxon_tree(content_item)))
  end

  # Reduces each link group to an array of content_ids, dropping groups
  # that never affect subscriber list matching.
  def compressed_links(content_item)
    links = content_item["links"] || {}
    keys = links.keys - %w[available_translations taxons]
    keys.each_with_object({}) do |key, compressed|
      compressed[key] = links[key].collect { |link| link["content_id"] }
    end
  end

  def additional_items(content_item)
    { "content_store_document_type" => content_item["document_type"] }.merge(supertypes(content_item))
  end

  # Returns the content_ids of the first taxon and all its ancestors.
  # NOTE(review): only the first taxon's branch is followed — presumably
  # intentional, but worth confirming for multi-taxon content.
  def taxon_tree(content_item)
    taxons = (content_item["links"] || {})["taxons"]
    return [] if taxons.nil? || taxons.empty?

    [taxons.first["content_id"]] + get_parent_links(taxons.first)
  end

  # Recursively collects content_ids of all ancestor taxons.
  def get_parent_links(taxon_struct)
    parents = (taxon_struct["links"] || {})["parent_taxons"]
    return [] if parents.nil? || parents.empty?

    parents.flat_map { |parent| [parent["content_id"]] + get_parent_links(parent) }
  end
end
1 change: 1 addition & 0 deletions lib/reports/future_content_change_statistics_report.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require "reports/concerns/notification_stats"
require "content_item_list_query"

class Reports::FutureContentChangeStatisticsReport
include Reports::Concerns::NotificationStats
Expand Down
44 changes: 44 additions & 0 deletions lib/tasks/subscriber_list_audit.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
require "sidekiq/api"
require "sitemap-parser"

namespace :subscriber_list_audit do
  desc "Create audit workers"
  task :start, %i[batch_size limit] => :environment do |_t, args|
    batch_size = (args[:batch_size] || "10").to_i
    # limit == -1 means "no limit": crawl every sitemap part in full.
    limit = args[:limit] ? args[:limit].to_i : -1

    # The report only makes sense against a clean table, and the audit
    # never clears it down itself, so refuse to start otherwise.
    abort("SubscriberListAudit not empty") if SubscriberListAudit.any?

    puts("Creating audit workers with batch size #{batch_size}, limit #{limit}")

    # GOV.UK currently publishes 29 numbered sitemap parts; a limited run
    # only needs the first part.
    sitemaps = limit == -1 ? 29 : 1

    (1..sitemaps).each do |i|
      sitemap = SitemapParser.new("#{Plek.website_root}/sitemaps/sitemap_#{i}.xml")
      urls = sitemap.to_a

      urls = urls.take(limit) if limit != -1

      puts("Read #{urls.count} URLs from sitemap part #{i}")

      urls.each_slice(batch_size) do |batch|
        SubscriberListAuditWorker.perform_async(batch)
      end
    end

    puts("Batch workers created. Use rails subscriber_list_audit:queue_size to monitor queue size")
  end

  desc "Check audit worker queue size"
  task queue_size: :environment do
    audit_queue = Sidekiq::Queue.new(:subscriber_list_audit)
    puts("#{audit_queue.size} jobs remaining to be processed")
  end

  desc "Show Subscriber Lists that cannot be triggered"
  task report: :environment do
    # Bug fix: `SubscriberList.all do ... end` passed a block that
    # ActiveRecord::Relation silently ignores, so the report printed
    # nothing. find_each iterates in batches without loading every row.
    SubscriberList.find_each do |subscriber_list|
      puts("#{subscriber_list.id}: #{subscriber_list.title}") unless SubscriberListAudit.exists?(subscriber_list:)
    end
  end
end