diff --git a/app/workers/alert_check_worker.rb b/app/workers/alert_check_worker.rb
new file mode 100644
index 000000000..8990994cc
--- /dev/null
+++ b/app/workers/alert_check_worker.rb
@@ -0,0 +1,24 @@
+class AlertCheckWorker < ApplicationWorker
+  include SearchAlertList
+
+  def perform(document_type)
+    content_items = get_alert_content_items(document_type:)
+    delivered = 0
+    content_items.each do |ci|
+      if any_emails_delivered_for?(ci[:content_id], ci[:valid_from])
+        delivered += 1
+      else
+        Rails.logger.warn("Couldn't find any delivered emails for #{document_type.titleize}s with content id #{ci[:content_id]} (at #{ci[:url]})")
+      end
+    end
+
+    Rails.logger.info("Checking #{document_type.titleize}s: #{delivered} out of #{content_items.count} alerts have been delivered to at least one recipient")
+
+    Rails.cache.write("current_#{document_type}s", content_items.count, expires_in: 15.minutes)
+    Rails.cache.write("delivered_#{document_type}s", delivered, expires_in: 15.minutes)
+  end
+
+  def any_emails_delivered_for?(content_id, valid_from)
+    Email.where("notify_status = 'delivered' AND content_id = ? AND created_at > ?", content_id, valid_from).exists?
+  end
+end
diff --git a/config/initializers/prometheus.rb b/config/initializers/prometheus.rb
index 4b842d841..7c2d42743 100644
--- a/config/initializers/prometheus.rb
+++ b/config/initializers/prometheus.rb
@@ -1,2 +1,4 @@
 require "govuk_app_config/govuk_prometheus_exporter"
-GovukPrometheusExporter.configure
+require "collectors/global_prometheus_collector"
+
+GovukPrometheusExporter.configure(collectors: [Collectors::GlobalPrometheusCollector])
diff --git a/config/sidekiq.yml b/config/sidekiq.yml
index 17bfa1397..4422ea388 100644
--- a/config/sidekiq.yml
+++ b/config/sidekiq.yml
@@ -39,3 +39,11 @@
   recover_lost_jobs:
     every: '30m'
     class: RecoverLostJobsWorker
+  check_medical_safety_alerts:
+    every: '15m'
+    class: AlertCheckWorker
+    args: ["medical_safety_alert"]
+  check_travel_advice_alerts:
+    every: '15m'
+    class: AlertCheckWorker
+    args: ["travel_advice_alert"]
diff --git a/docs/alert_check_scheduled_jobs.md b/docs/alert_check_scheduled_jobs.md
new file mode 100644
index 000000000..9374633c1
--- /dev/null
+++ b/docs/alert_check_scheduled_jobs.md
@@ -0,0 +1,44 @@
+# Alert Check Scheduled Jobs
+
+Two sets of alerts (Medical Safety Alerts and Travel Advice Alerts) are critical, so it's important to check that they have gone out correctly. To do this we have a pair of scheduled Sidekiq jobs. These jobs use the public Search API to find a list of relevant Content IDs, then iterate over that list checking for the existence of at least one Email record which has that Content ID and a notify_status of "delivered", and which was created after the public update time. Via the Prometheus collector, we return two metrics: the number of current alerts and the number of those alerts that have at least one delivery record. Ideally those numbers should be the same (0 & 0 is okay, 2 & 2 is okay, but 2 & 1 is an alert state).
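+
+For example, a check can be run by hand from a Rails console and the resulting counts read back from the cache (a quick sketch; the values shown are illustrative):
+
+```ruby
+# Run one check cycle for a document type (the same argument as configured in sidekiq.yml)
+AlertCheckWorker.new.perform("medical_safety_alert")
+
+# The Prometheus collector reads the resulting counts back out of the Rails cache
+Rails.cache.read("current_medical_safety_alerts")   # => 2
+Rails.cache.read("delivered_medical_safety_alerts") # => 2 (a lower number here is an alert state)
+```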
+
+The jobs run every 15 minutes, and check for alerts that meet the following criteria from the Search API results:
+- document type is `medical_safety_alert` or `travel_advice`
+- is in the 50 most recently updated documents of that type
+- created at least one hour ago (very new alerts are excluded)
+- created no more than 2 days ago (older alerts are excluded)
+
+## Multiple changes to a content item
+
+It is possible that a given travel advice item or medical alert might be updated more than once in two days, but there's no way to know about that through the Search API (we can only find out the last update to a given page). In that case we can only check that emails were delivered after the last public update. The 1 hour lag before an alert becomes checkable, plus the 15 minute update frequency, means that if a change happens between 60 and 75 minutes after the previous change, the system can only confirm that emails were sent out for the most recent update; if further proof of delivery for the previous update is needed, a developer will have to check it manually. If the second change happens more than 75 minutes after the first, both changes will have been checked by separate runs of the job.
+
+## Dependencies
+
+This system depends on the structure of the Search API's results, so radical changes to that format may cause problems.
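+
+As an illustration of what the jobs rely on, here is a trimmed sketch of a single result from the Search API query made in `SearchAlertList` (the shape mirrors the test stubs in this change; the values are made up):
+
+```ruby
+# Query: /search.json?count=50&fields=content_id,link,public_timestamp&filter_format=medical_safety_alert&order=-public_timestamp
+# Only the fields used by SearchAlertList are shown below.
+{
+  "results" => [
+    {
+      "content_id" => "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",
+      "link" => "/drug-device-alerts/example-alert",
+      "public_timestamp" => "2023-01-01T10:00:00Z",
+    },
+  ],
+}
+```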
results["results"].map do |result| + publish_time = Time.zone.parse(result["public_timestamp"]) + if publish_time.between?(Time.zone.now - MAX_AGE, Time.zone.now - MIN_AGE) + { content_id: result["content_id"], valid_from: publish_time, url: result["link"] } + end + end + + output.compact + end +end diff --git a/spec/support/search_alert_list_helpers.rb b/spec/support/search_alert_list_helpers.rb new file mode 100644 index 000000000..7f006880e --- /dev/null +++ b/spec/support/search_alert_list_helpers.rb @@ -0,0 +1,33 @@ +module SearchAlertListHelpers + def medical_safety_alert_search_body(content_id:, public_timestamp:) + { + results: [ + { + content_id:, + link: "/drug-device-alerts/test", + public_timestamp:, + index: "govuk", + es_score: nil, + model_score: nil, + original_rank: nil, + combined_score: nil, + _id: "/drug-device-alerts/test", + elasticsearch_type: "medical_safety_alert", + document_type: "medical_safety_alert", + }, + ], + total: 1, + start: 0, + aggregates: {}, + suggested_queries: [], + suggested_autocomplete: [], + es_cluster: "A", + reranked: false, + } + end + + def stub_medical_safety_alert_feed(content_id:, age:) + stub_request(:get, "http://search-api.dev.gov.uk/search.json?count=50&fields=content_id,link,public_timestamp&filter_format=medical_safety_alert&order=-public_timestamp") + .to_return(status: 200, body: medical_safety_alert_search_body(content_id:, public_timestamp: (Time.zone.now - age).strftime("%Y-%m-%dT%H:%M:%SZ")).to_json, headers: {}) + end +end diff --git a/spec/workers/alert_check_worker_spec.rb b/spec/workers/alert_check_worker_spec.rb new file mode 100644 index 000000000..088f40c9e --- /dev/null +++ b/spec/workers/alert_check_worker_spec.rb @@ -0,0 +1,71 @@ +RSpec.describe AlertCheckWorker do + include SearchAlertListHelpers + + describe "#perform", caching: true do + def perform + described_class.new.perform("medical_safety_alert") + end + + context "there are no alerts older than an hour" do + before { stub_medical_safety_alert_feed(content_id: SecureRandom.uuid, age: 30.minutes) } + + it "should put 0/0 in the cache" do + expect(Rails.cache).to receive(:write).with("current_medical_safety_alerts", 0, expires_in: 15.minutes) + expect(Rails.cache).to receive(:write).with("delivered_medical_safety_alerts", 0, expires_in: 15.minutes) + perform + end + end + + context "there are no alerts younger than 2 days" do + before { stub_medical_safety_alert_feed(content_id: SecureRandom.uuid, age: 3.days) } + + it "should put 0/0 in the cache" do + expect(Rails.cache).to receive(:write).with("current_medical_safety_alerts", 0, expires_in: 15.minutes) + expect(Rails.cache).to receive(:write).with("delivered_medical_safety_alerts", 0, expires_in: 15.minutes) + perform + end + end + + context "there is a valid alert with delivered emails" do + before do + content_id = SecureRandom.uuid + stub_medical_safety_alert_feed(content_id:, age: 2.hours) + create(:email, content_id:, notify_status: "delivered") + end + + it "should put 1/1 in the cache" do + expect(Rails.cache).to receive(:write).with("current_medical_safety_alerts", 1, expires_in: 15.minutes) + expect(Rails.cache).to receive(:write).with("delivered_medical_safety_alerts", 1, expires_in: 15.minutes) + perform + end + end + + context "there is a valid alerts with undelivered emails" do + before do + content_id = SecureRandom.uuid + stub_medical_safety_alert_feed(content_id:, age: 2.hours) + create(:email, content_id:, notify_status: nil) + end + + it "should put 1/0 in the cache" do + 
+        expect(Rails.cache).to receive(:write).with("current_medical_safety_alerts", 1, expires_in: 15.minutes)
+        expect(Rails.cache).to receive(:write).with("delivered_medical_safety_alerts", 0, expires_in: 15.minutes)
+        perform
+      end
+    end
+
+    context "there is a valid alert with delivered emails that are too old to be valid" do
+      before do
+        content_id = SecureRandom.uuid
+        stub_medical_safety_alert_feed(content_id:, age: 2.hours)
+        create(:email, content_id:, notify_status: "delivered", created_at: Time.zone.now - 3.hours)
+      end
+
+      it "should put 1/0 in the cache" do
+        expect(Rails.cache).to receive(:write).with("current_medical_safety_alerts", 1, expires_in: 15.minutes)
+        expect(Rails.cache).to receive(:write).with("delivered_medical_safety_alerts", 0, expires_in: 15.minutes)
+        perform
+      end
+    end
+  end
+end