-
-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(crons): Record historic check-in volume counts #79448
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,12 @@ | |
# This key is used to store the hashmap of Mapping[PartitionKey, Timestamp] | ||
MONITOR_TASKS_PARTITION_CLOCKS = "sentry.monitors.partition_clocks" | ||
|
||
# This key is used to record historical data about the volume of check-ins. | ||
MONITOR_VOLUME_HISTORY = "sentry.monitors.volume_history:{}" | ||
|
||
# We record 30 days worth of historical data for each minute of check-ins. | ||
MONITOR_VOLUME_RETENTION = timedelta(days=30) | ||
|
||
CLOCK_TICK_CODEC: Codec[ClockTick] = get_topic_codec(Topic.MONITORS_CLOCK_TICK) | ||
|
||
|
||
|
@@ -70,6 +76,33 @@ def _dispatch_tick(ts: datetime): | |
_clock_tick_producer.produce(ArroyoTopic(topic), payload) | ||
|
||
|
||
def _make_reference_ts(ts: datetime): | ||
""" | ||
Produce a timestamp number with the seconds and microseconds removed | ||
""" | ||
return int(ts.replace(second=0, microsecond=0).timestamp()) | ||
|
||
|
||
def update_check_in_volume(ts: datetime): | ||
""" | ||
Increment a counter for this particular timestamp trimmed down to the | ||
minute. | ||
|
||
This counter will be used as historical data to help indicate if we may | ||
have had some data-loss (due to an incident) and would want to tick our | ||
clock in a mode where misses and time-outs are created as "unknown". | ||
""" | ||
redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is using |
||
|
||
reference_ts = _make_reference_ts(ts) | ||
key = MONITOR_VOLUME_HISTORY.format(reference_ts) | ||
|
||
pipeline = redis_client.pipeline() | ||
pipeline.incr(key, amount=1) | ||
pipeline.expire(key, MONITOR_VOLUME_RETENTION) | ||
Comment on lines
+101
to
+102
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could just probabilistically set the |
||
pipeline.execute() | ||
|
||
|
||
def try_monitor_clock_tick(ts: datetime, partition: int): | ||
""" | ||
Handles triggering the monitor tasks when we've rolled over the minute. | ||
|
@@ -84,8 +117,7 @@ def try_monitor_clock_tick(ts: datetime, partition: int): | |
|
||
# Trim the timestamp seconds off, these tasks are run once per minute and | ||
# should have their timestamp clamped to the minute. | ||
reference_datetime = ts.replace(second=0, microsecond=0) | ||
reference_ts = int(reference_datetime.timestamp()) | ||
reference_ts = _make_reference_ts(ts) | ||
|
||
# Store the current clock value for this partition. | ||
redis_client.zadd( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,7 @@ | |
from sentry.db.postgres.transactions import in_test_hide_transaction_boundary | ||
from sentry.killswitches import killswitch_matches_context | ||
from sentry.models.project import Project | ||
from sentry.monitors.clock_dispatch import try_monitor_clock_tick | ||
from sentry.monitors.clock_dispatch import try_monitor_clock_tick, update_check_in_volume | ||
from sentry.monitors.constants import PermitCheckInStatus | ||
from sentry.monitors.logic.mark_failed import mark_failed | ||
from sentry.monitors.logic.mark_ok import mark_ok | ||
|
@@ -937,6 +937,7 @@ def process_checkin_group(items: list[CheckinItem]): | |
completely serially. | ||
""" | ||
for item in items: | ||
update_check_in_volume(item.ts) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're doing this here, I think it'd likely be better to just pass all the timestamps to the function in bulk. The single process method can just pass a list of one item. Actually, it'd be even better to just do this in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah yeah good call |
||
process_checkin(item) | ||
|
||
|
||
|
@@ -1009,6 +1010,9 @@ def process_single(message: Message[KafkaPayload | FilteredPayload]): | |
ts = message.value.timestamp | ||
partition = message.value.partition.index | ||
|
||
if wrapper["message_type"] != "clock_pulse": | ||
update_check_in_volume(ts) | ||
|
||
try: | ||
try_monitor_clock_tick(ts, partition) | ||
except Exception: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we keep keys every minute, this will be 43k keys. Could be a good reason to have our own cluster.