Skip to content

Commit

Permalink
[Monitoring] Fix untriaged testcase age oversampling, add untriaged t…
Browse files Browse the repository at this point in the history
…estcase count (#4494)

### Motivation

#4364 implemented a metric to track the percentile distributions of
untriaged testcase age. It was overcounting testcases:
* Testcases for which a bug was already filed
* Testcases for which the crash was unimportant

This PR solves this issue, and adds the UNTRIAGED_TESTCASE_COUNT metric,
drilled down by job and platform, so we can also know how many testcases
are stuck, and not only their age distribution.
  • Loading branch information
vitorguidi authored and jonathanmetzman committed Jan 8, 2025
1 parent 8c575e4 commit 94763ad
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 7 deletions.
36 changes: 29 additions & 7 deletions src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,22 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
testcase: data_types.Testcase):
untriaged_testcase_count = {}


def _increment_untriaged_testcase_count(testcase: data_types.Testcase):
  """Adds one to the running tally for the testcase's (job, platform) pair.

  The tally lives in the module-level `untriaged_testcase_count` dict, keyed
  by the tuple `(testcase.job_type, testcase.platform)`.
  """
  key = (testcase.job_type, testcase.platform)
  # A pair seen for the first time starts from zero.
  untriaged_testcase_count[key] = untriaged_testcase_count.get(key, 0) + 1


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
"""Emits a metric to track age of untriaged testcases."""
if critical_tasks_completed:
return
if not testcase.timestamp:
return

_increment_untriaged_testcase_count(testcase)
logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
f'(age = {testcase.get_age_in_seconds()})')
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
Expand All @@ -327,6 +335,16 @@ def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
})


def _emit_untriaged_testcase_count_metric():
  """Sets the UNTRIAGED_TESTCASE_COUNT gauge for every tallied pair.

  One gauge value is set per `(job, platform)` key present in the
  module-level `untriaged_testcase_count` dict, labelled with that job and
  platform.
  """
  for (job, platform), count in untriaged_testcase_count.items():
    labels = {
        'job': job,
        'platform': platform,
    }
    monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(count, labels=labels)


def main():
"""Files bugs."""
try:
Expand Down Expand Up @@ -373,12 +391,10 @@ def main():
f' exclusion list ({testcase.job_type})')
continue

# Emit the metric for testcases that should be triaged.
_emit_untriaged_testcase_age_metric(critical_tasks_completed, testcase)

# Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'):
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
continue

# If the testcase has a bug filed already, no triage is needed.
Expand All @@ -397,6 +413,7 @@ def main():
# Require that all tasks like minimization, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
logs.info(
f'Skipping testcase {testcase_id}, critical tasks still pending.')
continue
Expand All @@ -413,11 +430,13 @@ def main():
# metadata works well.
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

Expand Down Expand Up @@ -447,6 +466,7 @@ def main():
# File the bug first and then create filed bug metadata.
if not _file_issue(testcase, issue_tracker, throttler):
logs.info(f'Issue filing failed for testcase id {testcase_id}')
_emit_untriaged_testcase_age_metric(testcase)
continue

_create_filed_bug_metadata(testcase)
Expand All @@ -455,6 +475,8 @@ def main():
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
testcase_id))

_emit_untriaged_testcase_count_metric()

logs.info('Triage testcases succeeded.')
return True

Expand Down
10 changes: 10 additions & 0 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,16 @@
monitor.StringField('platform'),
])

# Gauge holding a number of testcases (not a duration), labelled by job and
# platform.
UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(
    'issues/untriaged_testcase_count',
    description='Number of testcases that were not yet triaged '
    '(have not yet completed analyze, regression,'
    ' minimization, impact task).',
    field_spec=[
        monitor.StringField('job'),
        monitor.StringField('platform'),
    ])

ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(
'task/analyze/reproducibility',
description='Outcome count for analyze task.',
Expand Down

0 comments on commit 94763ad

Please sign in to comment.