diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index 324cf0f54e4..73fe9edfd4d 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,6 +31,8 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): + '''Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.''' testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) if not testcase_upload_metadata: logs.warning(f'No upload metadata found for testcase {testcase_id},' @@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): 'analyze_launched', 'analyze_completed', 'minimize_completed', 'regression_completed', 'impact_completed', 'issue_updated' ] - elapsed_time_since_upload = datetime.datetime.utcnow() - elapsed_time_since_upload -= testcase_upload_metadata.timestamp - elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600 testcase = data_handler.get_testcase_by_id(testcase_id) @@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') return + from_fuzzer = not get_testcase_upload_metadata(testcase_id) + + assert step in [ + 'analyze_launched', 'analyze_completed', 'minimize_completed', + 'regression_completed', 'impact_completed', 'issue_updated' + ] + + if not testcase.get_age_in_seconds(): + logs.warning(f'No timestamp associated to testcase {testcase_id},' + ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + return + + testcase_age_in_hours = testcase.get_age_in_seconds() / 3600 + logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase ' - f'{testcase_id} (age = {elapsed_time_since_upload}) ' + f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' 'in step {step}.') monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( - elapsed_time_since_upload, + testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, + 'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded' }) diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index d5759be5539..7a82807c564 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler): return filed +def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool): + if testcase.stuck_in_triage == state: + return + testcase.stuck_in_triage = state + testcase.put() + + untriaged_testcases = {} @@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status): def _emit_untriaged_testcase_count_metric(): for (job, status) in untriaged_testcases: monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set( - untriaged_testcases[(job, status)], labels={ + untriaged_testcases[(job, status)], + labels={ 'job': job, 'status': status, }) @@ -383,14 +391,21 @@ def main(): # Skip if testcase's job is removed. if testcase.job_type not in all_jobs: + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since its job was removed ' + f' ({testcase.job_type})') continue # Skip if testcase's job is in exclusions list. if testcase.job_type in excluded_jobs: + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since its job is in the' + f' exclusion list ({testcase.job_type})') continue # Skip if we are running progression task at this time. if testcase.get_metadata('progression_pending'): + _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') _emit_untriaged_testcase_age_metric(testcase) _increment_untriaged_testcase_count(testcase.job_type, @@ -399,17 +414,24 @@ def main(): # If the testcase has a bug filed already, no triage is needed. if _is_bug_filed(testcase): + _set_testcase_stuck_state(testcase, False) + logs.info( + f'Skipping testcase {testcase_id}, since a bug was already filed.') continue # Check if the crash is important, i.e. it is either a reproducible crash # or an unreproducible crash happening frequently. if not _is_crash_important(testcase): + _set_testcase_stuck_state(testcase, False) + logs.info( + f'Skipping testcase {testcase_id}, since the crash is not important.') continue # Require that all tasks like minimizaton, regression testing, etc have # finished. if not critical_tasks_completed: _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_CRITICAL_TASKS) logs.info( @@ -429,6 +451,7 @@ def main(): if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue @@ -436,6 +459,7 @@ def main(): if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') continue @@ -450,6 +474,9 @@ def main(): # If there are similar issues to this test case already filed or recently # closed, skip filing a duplicate bug. if _check_and_update_similar_bug(testcase, issue_tracker): + _set_testcase_stuck_state(testcase, False) + logs.info(f'Skipping testcase {testcase_id}, since a similar bug' + ' was already filed.') continue # Clean up old triage messages that would be not applicable now. @@ -457,6 +484,7 @@ def main(): # A testcase is untriaged, until immediately before a bug is opened _emit_untriaged_testcase_age_metric(testcase) + _set_testcase_stuck_state(testcase, False) _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) # File the bug first and then create filed bug metadata. diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index cfd16f93fe6..bd4c3e2e44d 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -580,6 +580,9 @@ class Testcase(Model): # corpus. trusted = ndb.BooleanProperty(default=False) + # Tracks if a testcase is stuck during triage. + stuck_in_triage = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') @@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty: def get_age_in_seconds(self): current_time = datetime.datetime.utcnow() + if not self.get_created_time(): + return None testcase_age = current_time - self.get_created_time() return testcase_age.total_seconds() diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 97449677330..c30787836a1 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -223,12 +223,14 @@ TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( 'uploaded_testcase_analysis/triage_duration_secs', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle, ' - 'in hours.'), + ' of relevant tasks in the testcase upload lifecycle.' + ' Origin can be either from a fuzzer, or a manual' + ' upload. Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), + monitor.StringField('origin'), ], )