Commit 3122cb6
Refactor rules and commit with features, collect runtime stats (#233, #242)

* collect runtime stats (by Gergő Balogh, aka geryxyz) #233
* refactor: drop CommitWithFeatures #242
copernico authored Sep 15, 2021
1 parent a5916f2 commit 3122cb6
Showing 39 changed files with 1,366 additions and 546 deletions.
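
The runtime-statistics work from #233 follows a single pattern across the diffs below: code sections are wrapped in ExecutionTimer context managers that record into named sub-collections of a module-level execution_statistics object, which is later rendered as a console tree or as an HTML list. A minimal sketch of that pattern, using only names that appear in this commit (the timed body and the surrounding function are placeholders, not code from the repository):

from simple_hierarchical_storage.execution import ExecutionTimer, execution_statistics

def timed_step():
    # Time this block and record the measurement under a named sub-collection.
    with ExecutionTimer(execution_statistics.sub_collection(name="core")):
        pass  # placeholder for the work being measured

timed_step()
# Render what was collected: a tree for console output, a <ul> fragment for HTML reports.
print(execution_statistics.generate_console_tree())
html_fragment = execution_statistics.as_html_ul()
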
1 change: 1 addition & 0 deletions .gitignore
@@ -50,3 +50,4 @@ prospector/prospector-report.html
prospector/test_report.html
prospector/.idea/*
similarities.csv
prospector/demo_ul.html
12 changes: 6 additions & 6 deletions prospector/client/cli/console_report.py
@@ -1,14 +1,14 @@
import log.util
from datamodel.advisory import AdvisoryRecord
from datamodel.commit_features import CommitWithFeatures
from datamodel.commit import Commit

_logger = log.util.init_local_logger()


def report_on_console(
results: "list[CommitWithFeatures]", advisory_record: AdvisoryRecord, verbose=False
results: "list[Commit]", advisory_record: AdvisoryRecord, verbose=False
):
def format_annotations(commit: CommitWithFeatures) -> str:
def format_annotations(commit: Commit) -> str:
out = ""
if verbose:
for tag in commit.annotations:
@@ -25,9 +25,9 @@ def format_annotations(commit: CommitWithFeatures) -> str:
for commit in results:
count += 1
print(
f"\n----------\n{commit.commit.repository}/commit/{commit.commit.commit_id}\n"
+ "\n".join(commit.commit.changed_files)
+ f"{commit.commit.message}\n{format_annotations(commit)}"
f"\n----------\n{commit.repository}/commit/{commit.commit_id}\n"
+ "\n".join(commit.changed_files)
+ f"{commit.message}\n{format_annotations(commit)}"
)

print(f"Found {count} candidates\nAdvisory record\n{advisory_record}")
15 changes: 10 additions & 5 deletions prospector/client/cli/html_report.py
@@ -5,20 +5,22 @@

import log.util
from datamodel.advisory import AdvisoryRecord
from datamodel.commit_features import CommitWithFeatures
from datamodel.commit import Commit
from simple_hierarchical_storage.execution import execution_statistics

_logger = log.util.init_local_logger()


def report_as_html(
results: List[CommitWithFeatures],
results: List[Commit],
advisory_record: AdvisoryRecord,
filename: str = "prospector-report.html",
statistics=None,
):
annotations_count = {}
commit_with_feature: CommitWithFeatures
for commit_with_feature in results:
for annotation in commit_with_feature.annotations.keys():
annotated_commit: Commit
for annotated_commit in results:
for annotation in annotated_commit.annotations.keys():
annotations_count[annotation] = annotations_count.get(annotation, 0) + 1

_logger.info("Writing results to " + filename)
@@ -32,6 +34,9 @@ def report_as_html(
candidates=results,
present_annotations=annotations_count,
advisory_record=advisory_record,
execution_statistics=(
execution_statistics if statistics is None else statistics
).as_html_ul(),
):
html_file.write(content)
return filename
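
The statistics parameter above defaults to the module-level execution_statistics collection; passing an explicit collection replaces it in the rendered report. A hedged usage sketch, assuming an empty candidate list, assuming the remaining AdvisoryRecord fields have defaults, and reusing the sample_statistics helper imported by the test further down in this commit:

from client.cli.html_report import report_as_html
from datamodel.advisory import AdvisoryRecord
from util.sample_data_generation import sample_statistics

# Hypothetical inputs: no candidates and a made-up vulnerability id.
results = []
advisory_record = AdvisoryRecord(vulnerability_id="CVE-2021-0000")

# Default: embed the globally collected execution statistics in the report.
report_as_html(results, advisory_record)

# Override: embed an explicit statistics collection instead, as the test does.
report_as_html(
    results, advisory_record, filename="test_report.html", statistics=sample_statistics()
)
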
46 changes: 20 additions & 26 deletions prospector/client/cli/html_report_test.py
@@ -5,8 +5,7 @@
from client.cli.html_report import report_as_html
from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.commit_features import CommitWithFeatures
from util.sample_data_generation import (
from util.sample_data_generation import ( # random_list_of_url,
random_bool,
random_commit_hash,
random_dict_of_strs,
@@ -16,39 +15,32 @@
random_list_of_jira_refs,
random_list_of_path,
random_list_of_strs,
random_list_of_url,
random_list_of_version,
random_url,
sample_statistics,
)


def test_report_generation():
candidates = []
for _ in range(100):
commit_with_feature = CommitWithFeatures(
commit=Commit(
commit_id=random_commit_hash(),
repository=random_url(4),
message=" ".join(random_list_of_strs(100)),
timestamp=randint(0, 100000),
hunks=random_list_of_hunks(1000, 42),
diff=random_list_of_strs(200),
changed_files=random_list_of_path(4, 42),
message_reference_content=random_list_of_strs(42),
jira_refs=random_list_of_jira_refs(42),
ghissue_refs=random_list_of_github_issue_ids(100000, 42),
cve_refs=random_list_of_cve(42),
tags=random_list_of_strs(42),
),
references_vuln_id=random_bool(),
time_between_commit_and_advisory_record=randint(0, 42),
changes_relevant_path=set(random_list_of_path(4, 42)),
other_CVE_in_message=set(random_list_of_cve(42)),
referred_to_by_pages_linked_from_advisories=set(random_list_of_url(4, 42)),
referred_to_by_nvd=set(random_list_of_url(4, 42)),
annotated_candidates = Commit(
commit_id=random_commit_hash(),
repository=random_url(4),
message=" ".join(random_list_of_strs(100)),
timestamp=randint(0, 100000),
hunks=random_list_of_hunks(1000, 42),
diff=random_list_of_strs(200),
changed_files=random_list_of_path(4, 42),
message_reference_content=random_list_of_strs(42),
jira_refs=random_list_of_jira_refs(42),
ghissue_refs=random_list_of_github_issue_ids(100000, 42),
cve_refs=random_list_of_cve(42),
tags=random_list_of_strs(42),
annotations=random_dict_of_strs(16, 10),
)
candidates.append(commit_with_feature)

candidates.append(annotated_candidates)

advisory = AdvisoryRecord(
vulnerability_id=random_list_of_cve(max_count=1, min_count=1)[0],
@@ -71,5 +63,7 @@ def test_report_generation():
filename = "test_report.html"
if os.path.isfile(filename):
os.remove(filename)
generated_report = report_as_html(candidates, advisory, filename)
generated_report = report_as_html(
candidates, advisory, filename, statistics=sample_statistics()
)
assert os.path.isfile(generated_report)
6 changes: 2 additions & 4 deletions prospector/client/cli/json_report.py
@@ -2,14 +2,12 @@

import log.util
from datamodel.advisory import AdvisoryRecord
from datamodel.commit_features import CommitWithFeatures
from datamodel.commit import Commit

_logger = log.util.init_local_logger()


def report_as_json(
results: "list[CommitWithFeatures]", advisory_record: AdvisoryRecord
):
def report_as_json(results: "list[Commit]", advisory_record: AdvisoryRecord):

data = {
"advisory_record": advisory_record.dict(),
197 changes: 101 additions & 96 deletions prospector/client/cli/main.py
@@ -22,6 +22,7 @@
prospector,
)
from git.git import GIT_CACHE
from simple_hierarchical_storage.execution import ExecutionTimer, execution_statistics

_logger = log.util.init_local_logger()

@@ -166,108 +167,112 @@ def ping_backend(server_url: str, verbose: bool = False) -> bool:


def main(argv): # noqa: C901
args = parseArguments(argv)
configuration = getConfiguration(args.conf)
with ExecutionTimer(execution_statistics.sub_collection(name="initialization")):
args = parseArguments(argv)
configuration = getConfiguration(args.conf)

if args.log_level:
log.config.level = getattr(logging, args.log_level)
if args.log_level:
log.config.level = getattr(logging, args.log_level)

_logger.info(f"global log level is set to {logging.getLevelName(log.config.level)}")
_logger.info(
f"global log level is set to {logging.getLevelName(log.config.level)}"
)

if args.vulnerability_id is None:
_logger.error("No vulnerability id was specified. Cannot proceed.")
return False

if configuration is None:
_logger.error("Invalid configuration, exiting.")
return False

report = configuration["global"].getboolean("report")
if args.report:
report = args.report

if configuration["global"].get("nvd_rest_endpoint"):
nvd_rest_endpoint = configuration["global"].get("nvd_rest_endpoint")

backend = configuration["global"].get("backend") or DEFAULT_BACKEND
if args.backend:
backend = args.backend

if args.ping:
return ping_backend(backend, log.config.level < logging.INFO)

vulnerability_id = args.vulnerability_id
repository_url = args.repository

vuln_descr = args.descr
use_nvd = args.use_nvd
tag_interval = args.tag_interval
version_interval = args.version_interval
time_limit_before = TIME_LIMIT_BEFORE
time_limit_after = TIME_LIMIT_AFTER
max_candidates = args.max_candidates
modified_files = args.modified_files.split(",")
<<<<<<< HEAD
=======

>>>>>>> 30aa6ce (bug: handling of user-supplied special tokens)
code_tokens = (
args.diff_contains.split(",") if args.diff_contains is not None else []
)
if args.vulnerability_id is None:
_logger.error("No vulnerability id was specified. Cannot proceed.")
return False

print(code_tokens)

publication_date = ""
if args.pub_date != "":
publication_date = args.pub_date + "T00:00Z"
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)

git_cache = GIT_CACHE
if os.environ["GIT_CACHE"]:
git_cache = os.environ["GIT_CACHE"]
if configuration["global"].get("git_cache"):
git_cache = configuration["global"].get("git_cache")

_logger.debug("Using the following configuration:")
_logger.pretty_log(
{section: dict(configuration[section]) for section in configuration.sections()}
)
if configuration is None:
_logger.error("Invalid configuration, exiting.")
return False

_logger.debug("Vulnerability ID: " + vulnerability_id)
_logger.debug("time-limit before: " + str(time_limit_before))
_logger.debug("time-limit after: " + str(time_limit_after))

results, advisory_record = prospector(
vulnerability_id=vulnerability_id,
repository_url=repository_url,
publication_date=publication_date,
vuln_descr=vuln_descr,
tag_interval=tag_interval,
version_interval=version_interval,
modified_files=modified_files,
code_tokens=code_tokens,
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=use_nvd,
nvd_rest_endpoint=nvd_rest_endpoint,
backend_address=backend,
git_cache=git_cache,
limit_candidates=max_candidates,
active_rules=["ALL"],
)
report = configuration["global"].getboolean("report")
if args.report:
report = args.report

if configuration["global"].get("nvd_rest_endpoint"):
nvd_rest_endpoint = configuration["global"].get("nvd_rest_endpoint")

backend = configuration["global"].get("backend") or DEFAULT_BACKEND
if args.backend:
backend = args.backend

if args.ping:
return ping_backend(backend, log.config.level < logging.INFO)

vulnerability_id = args.vulnerability_id
repository_url = args.repository

vuln_descr = args.descr
use_nvd = args.use_nvd
tag_interval = args.tag_interval
version_interval = args.version_interval
time_limit_before = TIME_LIMIT_BEFORE
time_limit_after = TIME_LIMIT_AFTER
max_candidates = args.max_candidates
modified_files = args.modified_files.split(",")
code_tokens = (
args.diff_contains.split(",") if args.diff_contains is not None else []
)

publication_date = ""
if args.pub_date != "":
publication_date = args.pub_date + "T00:00Z"
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)

git_cache = GIT_CACHE
if os.environ["GIT_CACHE"]:
git_cache = os.environ["GIT_CACHE"]
if configuration["global"].get("git_cache"):
git_cache = configuration["global"].get("git_cache")

_logger.debug("Using the following configuration:")
_logger.pretty_log(
{
section: dict(configuration[section])
for section in configuration.sections()
}
)

_logger.debug("Vulnerability ID: " + vulnerability_id)
_logger.debug("time-limit before: " + str(time_limit_before))
_logger.debug("time-limit after: " + str(time_limit_after))

with ExecutionTimer(execution_statistics.sub_collection(name="core")):
results, advisory_record = prospector(
vulnerability_id=vulnerability_id,
repository_url=repository_url,
publication_date=publication_date,
vuln_descr=vuln_descr,
tag_interval=tag_interval,
version_interval=version_interval,
modified_files=modified_files,
code_tokens=code_tokens,
time_limit_before=time_limit_before,
time_limit_after=time_limit_after,
use_nvd=use_nvd,
nvd_rest_endpoint=nvd_rest_endpoint,
backend_address=backend,
git_cache=git_cache,
limit_candidates=max_candidates,
active_rules=["ALL"],
)

with ExecutionTimer(execution_statistics.sub_collection(name="reporting")):
if report == "console":
report_on_console(results, advisory_record, log.config.level < logging.INFO)
elif report == "json":
report_as_json(results, advisory_record)
elif report == "html":
report_as_html(results, advisory_record)
else:
_logger.warning("Invalid report type specified, using 'console'")
report_on_console(results, advisory_record, log.config.level < logging.INFO)

if report == "console":
report_on_console(results, advisory_record, log.config.level < logging.INFO)
elif report == "json":
report_as_json(results, advisory_record)
elif report == "html":
report_as_html(results, advisory_record)
else:
_logger.warning("Invalid report type specified, using 'console'")
report_on_console(results, advisory_record, log.config.level < logging.INFO)
_logger.info("\n" + execution_statistics.generate_console_tree())
return True


(Diffs for the remaining 33 changed files are not shown.)