WIP, DEBUG: Python derived metrics
* this is a Python/pandas-only version of doing
some simple derived metrics; I don't think we'll
actually do this, but I was exploring a bit because
of the difficulties in darshan-hpcgh-839

* this matches pretty well with the `perl` based reports
for total bytes, but even simple cases can sometimes
disagree on bandwidth per darshan-hpcgh-847, so now I'm curious
what is going on

* one problem with doing this is that we'd have the same
algorithms implemented in two different languages; the advantages
include:
- not reading all the records in a second time, crossing the CFFI
boundary for each record
- easier to debug/maintain because of bounds checking, no segfaults, etc.
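
The pandas-side arithmetic this commit uses can be sketched with toy data. The column names follow the `<MODULE>_BYTES_*` / `<MODULE>_F_*_TIME` convention from the patch, but the frames and values here are hypothetical stand-ins for real Darshan records:

```python
import pandas as pd

# hypothetical STDIO-style counter records (two ranks/files)
counters_df = pd.DataFrame({
    "STDIO_BYTES_WRITTEN": [524288, 1048576],
    "STDIO_BYTES_READ": [262144, 0],
})
fcounters_df = pd.DataFrame({
    "STDIO_F_READ_TIME": [0.5, 0.0],
    "STDIO_F_WRITE_TIME": [1.0, 0.5],
})

# sum all byte counters across records, then convert to MiB
total_mebibytes = (counters_df["STDIO_BYTES_WRITTEN"].sum()
                   + counters_df["STDIO_BYTES_READ"].sum()) / 2 ** 20
# total time spent in read + write across records
total_rw_time = (fcounters_df["STDIO_F_READ_TIME"].sum()
                 + fcounters_df["STDIO_F_WRITE_TIME"].sum())
print(f"{total_mebibytes:.2f} MiB at {total_mebibytes / total_rw_time:.2f} MiB/s")
```

Note that summing the per-record `F_READ_TIME`/`F_WRITE_TIME` values treats overlapping I/O intervals as sequential, which may be one source of the bandwidth disagreement with the perl reports noted in darshan-hpcgh-847.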
tylerjereddy committed Nov 9, 2022
1 parent 9731f44 commit 67dd29e
Showing 2 changed files with 44 additions and 0 deletions.
22 changes: 22 additions & 0 deletions darshan-util/pydarshan/darshan/derived_metrics.py
@@ -0,0 +1,22 @@
def perf_estimate(report, mod_name: str):
    data = report.data["records"][mod_name].to_df()
    counters_df = data["counters"]
    fcounters_df = data["fcounters"]
    # the old perl reports used MiB so doing
    # that here for consistency, though I note
    # that it might be more natural to use a library
    # like humanize to automatically select e.g., GiB
    # depending on magnitude
    mod_name_adjusted = mod_name.replace("-", "")
    total_mebibytes = (counters_df[f"{mod_name_adjusted}_BYTES_WRITTEN"].sum()
                       + counters_df[f"{mod_name_adjusted}_BYTES_READ"].sum()) / (2 ** 20)
    total_rw_time = (fcounters_df[f"{mod_name_adjusted}_F_READ_TIME"].sum() +
                     fcounters_df[f"{mod_name_adjusted}_F_WRITE_TIME"].sum())
    mebibytes_per_sec = total_mebibytes / total_rw_time
    # construct a string similar to the one used in perl reports,
    # matching in precision of reported values
    # TODO: resolve discrepancy noted in gh-847 vs. perl
    # reports on the bandwidth calculation (even for single record logs!)
    io_perf_string = (f"I/O performance estimate (at the {mod_name} layer): "
                      f"transferred {total_mebibytes:.1f} MiB at {mebibytes_per_sec:.2f} MiB/s")
    return io_perf_string
22 changes: 22 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_derived_metrics.py
@@ -0,0 +1,22 @@
import darshan
from darshan.log_utils import get_log_path
from darshan import derived_metrics

import pytest


@pytest.mark.parametrize("log_name, module, expected", [
    # expected strings are copy-pasted from the old
    # perl reports
    ("imbalanced-io.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
    ])
def test_perf_estimate(log_name, module, expected):
    log_path = get_log_path(log_name)
    report = darshan.DarshanReport(log_path, read_all=True)
    actual = derived_metrics.perf_estimate(report=report, mod_name=module)
    assert actual == expected
