Add verbose flag to CLI and Python API #32

Merged · 11 commits · Mar 24, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -8,7 +8,7 @@ on:
branches: [main]
paths: ['**/*.py', '.github/workflows/test.yml']
release:
-types: [published]
+types: [published, edited]

jobs:
tests:
9 changes: 5 additions & 4 deletions .pre-commit-config.yaml
@@ -7,7 +7,7 @@ default_install_hook_types: [pre-commit, commit-msg]

repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
-rev: v0.0.248
+rev: v0.0.259
hooks:
- id: ruff
args: [--fix]
@@ -18,7 +18,7 @@ repos:
- id: black-jupyter

- repo: https://github.com/pre-commit/mirrors-mypy
-rev: v1.0.1
+rev: v1.1.1
hooks:
- id: mypy

@@ -35,16 +35,17 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/codespell-project/codespell
-rev: v2.2.2
+rev: v2.2.4
hooks:
- id: codespell
stages: [commit, commit-msg]
exclude_types: [jupyter, bib]

- repo: https://github.com/nbQA-dev/nbQA
-rev: 1.6.3
+rev: 1.6.4
hooks:
- id: nbqa-ruff
args: [--fix]

- repo: https://github.com/kynan/nbstripout
rev: 0.6.1
83 changes: 52 additions & 31 deletions examples/basic_python_api_example.ipynb
@@ -30,7 +30,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3 runs: ['../tests/runs/strict/run_2', '../tests/runs/strict/run_3', '../tests/runs/strict/run_1']\n"
"Found 3 runs:\n",
"../tests/runs/strict/run_1\n",
"../tests/runs/strict/run_2\n",
"../tests/runs/strict/run_3\n"
]
}
],
@@ -47,26 +50,26 @@
"execution_count": null,
"metadata": {},
"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Loading runs: 100%|██████████| 3/3 [00:00<00:00, 305.84it/s]\n",
+"Reading tags: 100%|██████████| 3/3 [00:00<00:00, 1142.86it/s]"
+]
+},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 3 TensorBoard runs with 1 scalar(s) and 100 steps each:\n",
"{'strict/foo': value value value\n",
"step \n",
"0 2.970375 3.521034 1.496257\n",
"5 2.836909 3.822311 1.768222\n",
"10 2.281987 3.122040 1.088477\n",
"15 2.374158 3.156744 1.132030\n",
"20 2.023701 3.209669 1.307423\n",
"... ... ... ...\n",
"475 2.224758 3.191785 1.175652\n",
"480 2.208647 3.845772 1.749658\n",
"485 2.670872 3.127788 1.604651\n",
"490 2.202044 3.704833 1.109958\n",
"495 2.489091 3.331873 1.212090\n",
"\n",
"[100 rows x 3 columns]}\n"
"Loaded 3 TensorBoard runs with 1 scalars and 100 steps each\n"
]
},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
+}
],
@@ -76,15 +79,7 @@
"overwrite = True\n",
"reduce_ops = (\"mean\", \"min\", \"max\")\n",
"\n",
"events_dict = tbr.load_tb_events(input_event_dirs)\n",
"\n",
"n_scalars = len(events_dict)\n",
"n_steps, n_events = list(events_dict.values())[0].shape\n",
"\n",
"print(\n",
" f\"Loaded {n_events} TensorBoard runs with {n_scalars} scalar(s) \"\n",
" f\"and {n_steps} steps each:\\n{events_dict}\"\n",
")"
"events_dict = tbr.load_tb_events(input_event_dirs, verbose=True)"
]
},
{
@@ -96,25 +91,51 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Reduced 1 scalars with 3 operations: (mean, min, max)\n",
"Writing 'mean' reduction to 'tmp/reduced-mean'\n",
"Writing 'min' reduction to 'tmp/reduced-min'\n",
"Writing 'max' reduction to 'tmp/reduced-max'\n",
"Writing 'max' reduction to 'tmp/reduced-max'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Writing max reduction to disk: 100%|██████████| 3/3 [00:00<00:00, 180.74it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created new TensorBoard event files in\n",
"- tmp/reduced-mean\n",
"- tmp/reduced-min\n",
"- tmp/reduced-max\n",
"Writing results to 'tmp/tb-reduction.csv'\n",
"Created new data file at 'tmp/tb-reduction.csv'\n",
"✓ Reduction complete\n"
]
},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
+}
],
"source": [
"reduced_events = tbr.reduce_events(events_dict, reduce_ops)\n",
"reduced_events = tbr.reduce_events(events_dict, reduce_ops, verbose=True)\n",
"\n",
"for op in reduce_ops:\n",
" print(f\"Writing '{op}' reduction to '{events_out_dir}-{op}'\")\n",
"\n",
"tbr.write_tb_events(reduced_events, events_out_dir, overwrite)\n",
"tbr.write_tb_events(reduced_events, events_out_dir, overwrite, verbose=True)\n",
"\n",
"print(f\"Writing results to '{csv_out_path}'\")\n",
"\n",
"tbr.write_data_file(reduced_events, csv_out_path, overwrite)\n",
"tbr.write_data_file(reduced_events, csv_out_path, overwrite, verbose=True)\n",
"\n",
"print(\"✓ Reduction complete\")"
]
@@ -146,7 +167,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
"version": "3.10.9"
},
"vscode": {
"interpreter": {
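Taken together, the cells above exercise the new verbose API end to end. A minimal standalone sketch using the notebook's own paths — the glob call is an assumption about how `input_event_dirs` is built, while the four `tbr` calls and their `verbose=True` flags are taken verbatim from the diff:

from glob import glob

import tensorboard_reducer as tbr

input_event_dirs = sorted(glob("../tests/runs/strict/run_*"))  # the 3 example runs

# prints tqdm progress bars ("Loading runs", "Reading tags") plus a summary line
events_dict = tbr.load_tb_events(input_event_dirs, verbose=True)

# reports e.g. "Reduced 1 scalars with 3 operations: (mean, min, max)"
reduced_events = tbr.reduce_events(events_dict, ("mean", "min", "max"), verbose=True)

# one new run directory per op: tmp/reduced-mean, tmp/reduced-min, tmp/reduced-max
tbr.write_tb_events(reduced_events, "tmp/reduced", True, verbose=True)  # True = overwrite

# a single CSV with a two-level header (tag, then reduce op)
tbr.write_data_file(reduced_events, "tmp/tb-reduction.csv", True, verbose=True)  # True = overwrite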
196 changes: 87 additions & 109 deletions examples/functorch_mlp_ensemble.ipynb

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions examples/wandb_integration.ipynb
@@ -1397,7 +1397,9 @@
"source": [
"import tensorboard_reducer as tbr\n",
"\n",
"reduced_runs = tbr.reduce_events(run_data, reduce_ops=(\"mean\", \"min\", \"max\"))"
"reduced_runs = tbr.reduce_events(\n",
" run_data, reduce_ops=(\"mean\", \"min\", \"max\"), verbose=True\n",
")"
]
},
{
@@ -1459,8 +1461,8 @@
],
"source": [
"# example of how you might want to persist this data to disk\n",
"tbr.write_data_file(reduced_runs, \"wandb-cnn-ensemble-runs.csv\")\n",
"tbr.write_tb_events(reduced_runs, \"wandb-cnn-ensemble-runs/\")"
"tbr.write_data_file(reduced_runs, \"wandb-cnn-ensemble-runs.csv\", verbose=True)\n",
"tbr.write_tb_events(reduced_runs, \"wandb-cnn-ensemble-runs/\", verbose=True)"
]
},
{
@@ -1511,7 +1513,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.0"
},
"vscode": {
"interpreter": {
20 changes: 13 additions & 7 deletions pyproject.toml
@@ -26,21 +26,26 @@ classifiers = [
"License :: OSI Approved :: MIT License",
]
requires-python = ">=3.8"
dependencies = ["tensorboard >= 2.0", "numpy >= 1.19", "pandas >= 1.0.0"]
dependencies = [
"tensorboard >= 2.0",
"numpy >= 1.19",
"pandas >= 1.0.0",
"tqdm >= 4.0",
]

[project.urls]
Homepage = "https://github.com/janosh/tensorboard-reducer"
Package = "https://pypi.org/project/tensorboard-reducer"

[project.optional-dependencies]
test = ["pytest", "pytest-cov", "torch >= 1.6"]
excel = ["openpyxl", "xlwt", "xlrd"]
excel = ["openpyxl"]

[project.scripts]
tb-reducer = "tensorboard_reducer:main"

[tool.setuptools.packages]
-find = { include = ["tensorboard_reducer"] }
+find = { include = ["tensorboard_reducer*"], exclude = ["tests*"] }

[tool.distutils.bdist_wheel]
universal = true
@@ -77,10 +82,11 @@ select = [
"YTT", # flake8-2020
]
ignore = [
"B904", # Within an except clause, raise exceptions with ...
"D100", # Missing docstring in public module
"D104", # Missing docstring in public package
"D205", # 1 blank line required between summary line and description
"B904", # Within an except clause, raise exceptions with ...
"D100", # Missing docstring in public module
"D104", # Missing docstring in public package
"D205", # 1 blank line required between summary line and description
"PLW2901", # for loop variable overwritten by assignment target
]
pydocstyle.convention = "google"

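The `tqdm >= 4.0` dependency added above is what draws the progress bars visible in the notebook output ("Loading runs: 100%|…"). A rough sketch of the pattern it enables, assuming the package gates its bars on the new verbose flag — the helper below is illustrative, not the package's actual code; tqdm's `desc` and `disable` keyword arguments are real:

from tqdm import tqdm

def load_runs(run_dirs, verbose=False):
    """Hypothetical helper: iterate over runs with a progress bar gated by verbose."""
    loaded = []
    for run_dir in tqdm(run_dirs, desc="Loading runs", disable=not verbose):
        loaded.append(run_dir)  # stand-in for parsing the event files in run_dir
    return loaded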
4 changes: 2 additions & 2 deletions readme.md
@@ -53,12 +53,12 @@

In addition, `tb-reducer` has the following flags:

-- **`-o/--outpath`** (required): File path or directory where to write output to disk. If `--outpath` is a directory, output will be saved as TensorBoard runs, one new directory created for each reduction suffixed by the `numpy` operation, e.g. `'out/path-mean'`, `'out/path-max'`, etc. If `--outpath` is a file path, it must have `'.csv'`/`'.json'` or `'.xls(x)'` (supports compression by using e.g. `.csv.gz`, `json.bz2`) in which case a single file will be created. CSVs will have a two-level header containing one column for each combination of tag (`loss`, `accuracy`, ...) and reduce operation (`mean`, `std`, ...). Tag names will be in top-level header, reduce ops in second level. **Hint**: When saving data as CSV or Excel, use `pandas.read_csv("path/to/file.csv", header=[0, 1], index_col=0)` and `pandas.read_excel("path/to/file.xlsx", header=[0, 1], index_col=0)` to load reduction results into a multi-index dataframe.
+- **`-o/--outpath`** (required): File path or directory where to write output to disk. If `--outpath` is a directory, output will be saved as TensorBoard runs, one new directory created for each reduction suffixed by the `numpy` operation, e.g. `'out/path-mean'`, `'out/path-max'`, etc. If `--outpath` is a file path, it must have `'.csv'`/`'.json'` or `'.xlsx'` (supports compression by using e.g. `.csv.gz`, `json.bz2`) in which case a single file will be created. CSVs will have a two-level header containing one column for each combination of tag (`loss`, `accuracy`, ...) and reduce operation (`mean`, `std`, ...). Tag names will be in top-level header, reduce ops in second level. **Hint**: When saving data as CSV or Excel, use `pandas.read_csv("path/to/file.csv", header=[0, 1], index_col=0)` and `pandas.read_excel("path/to/file.xlsx", header=[0, 1], index_col=0)` to load reduction results into a multi-index dataframe (see the first sketch after this list).
- **`-r/--reduce-ops`** (optional, default: `mean`): Comma-separated names of numpy reduction ops (`mean`, `std`, `min`, `max`, ...). Each reduction is written to a separate `outpath` suffixed by its op name. E.g. if `outpath='reduced-run'`, the mean reduction will be written to `'reduced-run-mean'`.
- **`-f/--overwrite`** (optional, default: `False`): Whether to overwrite existing output directories/data files (CSV, JSON, Excel). For safety, the overwrite operation will abort with an error if the file/directory to overwrite is not a known data file and does not look like a TensorBoard run directory (i.e. does not start with `'events.out'`).
- **`--lax-tags`** (optional, default: `False`): Allow different runs to have different sets of tags. In this mode, each tag reduction will run over as many runs as are available for a given tag, even if that's just one. Proceed with caution as not all tags will have the same statistics in downstream analysis.
- **`--lax-steps`** (optional, default: `False`): Allow tags across different runs to have unequal numbers of steps. In this mode, each reduction will only use as many steps as are available in the shortest run (same behavior as `zip(short_list, long_list)` which stops when `short_list` is exhausted).
-- **`--handle-dup-steps`** (optional, default: `None`): How to handle duplicate values recorded for the same tag and step in a single run. One of `'keep-first'`, `'keep-last'`, `'mean'`. `'keep-first/last'` will keep the first/last occurrence of duplicate steps while 'mean' computes their mean. Default behavior is to raise `AssertionError` on duplicate steps.
+- **`--handle-dup-steps`** (optional, default: `None`): How to handle duplicate values recorded for the same tag and step in a single run. One of `'keep-first'`, `'keep-last'`, `'mean'`. `'keep-first/last'` will keep the first/last occurrence of duplicate steps while 'mean' computes their mean. Default behavior is to raise `ValueError` on duplicate steps (see the second sketch after this list).
- **`--min-runs-per-step`** (optional, default: `None`): Minimum number of runs across which a given step must be recorded to be kept. Steps present in fewer runs are dropped. Only plays a role if `lax_steps` is true. **Warning**: Be aware that with this setting, you'll be reducing a variable number of runs: however many recorded a value for a given step, as long as there are at least `--min-runs-per-step`. In other words, the statistics of a reduction can change mid-run. Say you're plotting the mean of an error curve; the sample size of that mean will drop from, say, 10 down to 4 mid-plot if 4 of your models trained for longer than the rest. Keep this in mind when using this flag.
- **`-v/--version`** (optional): Get the current version.
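To make the `--outpath` hint above concrete: after an invocation like `tb-reducer runs/run* -o reduced-run.csv -r mean,min,max -f` (flags as documented in the list), the CSV can be loaded back into a multi-index dataframe. File and tag names here are illustrative:

import pandas as pd

# two-level header: tags on top, reduce ops below
df = pd.read_csv("reduced-run.csv", header=[0, 1], index_col=0)

mean_loss = df[("loss", "mean")]  # select the mean reduction of the 'loss' tag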
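And for `--handle-dup-steps`, the three strategies correspond to standard pandas idioms. This sketch illustrates the semantics only, not the package's implementation; `df` stands for one run's values indexed by step:

import pandas as pd

df = pd.DataFrame({"accuracy": [0.80, 0.90, 0.95]}, index=[0, 0, 5])  # step 0 recorded twice

keep_first = df[~df.index.duplicated(keep="first")]  # 'keep-first'
keep_last = df[~df.index.duplicated(keep="last")]    # 'keep-last'
averaged = df.groupby(level=0).mean()                # 'mean'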
