vocalpy
diff --git a/‎doc/api/index.rst
+14 b/‎doc/api/index.rst
+14
diff --git a/‎doc/toml/gy6or6_eval.toml
+2-2 b/‎doc/toml/gy6or6_eval.toml
+2-2
diff --git a/‎doc/toml/gy6or6_predict.toml
+2-2 b/‎doc/toml/gy6or6_predict.toml
+2-2
diff --git a/‎doc/toml/gy6or6_train.toml
+2-2 b/‎doc/toml/gy6or6_train.toml
+2-2
diff --git a/‎src/vak/__init__.py
+2 b/‎src/vak/__init__.py
+2
diff --git a/‎src/vak/cli/eval.py
+1-1 b/‎src/vak/cli/eval.py
+1-1
diff --git a/‎src/vak/cli/learncurve.py
+1-1 b/‎src/vak/cli/learncurve.py
+1-1
diff --git a/‎src/vak/cli/predict.py
+4-4 b/‎src/vak/cli/predict.py
+4-4
diff --git a/‎src/vak/cli/train.py
+2-2 b/‎src/vak/cli/train.py
+2-2
diff --git a/‎src/vak/common/__init__.py
+1-1 b/‎src/vak/common/__init__.py
+1-1
diff --git a/‎src/vak/common/constants.py
+6-1 b/‎src/vak/common/constants.py
+6-1
diff --git a/‎src/vak/common/labels.py
+28-15 b/‎src/vak/common/labels.py
+28-15
diff --git a/‎src/vak/config/__init__.py
-1 b/‎src/vak/config/__init__.py
-1
diff --git a/‎src/vak/config/dataset.py
+4-2 b/‎src/vak/config/dataset.py
+4-2
diff --git a/‎src/vak/config/eval.py
+3-3 b/‎src/vak/config/eval.py
+3-3
diff --git a/‎src/vak/config/learncurve.py
+12-5 b/‎src/vak/config/learncurve.py
+12-5
diff --git a/‎src/vak/config/predict.py
+3-4 b/‎src/vak/config/predict.py
+3-4
@@ -154,6 +154,20 @@ The :mod:`vak.datasets` module contains datasets built into vak.
    datasets.frame_classification
    datasets.parametric_umap
 
+Datapipes
+---------
+
+The :mod:`vak.datapipes` module contains datapipes for loading datasets
+generated by :func:`vak.prep.prep`.
+
+.. autosummary::
+   :toctree: generated
+   :template: module.rst
+   :recursive:
+
+   datapipes.frame_classification
+   datapipes.parametric_umap
+
 Metrics
 -------
 The :mod:`vak.metrics` module contains metrics used
 
@@ -33,9 +33,9 @@ checkpoint_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/TweetyNet/che
 # labelmap_path: path to file that maps from outputs of model (integers) to text labels in annotations;
 # this is used when generating predictions
 labelmap_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/labelmap.json"
-# spect_scaler_path: path to file containing SpectScaler that was fit to training set
+# frames_standardizer_path: path to file containing the FramesStandardizer that was fit to the training set
 # We want to transform the data we predict on in the exact same way
-spect_scaler_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/StandardizeSpect"
+frames_standardizer_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/StandardizeSpect"
 # batch_size
 # for predictions with a frame classification model, this should always be 1
 # and will be ignored if it's not
 
@@ -29,9 +29,9 @@ checkpoint_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/TweetyNet/che
 # labelmap_path: path to file that maps from outputs of model (integers) to text labels in annotations;
 # this is used when generating predictions
 labelmap_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/labelmap.json"
-# spect_scaler_path: path to file containing SpectScaler that was fit to training set
+# frames_standardizer_path: path to file containing the FramesStandardizer that was fit to the training set
 # We want to transform the data we predict on in the exact same way
-spect_scaler_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/StandardizeSpect"
+frames_standardizer_path = "/PATH/TO/FOLDER/results/train/RESULTS_TIMESTAMP/StandardizeSpect"
 # batch_size
 # for predictions with a frame classification model, this should always be 1
 # and will be ignored if it's not
 
@@ -37,9 +37,9 @@ root_results_dir = "/PATH/TO/FOLDER/results/train"
 batch_size = 8
 # num_epochs: number of training epochs, where an epoch is one iteration through all samples in training split
 num_epochs = 2
-# normalize_spectrograms: if true, normalize spectrograms per frequency bin, so mean of each is 0.0 and std is 1.0
+# standardize_frames: if true, standardize (normalize) frames (input to neural network) per frequency bin, so mean of each is 0.0 and std is 1.0
 # across the entire training split
-normalize_spectrograms = true
+standardize_frames = true
 # val_step: step number on which to compute metrics with validation set, every time step % val_step == 0
 # (a step is one batch fed through the network)
 # saves a checkpoint if the monitored evaluation metric improves (which is model specific)
 
@@ -3,6 +3,7 @@
     cli,
     common,
     config,
+    datapipes,
     datasets,
     eval,
     learncurve,
@@ -42,6 +43,7 @@
     "cli",
     "common",
     "config",
+    "datapipes",
     "datasets",
     "eval",
     "learncurve",
 
@@ -58,6 +58,6 @@ def eval(toml_path: str | pathlib.Path) -> None:
         output_dir=cfg.eval.output_dir,
         num_workers=cfg.eval.num_workers,
         batch_size=cfg.eval.batch_size,
-        spect_scaler_path=cfg.eval.spect_scaler_path,
+        frames_standardizer_path=cfg.eval.frames_standardizer_path,
         post_tfm_kwargs=cfg.eval.post_tfm_kwargs,
     )
@@ -61,7 +61,7 @@ def learning_curve(toml_path):
         num_workers=cfg.learncurve.num_workers,
         results_path=results_path,
         post_tfm_kwargs=cfg.learncurve.post_tfm_kwargs,
-        normalize_spectrograms=cfg.learncurve.normalize_spectrograms,
+        standardize_frames=cfg.learncurve.standardize_frames,
         shuffle=cfg.learncurve.shuffle,
         val_step=cfg.learncurve.val_step,
         ckpt_step=cfg.learncurve.ckpt_step,
 
@@ -1,7 +1,7 @@
 import logging
 from pathlib import Path
 
-from .. import config
+from .. import common, config
 from .. import predict as predict_module
 from ..common.logging import config_logging_for_cli, log_version
 
@@ -33,7 +33,7 @@ def predict(toml_path):
         force=True,
     )
     log_version(logger)
-    logger.info("Logging results to {}".format(cfg.prep.output_dir))
+    logger.info("Logging results to {}".format(cfg.predict.output_dir))
 
     if cfg.predict.dataset.path is None:
         raise ValueError(
@@ -49,8 +49,8 @@ def predict(toml_path):
         checkpoint_path=cfg.predict.checkpoint_path,
         labelmap_path=cfg.predict.labelmap_path,
         num_workers=cfg.predict.num_workers,
-        timebins_key=cfg.prep.spect_params.timebins_key,
-        spect_scaler_path=cfg.predict.spect_scaler_path,
+        timebins_key=cfg.prep.spect_params.timebins_key if cfg.prep else common.constants.TIMEBINS_KEY,
+        frames_standardizer_path=cfg.predict.frames_standardizer_path,
         annot_csv_filename=cfg.predict.annot_csv_filename,
         output_dir=cfg.predict.output_dir,
         min_segment_dur=cfg.predict.min_segment_dur,
 
@@ -60,9 +60,9 @@ def train(toml_path):
         num_epochs=cfg.train.num_epochs,
         num_workers=cfg.train.num_workers,
         checkpoint_path=cfg.train.checkpoint_path,
-        spect_scaler_path=cfg.train.spect_scaler_path,
+        frames_standardizer_path=cfg.train.frames_standardizer_path,
         results_path=results_path,
-        normalize_spectrograms=cfg.train.normalize_spectrograms,
+        standardize_frames=cfg.train.standardize_frames,
         shuffle=cfg.train.shuffle,
         val_step=cfg.train.val_step,
         ckpt_step=cfg.train.ckpt_step,
 
@@ -5,7 +5,7 @@
 If a helper/utility function is only used in one module,
 it should live either in that module or another at the same level.
 See for example :mod:`vak.prep.prep_helper` or
-:mod:`vak.datsets.window_dataset._helper`.
+:mod:`vak.datasets.train_datapipe._helper`.
 """
 
 from . import (
 
@@ -1,4 +1,4 @@
-"""constants used by multiple modules.
+"""Constants used by multiple modules.
 Defined here to avoid circular imports.
 """
 
@@ -26,6 +26,7 @@
     "npz": np.load,
 }
 VALID_SPECT_FORMATS = list(SPECT_FORMAT_LOAD_FUNCTION_MAP.keys())
+DEFAULT_SPECT_FORMAT = "npz"
 
 # ---- valid types of training data, the $x$ that goes into a network
 VALID_X_SOURCES = {"audio", "spect"}
@@ -57,3 +58,7 @@
     "npz": SPECT_NPZ_EXTENSION,
     "mat": ".mat",
 }
+
+VALID_SPLITS = ("predict", "test", "train", "val")
+
+DEFAULT_BACKGROUND_LABEL = "background"
@@ -5,10 +5,12 @@
 import numpy as np
 import pandas as pd
 
-from . import annotation
+from . import annotation, constants
 
 
-def to_map(labelset: set, map_unlabeled: bool = True) -> dict:
+def to_map(
+        labelset: set, map_background: bool = True, background_label: str = constants.DEFAULT_BACKGROUND_LABEL
+) -> dict:
     """Convert set of labels to `dict`
     mapping those labels to a series of consecutive integers
     from 0 to n inclusive,
@@ -18,21 +20,31 @@ def to_map(labelset: set, map_unlabeled: bool = True) -> dict:
     from annotations of a vocalization into
     a label for every time bin in a spectrogram of that vocalization.
 
-    If ``map_unlabeled`` is True, then the label 'unlabeled'
-    will be added to labelset, and will map to 0,
+    If ``map_background`` is True, then a label
+    will be added to labelset representing a background class
+    (any segment that is not labeled).
+    The default for this label is
+    :const:`vak.common.constants.DEFAULT_BACKGROUND_LABEL`.
+    This string label will map to class index 0,
     so the total number of classes is n + 1.
 
     Parameters
     ----------
     labelset : set
         Set of labels used to annotate a dataset.
-    map_unlabeled : bool
-        If True, include key 'unlabeled' in mapping.
+    map_background : bool
+        If True, include key specified by
+        ``background_label`` in mapping.
         Any time bins in a spectrogram
         that do not have a label associated with them,
         e.g. a silent gap between vocalizations,
         will be assigned the integer
-        that the 'unlabeled' key maps to.
+        that the background key maps to.
+    background_label : str, optional
+        The string label applied to segments belonging to the
+        background class.
+        Default is
+        :const:`vak.common.constants.DEFAULT_BACKGROUND_LABEL`.
 
     Returns
     -------
@@ -45,11 +57,12 @@ def to_map(labelset: set, map_unlabeled: bool = True) -> dict:
         )
 
     labellist = []
-    if map_unlabeled is True:
-        labellist.append("unlabeled")
-
+    if map_background is True:
+        # NOTE we append background label *first*
+        labellist.append(background_label)
+    # **then** extend with the rest of the labels
     labellist.extend(sorted(list(labelset)))
-
+    # so that background_label maps to class index 0 by default in next line
     labelmap = dict(zip(labellist, range(len(labellist))))
     return labelmap
 
@@ -124,7 +137,7 @@ def from_df(
 
 # added to fix https://github.com/NickleDave/vak/issues/373
 def multi_char_labels_to_single_char(
-    labelmap: dict, skip: tuple[str] = ("unlabeled",)
+    labelmap: dict, skip: tuple[str] = (constants.DEFAULT_BACKGROUND_LABEL,)
 ) -> dict:
     """Return a copy of a ``labelmap`` where any
     labels that are strings with multiple characters
@@ -146,9 +159,9 @@ def multi_char_labels_to_single_char(
         to integers. As returned by
         ``vak.labels.to_map``.
     skip : tuple
-        Of strings, labels to leave
-        as multiple characters.
-        Default is ('unlabeled',).
+        A tuple of labels to leave as multiple characters.
+        Default is a tuple containing just
+        :const:`vak.common.constants.DEFAULT_BACKGROUND_LABEL`.
 
     Returns
     -------
 
@@ -25,7 +25,6 @@
 from .train import TrainConfig
 from .trainer import TrainerConfig
 
-
 __all__ = [
     "config",
     "dataset",
 
@@ -7,6 +7,8 @@
 import attr.validators
 from attr import asdict, define, field
 
+from ..common.converters import expanded_user_path
+
 
 @define
 class DatasetConfig:
@@ -31,9 +33,9 @@ class DatasetConfig:
         Default is None.
     """
 
-    path: pathlib.Path = field(converter=pathlib.Path)
+    path: pathlib.Path = field(converter=expanded_user_path)
     splits_path: pathlib.Path | None = field(
-        converter=attr.converters.optional(pathlib.Path), default=None
+        converter=attr.converters.optional(expanded_user_path), default=None
     )
     name: str | None = field(
         converter=attr.converters.optional(str), default=None
 
@@ -110,8 +110,8 @@ class EvalConfig:
         Argument to torch.DataLoader. Default is 2.
     labelmap_path : str
         path to 'labelmap.json' file.
-    spect_scaler_path : str
-        path to a saved SpectScaler object used to normalize spectrograms.
+    frames_standardizer_path : str
+        path to a saved :class:`vak.transforms.FramesStandardizer` object used to standardize (normalize) frames.
         If spectrograms were normalized and this is not provided, will give
         incorrect results.
     post_tfm_kwargs : dict
@@ -152,7 +152,7 @@ class EvalConfig:
         converter=converters.optional(expanded_user_path), default=None
     )
     # optional, transform
-    spect_scaler_path = field(
+    frames_standardizer_path = field(
         converter=converters.optional(expanded_user_path),
         default=None,
     )
 
@@ -10,7 +10,12 @@
 from .train import TrainConfig
 from .trainer import TrainerConfig
 
-REQUIRED_KEYS = ("dataset", "model", "root_results_dir", "trainer",)
+REQUIRED_KEYS = (
+    "dataset",
+    "model",
+    "root_results_dir",
+    "trainer",
+)
 
 
 @define
@@ -45,9 +50,9 @@ class LearncurveConfig(TrainConfig):
         Argument to torch.DataLoader.
     shuffle: bool
         if True, shuffle training data before each epoch. Default is True.
-    normalize_spectrograms : bool
-        if True, use spect.utils.data.SpectScaler to normalize the spectrograms.
-        Normalization is done by subtracting off the mean for each frequency bin
+    standardize_frames : bool
+        if True, use :class:`vak.transforms.FramesStandardizer` to standardize the frames.
+        Normalization is done by subtracting off the mean for each row (e.g., frequency bin)
         of the training set and then dividing by the std for that frequency bin.
         This same normalization is then applied to validation + test data.
     val_step : int
@@ -75,6 +80,7 @@ class LearncurveConfig(TrainConfig):
         See the docstring of the transform for more details on
         these arguments and how they work.
     """
+
     post_tfm_kwargs = field(
         validator=validators.optional(are_valid_post_tfm_kwargs),
         converter=converters.optional(convert_post_tfm_kwargs),
@@ -91,7 +97,8 @@ def from_config_dict(cls, config_dict: dict) -> LearncurveConfig:
         by loading a valid configuration toml file with
         :func:`vak.config.parse.from_toml_path`,
         and then using key ``learncurve``,
-        i.e., ``LearncurveConfig.from_config_dict(config_dict['learncurve'])``."""
+        i.e., ``LearncurveConfig.from_config_dict(config_dict['learncurve'])``.
+        """
         for required_key in REQUIRED_KEYS:
             if required_key not in config_dict:
                 raise KeyError(
 
@@ -14,7 +14,6 @@
 from .model import ModelConfig
 from .trainer import TrainerConfig
 
-
 REQUIRED_KEYS = (
     "checkpoint_path",
     "dataset",
@@ -50,8 +49,8 @@ class PredictConfig:
     num_workers : int
         Number of processes to use for parallel loading of data.
         Argument to torch.DataLoader. Default is 2.
-    spect_scaler_path : str
-        path to a saved SpectScaler object used to normalize spectrograms.
+    frames_standardizer_path : str
+        path to a saved :class:`vak.transforms.FramesStandardizer` object used to standardize (normalize) frames.
         If spectrograms were normalized and this is not provided, will give
         incorrect results.
     annot_csv_filename : str
@@ -104,7 +103,7 @@ class PredictConfig:
     )
 
     # optional, transform
-    spect_scaler_path = field(
+    frames_standardizer_path = field(
         converter=converters.optional(expanded_user_path),
         default=None,
     )
Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,6 @@ def eval(toml_path: str \| pathlib.Path) -> None:`
`58`	`58`	`output_dir=cfg.eval.output_dir,`
`59`	`59`	`num_workers=cfg.eval.num_workers,`
`60`	`60`	`batch_size=cfg.eval.batch_size,`
`61`		`- spect_scaler_path=cfg.eval.spect_scaler_path,`
	`61`	`+ frames_standardizer_path=cfg.eval.frames_standardizer_path,`
`62`	`62`	`post_tfm_kwargs=cfg.eval.post_tfm_kwargs,`
`63`	`63`	`)`