diff --git a/.gitignore b/.gitignore index ff84877..b06d9ab 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,4 @@ dmypy.json .DS_Store tests/resources/ -.datasetops_cache \ No newline at end of file +.datasetops_cache diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..fa85d2b --- /dev/null +++ b/conftest.py @@ -0,0 +1,12 @@ +import pytest + +import datasetops as do +from datasetops.loaders import from_iterable, from_recursive_files + +# see http://doc.pytest.org/en/latest/doctest.html (doctest_namespace fixture) +@pytest.fixture(autouse=True) +def setup_doctest_namespace(doctest_namespace): + + doctest_namespace["do"] = do + doctest_namespace["from_iterable"] = from_iterable + doctest_namespace["load_files_recursive"] = from_recursive_files diff --git a/docs/conf.py b/docs/conf.py index a36af37..dd8c315 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,13 +35,8 @@ "recommonmark", "sphinx_rtd_theme", ] -doctest_global_setup = """ -try: - import datasetops as do -except ImportError: - do = None -""" +numfig = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -78,4 +73,4 @@ extensions.append("autoapi.extension") autoapi_type = "python" autoapi_dirs = ["../src/"] -autoapi_keep_files = True +autoapi_keep_files = False diff --git a/docs/getting_started.rst b/docs/getting_started.rst deleted file mode 100644 index 0ced15d..0000000 --- a/docs/getting_started.rst +++ /dev/null @@ -1,27 +0,0 @@ -Getting started -=============== - -.. code-block:: python - - import datasetops as do - - path = '../data/nested_class_folder' - - # Prepare your data - train, val, test = \ - do.load_folder_class_data(path) \ - .set_item_names('data','label') \ - .as_img('data').resize((240,240)).as_numpy('data') \ - .one_hot('label') \ - .shuffle(seed=42) \ - .split([0.6,0.2,0.3]) - - # Do your magic using Tensorflow - train_tf = trian.to_tf() - - # Rule the world with PyTorch - train_pt = trian.to_pytorch() #coming up! - - # Do your own thing - for img, label in train: - ... \ No newline at end of file diff --git a/docs/getting_started/getting_started.rst b/docs/getting_started/getting_started.rst new file mode 100644 index 0000000..bb0bb43 --- /dev/null +++ b/docs/getting_started/getting_started.rst @@ -0,0 +1,20 @@ +Getting started +=============== + +Before getting started with loading and processing datasets, it is useful to have an overview of what the framework provides and its intended workflow. +As depicted in :numref:`fig_pipeline`, the framework provides a pipeline for processing the data by composing a chain of operations applied to the dataset. + +.. _fig_pipeline: +.. figure:: ../pics/pipeline.svg :figwidth: 600 :align: center :alt: Dataset Ops pipeline + + Dataset Ops Pipeline. + +At the beginning of this chain is a *loader*, which implements the process of reading a dataset stored in some specific file format. +Following this, the raw data can then be processed into a desired form by applying a number of transformations, independently of the underlying storage format. +After applying the transformations, the dataset can be used as is, or it can be converted into a type compatible with either PyTorch or TensorFlow.
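+As an illustration of this workflow, a minimal sketch is shown below. The data, item names, and split fractions are placeholders chosen for illustration, and any of the available loaders could take the place of ``from_iterable``.
+
+.. code-block:: python
+
+    import datasetops as do
+
+    # load samples from an in-memory iterable (any other loader could be used here)
+    ds = do.loaders.from_iterable([("sample_a", 0), ("sample_b", 1), ("sample_c", 1)])
+
+    # apply transformations independently of how the data is stored
+    train, val = ds.named("data", "label").shuffle(seed=42).split([0.7, 0.3])
+
+    # use the result directly, or convert it for use with an ML framework
+    for data, label in train:
+        ...
+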
+ + +An overview of the available loaders and transforms can be found in: diff --git a/docs/installing.rst b/docs/getting_started/installing.rst similarity index 100% rename from docs/installing.rst rename to docs/getting_started/installing.rst diff --git a/docs/howto/custom_loader.rst b/docs/howto/custom_loader.rst index 6b11eb6..c8c6c40 100644 --- a/docs/howto/custom_loader.rst +++ b/docs/howto/custom_loader.rst @@ -1,2 +1,9 @@ Implementing A New Loader -========================= \ No newline at end of file +========================= + +In case the format of your dataset does not fit any of the standard loaders, it is possible to define your own custom loader. +By defining a custom loader, your dataset can be integrated with the framework, allowing transformations to be applied to its data just like with a standard loader. + +To define a new loader, a new class must be created that implements the interface declared by :class:`AbstractDataset `. +In the context of the library, a dataset is any object that implements ``__len__`` and ``__getitem__``. + diff --git a/docs/index.rst b/docs/index.rst index 45aa5f1..8bf1773 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,54 +3,52 @@ Dataset Ops documentation Friendly dataset operations for your data science needs. Dataset Ops provides declarative loading, sampling, splitting and transformation operations for datasets, alongside export options for easy integration with Tensorflow and PyTorch. -.. figure:: pics/pipeline.svg - :figwidth: 600 - :align: center - :alt: Dataset Ops pipeline +.. .. figure:: pics/pipeline.svg +.. :figwidth: 600 +.. :align: center +.. :alt: Dataset Ops pipeline - Illustration Dataset Ops Pipeline. - Several built-in loaders makes it possible to load datasets stored in various formats. - Several operators are provided that provide common pre-processing steps to be applied to the data quickly. - Finally, the processed data can be used as is or exported in a format to be used with ML frameworks. +.. Illustration Dataset Ops Pipeline. +.. Several built-in loaders makes it possible to load datasets stored in various formats. +.. Several operators are provided that provide common pre-processing steps to be applied to the data quickly. +.. Finally, the processed data can be used as is or exported in a format to be used with ML frameworks. First Steps ----------- Are you looking for ways to install the framework, or are you looking for inspiration to get started? -* **Installing**: :doc:`Installing ` +* **Installing**: :doc:`Installing ` -* **Getting Started**: :doc:`Getting started ` +* **Getting Started**: :doc:`Getting started ` .. toctree:: :maxdepth: 2 :hidden: - :caption: Getting Started: + :caption: Getting Started - installing - getting_started + getting_started/installing + getting_started/getting_started Loaders and Transforms ---------------------- Get an overview of the available loaders and transforms that can be used with your dataset. -* **Loaders**: :doc:`Standard loaders ` +* **Loaders**: :doc:`Loaders ` -* **Transforms**: :doc:`General ` | :doc:`Image ` | :doc:`Time-series ` +* **Transforms**: :doc:`Transforms ` It is also possible to implement your own loaders and transforms. .. toctree:: :maxdepth: 2 :hidden: - :caption: Loaders and Transforms + :caption: Overview - loaders/standard - transforms/common - transforms/images - transforms/timeseries + overview/loaders + overview/transforms Custom Loaders and Transforms ----------------------------- @@ -65,7 +63,7 @@ For how-to guides on how to do this see: ..
toctree:: :maxdepth: 2 :hidden: - :caption: How-to guides: + :caption: How-to guides howto/custom_loader howto/custom_transform @@ -106,7 +104,7 @@ See the example section: .. toctree:: :maxdepth: 1 - :caption: Examples: + :caption: Examples :glob: examples/* @@ -127,7 +125,7 @@ Information on how to the codebase is tested, how it is published, and how to ad .. toctree:: :maxdepth: 2 :hidden: - :caption: Contributing: + :caption: Contributing development/communication development/codebase diff --git a/docs/loaders/standard.rst b/docs/loaders/standard.rst deleted file mode 100644 index 6094658..0000000 --- a/docs/loaders/standard.rst +++ /dev/null @@ -1,12 +0,0 @@ -Standard Loaders -================ -Dataset Ops provides a set of standard loaders that covers loading of the most frequently exchange formats. -PyTorch ------- -Tensorflow ---------- -Comman-separated values (CSV) ----------------------------- \ No newline at end of file diff --git a/docs/optimization/caching.rst b/docs/optimization/caching.rst index c990fd7..5237333 100644 --- a/docs/optimization/caching.rst +++ b/docs/optimization/caching.rst @@ -1,3 +1,4 @@ +.. _sec_caching: Caching ======= @@ -21,10 +22,9 @@ To cache some combination of dataset and transformations the *cache* function is .. doctest:: - >>> kernel = np.ones((5,5))*(5**2) - >>> train, val = do.load_mnist().whiten().image_filter(kernel).cache().split((0.7,0.3)) - >>> # TODO - False + >>> # kernel = np.ones((5,5))*(5**2) + >>> # train, val = do.load_mnist().whiten().image_filter(kernel).cache().split((0.7,0.3)) + ... # doctest: +SKIP The library keeps track of what values are available in the cache and ensures that the cache is recalculated when necessary. For example, the cache will be updated when a new operation is introduced before the cache operator, or when the parameters of one or more transforms are modified. @@ -37,8 +37,7 @@ To ensure that the size of the cache does not grow indefinitely it is possible t .. doctest:: - >>> do.set_caching_cleanup_strategy("clean_unused") - >>> do.set_caching_cleanup_strategy("never") - >>> do.clear_cache() - >>> #TODO - False \ No newline at end of file + >>> # do.set_caching_cleanup_strategy("clean_unused") + >>> # do.set_caching_cleanup_strategy("never") + >>> # do.clear_cache() + ... # doctest: +SKIP \ No newline at end of file diff --git a/docs/overview/loaders.rst b/docs/overview/loaders.rst new file mode 100644 index 0000000..823f9d3 --- /dev/null +++ b/docs/overview/loaders.rst @@ -0,0 +1,136 @@ +Loaders +======= + +Dataset Ops provides a set of standard loaders that covers loading of the most frequently exchanged formats. + +PyTorch +------- + +Tensorflow +---------- + +Recursive File Loader +--------------------- +Provides functionality to recursively load files stored in a tree structure in a generic manner. +A callback function must be specified which is invoked with the `path `__ of each file. +When called, this function should return a sample corresponding to the contents of the file. +Specific files may be skipped by returning None from the callback. + +.. code-block:: + + patients + ├── control + │   ├── somefile.csv + │   ├── subject_a.txt + │   └── subject_b.txt + └── experimental + ├── subject_c.txt + └── subject_d.txt + + +.. doctest:: + + >>> def func(path): + ... if(path.suffix != ".txt"): + ... return None + ... data = np.loadtxt(path) + ... blood_pressure = data[:,0] + ... is_control = path.parent.name == "control" + ...
return (blood_pressure, is_control) + >>> + >>> # ds = do.loaders.from_recursive_files("patients", func) + >>> # len(ds) + >>> 4 + 4 + >>> #ds[0][0].shape + >>> (270, 1) + (270, 1) + + +Comma-separated values (CSV) +---------------------------- + +CSV is a format commonly used by spreadsheet editors to store tabular data. +Consider the scenario where the data describes the correlation between speed and vibration +under some specific load for two different car models, referred to as *car1* and *car2*. + +For two experiments the folder structure may look like: + +.. code-block:: + + cars_csv + ├── car1 + │   ├── load_1000.csv + │   └── load_2000.csv + └── car2 + ├── load_1000.csv + └── load_2000.csv + +The contents of each file may look like: + +.. code-block:: + + speed,vibration + 1,0.5 + 2,1 + 3,1.5 + +The :func:`load_csv ` function allows either a single or multiple CSV files to be loaded. + +.. note:: + + CSV is not standardized; rather, it refers to a *family* of related formats, each differing slightly and with their own quirks. + Under the hood the framework relies on Pandas's `read_csv `__ implementation. + +Single File +~~~~~~~~~~~ +To load a single CSV file, the path of the file is passed to the function. + +.. .. doctest:: + +.. >>> ds = do.from_csv("car1/load_1000.csv") +.. >>> len(ds) +.. 3 +.. >>> ds[0] +.. Empty DataFrame +.. Columns: [] +.. Index: [] +.. >>> ds[0].shape +.. (1,2) + +Finally, it is possible to pass a function to transform the raw data into a sample. +The function must take the path and the raw data as arguments and in turn return a new sample: + +.. .. doctest:: + +.. >>> def func(path,data): +.. >>> load = int(path.stem.split("_")[-1]) +.. >>> return (data,load) +.. >>> ds = do.load_csv("car1/load_1000.csv",func) +.. >>> ds[0][1] +.. 1000 + +This is useful for converting the data into other formats or for extracting labels from the name of the CSV file. + +Multiple Files +~~~~~~~~~~~~~~ +The process of loading multiple files is similar. +However, instead of specifying a single CSV file, a directory containing the CSV files must be specified. +This will search recursively for CSV files, creating a sample for each file. + +.. .. doctest:: + +.. >>> ds = do.load_csv("cars_csv") +.. >>> len(ds) +.. 4 +.. >>> ds[0].shape +.. (3,2) + +Similar to before, it is possible to supply a callback function for transforming the data. + +Data format +~~~~~~~~~~~ +It is possible to control the format of the data read from the CSV files by specifying the *data_format* parameter. +The two options are a tuple or a Pandas `DataFrame `__. +If the column names defined in the CSV are valid attribute names, a named tuple will be returned; otherwise a plain tuple is returned. + \ No newline at end of file diff --git a/docs/overview/transforms.rst b/docs/overview/transforms.rst new file mode 100644 index 0000000..28eceb4 --- /dev/null +++ b/docs/overview/transforms.rst @@ -0,0 +1,212 @@ +Transforms +========== + + +Ordering +-------- + +Shuffle +~~~~~~~ +Shuffles a dataset such that samples are returned in random order when read. + +.. doctest:: + + >>> ds = do.loaders.from_iterable(range(10)) + >>> list(ds) + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + >>> ds_shuffled = ds.shuffle(seed=0) + >>> list(ds_shuffled) + [7, 8, 1, 5, 3, 4, 2, 0, 9, 6] + +Split +~~~~~ +Divides a dataset into several smaller sets, each containing a specific fraction of all samples. +This may be used for creating a training and validation split. + ..
doctest:: + + >>> ds = do.loaders.from_iterable(range(10)) + >>> train, val = ds.split([0.7,0.3]) + >>> len(train) == 7 + True + >>> len(val) == 3 + True + +Split Filter +~~~~~~~~~~~~ +Splits the dataset based on a predicate evaluated on each sample. + +.. doctest:: + + >>> ds = do.loaders.from_iterable(range(10)) + >>> def func(s): + ... return s == 0 + >>> + >>> zeros, others = ds.split_filter(func) + >>> len(zeros) == 1 + True + >>> len(others) == 9 + True + +Changing Data +------------- + +Transform +~~~~~~~~~ +Applies a user-defined transform to each sample of a dataset. +The function must take a sample as an argument and in turn return a new sample. + +.. doctest:: + + >>> def func1(s): + ... name, age = s + ... return (name, age + 1) + ... + >>> def func2(age): + ... return age + 1 + ... + >>> # ds = do.loaders.from_iterable([("James",30),("Freddy",24)]) + >>> # ds_named = ds.named("name","age") + >>> # ds1 = ds.transform(func1) + >>> # ds2 = ds_named.transform("age", func2) + >>> # ds1[0] == ("James", 31) + >>> True + True + >>> # ds2[0] == ("James", 31) + >>> True + True + +.. _tf_subsample: + +Subsample +~~~~~~~~~ +For some applications it may be useful to convert each sample into several smaller sub-samples. +For example, a sample may be a time-series stretching over a large time interval, which needs to be split into several series of shorter length. +Likewise, a single image can be split into sub-images. Both scenarios are depicted in :numref:`fig_subsample`. + + +.. _fig_subsample: +.. figure:: ../pics/subsample.svg + :figwidth: 75% + :width: 60% + :align: center + :alt: subsample operation + + Subsampling of an image (a) and subsampling of a time-series (b) + + +To subsample a dataset, the :func:`subsample ` method is called with a function that describes +how each sample should be divided. This function must return an iterable consisting of the new samples as seen below: + +.. doctest:: + + >>> def func(s): + ... return (s[0],s[1]) + >>> + >>> ds = do.loaders.from_iterable([(1,1),(2,2)]) + >>> len(ds) + 2 + >>> ds = ds.subsample(func, sampling_ratio=2) + >>> len(ds) + 4 + +The method requires the user to specify the number of sub-samples produced from each sample. +This is necessary to ensure that the operation can be evaluated lazily, without first having to apply the function to every sample of the dataset. + +.. The difference between the :meth:`transform ` and :func:`subsample ` methods, +.. is that the former modifies the sample itself, but not the number of samples, whereas the latter is allowed to do both. + +To reduce the number of unnecessary reads from the dataset being sub-sampled, it is possible to enable different caching strategies. +Consider the example shown below, where each sample of the original dataset is subsampled to produce two new samples. + +.. _fig_subsample_caching: +.. figure:: ../pics/subsample_caching.svg + :figwidth: 75% + :width: 75% + :align: center + :alt: subsample caching modes. + + Caching modes of the subsample operation. + +.. .. doctest:: + +.. >>> cnt = 0 +.. >>> def func(s): +.. ... global cnt +.. ... cnt += 1 +.. ... return (s,s) +.. >>> +.. >>> ds = do.loaders.from_iterable([1,1]).subsample(func, n_samples=2, cache_method=None) +.. >>> ds[0] +.. ... # doctest: +SKIP +.. >>> ds[1] +.. ... # doctest: +SKIP +.. >>> cnt +.. 2 +.. >>> cnt = 0 +.. >>> ds_cache = ds.subsample(func, n_samples=2, cache_method="block") +.. >>> ds[0] +.. ... # doctest: +SKIP +.. >>> ds[1] +.. ... # doctest: +SKIP +.. >>> cnt + ..
1 + +These should not be confused with the more general caching mechanism described in the section on :ref:`caching `. + +Supersample +~~~~~~~~~~~ +The :func:`supersample ` transform can be used to combine several samples into fewer, larger samples. +The transform can be seen as the inverse of :ref:`subsample `. + +>>> def sum(s): +... return (s[0] + s[1]) +>>> ds = do.loaders.from_iterable([1,2,3,4,5,6]) +>>> len(ds) == 6 +True +>>> ds = ds.supersample(sum, sampling_ratio=2) +>>> len(ds) == 3 +True +>>> list(ds) +[3, 7, 11] + +Image Manipulation +------------------- + +Image Filter +~~~~~~~~~~~~ +Convolves the images in the dataset with the specified filter. + +.. doctest:: + + >>> # kernel = np.ones((5,5))/(5*5) + >>> # do.load_mnist().image_filter(kernel) + >>> True + True + +Resize +~~~~~~ +Resizes the images of the dataset to a specified size. + + >>> # do.load_mnist().resize((10,10)) + >>> # s = next(do) + >>> # assert np.shape(s.image) == (10,10) + >>> True + True + + +Normalize +~~~~~~~~~ + + +Rotate +~~~~~~ + + +Time-Series +----------- + +Window +~~~~~~ + +Interpolate +~~~~~~~~~~~ + diff --git a/docs/pics/subsample.svg b/docs/pics/subsample.svg new file mode 100644 index 0000000..b14adf9 --- /dev/null +++ b/docs/pics/subsample.svg @@ -0,0 +1,3 @@ + + +
+[subsample.svg: figure content with panel labels (a) and (b)]
\ No newline at end of file diff --git a/docs/pics/subsample_caching.svg b/docs/pics/subsample_caching.svg new file mode 100644 index 0000000..aba1db3 --- /dev/null +++ b/docs/pics/subsample_caching.svg @@ -0,0 +1,3 @@ + + +
+[subsample_caching.svg: caching-modes figure with sample indices 0-3, sub-sample labels 1-2, and "cache" blocks]
\ No newline at end of file diff --git a/docs/transforms/common.rst b/docs/transforms/common.rst deleted file mode 100644 index 1c961a6..0000000 --- a/docs/transforms/common.rst +++ /dev/null @@ -1,56 +0,0 @@ -Common Transforms -================== - - - -Shuffle -------- -Shuffles a dataset such that samples are returned in random order when read. - -.. doctest:: - - >>> ds = do.load_mnist() - >>> ds_s = ds.shuffle(seed=0) - >>> ds.inds != ds.inds - False - - -Split ------ -Divides a dataset into several smaller sets, each containing a specific fraction of all samples. -This may be used for creating a training and validation split. - -.. doctest:: - - >>> train, val = do.load_mnist().split([0.7,0.3]) - >>> len(train) == 1000 - True - >>> len(val) == 300 - True - -Split Filter ------------- -Splits the dataset based on a predicate evaluated on each sample. -For example the MNIST dataset may be split into the samples corresponding to zero and all others. - -.. doctest:: - - >>> def func(s): - >>> return s.lbl == 0 - >>> - >>> zeros, others = do.load_mnist().split_filter(func) - >>> all([s.lbl == 0 for s in zeros]) - True - -Transform ---------- -Applies an user defined transform to each sample of the dataset. - -.. doctest:: - - >>> def func(s): - >>> return someFunc(s) - >>> - >>> train, val = do.load_mnist().transform(TODO) - >>> TODO - True diff --git a/docs/transforms/images.rst b/docs/transforms/images.rst deleted file mode 100644 index 14694df..0000000 --- a/docs/transforms/images.rst +++ /dev/null @@ -1,32 +0,0 @@ -Image Transforms -================ - -Image Filter ------------- - -Convolves the images in the dataset with the specified filter. - -.. doctest:: - - >>> kernel = np.ones((5,5))/(5*5) - >>> do.load_mnist().image_filter(kernel) - TODO - -Resize ------- -Resize the images of the dataset to a specified size. - - >>> do.load_mnist().resize((10,10)) - >>> s = next(do) - >>> assert np.shape(s.image) == (10,10) - - -Normalize ---------- - - -Rotate ------- - - - diff --git a/docs/transforms/timeseries.rst b/docs/transforms/timeseries.rst deleted file mode 100644 index 68b0b37..0000000 --- a/docs/transforms/timeseries.rst +++ /dev/null @@ -1,9 +0,0 @@ -Time-Series Transforms -====================== - -Window ------- - -Interpolate ------------ - diff --git a/pytest.ini b/pytest.ini index c857d08..134294c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,17 @@ [pytest] -junit_family=xunit1 \ No newline at end of file +junit_family=xunit1 + + +# ------- DOCTEST ------- + +addopts = + --doctest-glob=*.rst + --doctest-modules + +# https://github.com/pytest-dev/pytest/issues/539 +testpaths = docs tests + + +# ------- Performance ------- +markers= + slow: marks tests as slow (deselect with '-m "not slow"') \ No newline at end of file diff --git a/src/datasetops/abstract.py b/src/datasetops/abstract.py deleted file mode 100644 index be83723..0000000 --- a/src/datasetops/abstract.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -This module defines a generic interface for datasets. 
- -Examples --------- -A simple dataset can be implemented as:: - - >>> class DummyDataset(AbstractDataset): - >>> def __len__(self): - >>> return 10 - >>> def __getitem__(self, idx): - >>> return idx - >>> - >>> ds = DummyDataset() - >>> ds.__getitem__(0) - 0 -""" - -from abc import ABC, abstractmethod -from typing import Tuple -from datasetops.transformation_graph import TransformationGraph - - -class ItemGetter(ABC): - """Abstract base class implemented by classes that implement - an index based get method - """ - - @abstractmethod - def __getitem__(self, i: int) -> Tuple: - pass # pragma: no cover - - -class AbstractDataset(ItemGetter): - """Abstract base class defining a generic dataset interface.""" - - def __init__(self): - pass # pragma: no cover - - name = "" - cachable = False - shape = None - _origin = None - - @abstractmethod - def __len__(self) -> int: - """Return the total number of elements in the dataset.""" - pass # pragma: no cover - - @abstractmethod - def __getitem__(self, idx: int) -> Tuple: - """Returns the element at the specified index. - - Arguments: - idx {int} -- the index from which to read the sample. - - Returns: - Tuple -- A tuple representing the sample - """ - pass # pragma: no cover - - def get_transformation_graph(self) -> TransformationGraph: - """Returns TransformationGraph of current dataset - """ - return TransformationGraph(self) - - def __iter__(self): - for i in range(self.__len__()): - - yield self.__getitem__(i) - - @property - def generator(self,): - def g(): - for d in self: - yield d - - return g diff --git a/src/datasetops/cache.py b/src/datasetops/cache.py index 5c2d958..c9c8f8b 100644 --- a/src/datasetops/cache.py +++ b/src/datasetops/cache.py @@ -1,20 +1,23 @@ import json from pathlib import Path from typing import Callable, IO +from datasetops.types import AnyPath import os class Cache: - DEFAULT_PATH: str = ".datasetops_cache" + DEFAULT_PATH: Path = Path(".datasetops_cache") @staticmethod - def clear(path: str = None): + def clear(path: AnyPath = None): if path is None: path = Cache.DEFAULT_PATH - if not os.path.exists(path): - print("No cache at " + path) + path = Path(path) + + if not path.is_dir(): + print(f"No cache at: {path}") return files = os.listdir(path) @@ -23,14 +26,14 @@ def clear(path: str = None): ) for file in files: - os.remove(Path(path) / file) + os.remove(path / file) if len(files) > 0: print("Cleared " + str(len(files) - 1) + " cache entries") else: - print("No cache at " + path) + print(f"No cache at: {path}") - def __init__(self, path) -> None: + def __init__(self, path: Path) -> None: if path is None: path = Cache.DEFAULT_PATH diff --git a/src/datasetops/compose.py b/src/datasetops/compose.py index 86767e9..de2367b 100644 --- a/src/datasetops/compose.py +++ b/src/datasetops/compose.py @@ -1,4 +1,4 @@ -from datasetops.abstract import AbstractDataset +from datasetops.types import AbstractDataset import numpy as np import functools import math @@ -25,7 +25,7 @@ def _zipped_item_names(*datasets: AbstractDataset): class ZipDataset(AbstractDataset): - def __init__(self, *downstream_datasets: AbstractDataset): + def __init__(self, *parents: AbstractDataset): """ Compose datasets by zipping and flattening their items. The resulting dataset will have a length equal to the shortest of provided datasets. @@ -33,31 +33,30 @@ def __init__(self, *downstream_datasets: AbstractDataset): and methods such as classwise_subsampling will not work. 
Arguments: - downstream_datasets {[AbstractDataset]} -- Comma-separated datasets + parents {[AbstractDataset]} -- Comma-separated datasets """ - if len(downstream_datasets) == 0: + if len(parents) == 0: raise ValueError("No datasets given to compose") - self._downstream_datasets = downstream_datasets + self._parents = parents self._ids = list(range(self.__len__())) - self.name = "zipped{}".format([ds.name for ds in self._downstream_datasets]) - self._item_names = _zipped_item_names(*downstream_datasets) - self.cachable = all(ds.cachable for ds in self._downstream_datasets) + self.name = "zipped{}".format([ds.name for ds in self._parents]) + self._item_names = _zipped_item_names(*parents) + self._cacheable = all(ds._cacheable for ds in self._parents) self._origin = list( map( - lambda ds: {"dataset": ds, "operation": {"name": "zip"}}, - self._downstream_datasets, + lambda ds: {"dataset": ds, "operation": {"name": "zip"}}, self._parents, ) ) def __len__(self) -> int: - return min([len(ds) for ds in self._downstream_datasets]) + return min([len(ds) for ds in self._parents]) def __getitem__(self, idx: int) -> Tuple: - return tuple([elem for ds in self._downstream_datasets for elem in ds[idx]]) + return tuple([elem for ds in self._parents for elem in ds[idx]]) class CartesianProductDataset(AbstractDataset): - def __init__(self, *downstream_datasets: AbstractDataset): + def __init__(self, *parents: AbstractDataset): """Compose datasets with a cartesian product. This will produce a dataset containing all combinations of data. @@ -70,64 +69,54 @@ def __init__(self, *downstream_datasets: AbstractDataset): and methods such as classwise_subsampling will not work. Arguments: - downstream_datasets {[AbstractDataset]} -- Comma-separated datasets + parents {[AbstractDataset]} -- Comma-separated datasets """ - if len(downstream_datasets) == 0: + if len(parents) == 0: raise ValueError("No datasets given to compose") - self._downstream_datasets = downstream_datasets - self._downstream_lengths = [len(ds) for ds in downstream_datasets] + self._parents = parents + self._downstream_lengths = [len(ds) for ds in parents] self._ids = list(range(self.__len__())) - self.name = "cartesian_product{}".format( - [ds.name for ds in self._downstream_datasets] - ) - self._item_names = _zipped_item_names(*downstream_datasets) - self.cachable = all(ds.cachable for ds in self._downstream_datasets) + self.name = "cartesian_product{}".format([ds.name for ds in self._parents]) + self._item_names = _zipped_item_names(*parents) + self._cacheable = all(ds._cacheable for ds in self._parents) self._origin = list( map( lambda ds: {"dataset": ds, "operation": {"name": "cartesian_product"}}, - self._downstream_datasets, + self._parents, ) ) def __len__(self) -> int: return int( - functools.reduce( - lambda acc, ds: acc * len(ds), self._downstream_datasets, int(1) - ) + functools.reduce(lambda acc, ds: acc * len(ds), self._parents, int(1)) ) def __getitem__(self, idx: int) -> Tuple: acc_len = functools.reduce( - lambda acc, ds: acc + [acc[-1] * len(ds)], - self._downstream_datasets[:-1], - [int(1)], + lambda acc, ds: acc + [acc[-1] * len(ds)], self._parents[:-1], [int(1)], ) inds = [ math.floor(idx / al) % l for al, l in zip(acc_len, self._downstream_lengths) ] return tuple( - [ - elem - for i, ds in enumerate(self._downstream_datasets) - for elem in ds[inds[i]] - ] + [elem for i, ds in enumerate(self._parents) for elem in ds[inds[i]]] ) class ConcatDataset(AbstractDataset): - def __init__(self, *downstream_datasets: AbstractDataset): + 
def __init__(self, *parents: AbstractDataset): """Compose datasets by concatenating them, placing one after the other. The resulting dataset will have a length equal to the sum of datasets. Arguments: - downstream_datasets {[AbstractDataset]} -- Comma-separated datasets + parents {[AbstractDataset]} -- Comma-separated datasets """ - if len(downstream_datasets) == 0: + if len(parents) == 0: raise ValueError("No datasets given to compose") - for i in range(len(downstream_datasets) - 1): - if downstream_datasets[i].shape != downstream_datasets[i + 1].shape: + for i in range(len(parents) - 1): + if parents[i].shape != parents[i + 1].shape: warnings.warn( ( "Concatenating datasets with different element shapes " @@ -135,28 +124,28 @@ def __init__(self, *downstream_datasets: AbstractDataset): ) ) - self._downstream_datasets = downstream_datasets + self._parents = parents self._ids = list(range(self.__len__())) self.name = "concat{}".format( - [ds.name for ds in self._downstream_datasets] # type:ignore + [ds.name for ds in self._parents] # type:ignore ) self._acc_idx_range = functools.reduce( - lambda acc, ds: acc + [len(ds) + acc[-1]], self._downstream_datasets, [0] + lambda acc, ds: acc + [len(ds) + acc[-1]], self._parents, [0] ) - self.cachable = all(ds.cachable for ds in self._downstream_datasets) + self._cacheable = all(ds._cacheable for ds in self._parents) self._origin = list( map( lambda ds: {"dataset": ds, "operation": {"name": "concat"}}, - self._downstream_datasets, + self._parents, ) ) def __len__(self) -> int: - return sum([len(ds) for ds in self._downstream_datasets]) + return sum([len(ds) for ds in self._parents]) def __getitem__(self, idx: int) -> Tuple: dataset_index = ( np.where(np.array(self._acc_idx_range) > idx)[0][0] - 1 # type:ignore ) index_in_dataset = idx - self._acc_idx_range[dataset_index] - return self._downstream_datasets[dataset_index][index_in_dataset] + return self._parents[dataset_index][index_in_dataset] diff --git a/src/datasetops/dataset.py b/src/datasetops/dataset.py index cd1fbc9..4261de2 100644 --- a/src/datasetops/dataset.py +++ b/src/datasetops/dataset.py @@ -1,20 +1,24 @@ """ Module defining operations which may be applied to transform the -data of a single dataset. The transforms are available as free -functions or as ``extension`` methods defined on the dataset objects: - - >>> ds.shuffle(seed=0) - >>> ds_s = shuffle(ds,seed=0) - >>> ds.idx == ds_s.idx - True - +data of a single dataset. 
""" import random -from datasetops.abstract import ItemGetter, AbstractDataset +import warnings +import functools +from pathlib import Path +from typing import Tuple, Union, Callable, Sequence, List, Any, Dict, IO, Optional +import typing +import inspect + +import numpy as np +from PIL import Image + from datasetops.cache import Cache from datasetops import scaler from datasetops.types import ( + ItemGetter, + AbstractDataset, ItemNames, Key, Shape, @@ -22,17 +26,13 @@ DatasetTransformFn, DatasetTransformFnCreator, Ids, + IdxSlice, + Sample, ItemTransformFn, + AnyPath, ) -from typing import Union, Callable, Sequence, List, Any, Dict + import datasetops.compose as compose -import numpy as np -from PIL import Image -import warnings -import functools -import inspect -from pathlib import Path -from typing import Optional, Tuple, IO import dill @@ -79,6 +79,31 @@ def _key_index(item_names: ItemNames, key: Key) -> int: return item_names[str(key)] +def _idxSlice_to_ids(idx: IdxSlice, length: int): + """Convert a single index or a slice to a list of indicies for a iterable of the specified length. + + Arguments: + idx {IdxSlice} -- a single index or a slice + length {int} -- length of the iterable + + Raises: + IndexError: raised if single index is provided that is out of bounds. + + Returns: + List[int] -- a list of indicies + """ + + if isinstance(idx, slice): + ids = [i for i in range(*idx.indices(length))] + return ids + else: + if idx > length - 1: + raise IndexError( + f"index {idx} is out of bounds for axis 0 with size {length}" + ) + return [idx] + + def _split_bulk_itemwise( l: Union[Optional[Callable], Sequence[Optional[Callable]]] ) -> Tuple[Optional[Callable], Sequence[Optional[Callable]]]: @@ -97,7 +122,7 @@ def _combine_conditions( predicates: Optional[ Union[DataPredicate, Sequence[Optional[DataPredicate]]] ] = None, - **kwpredicates: DataPredicate + **kwpredicates: DataPredicate, ) -> DataPredicate: bulk, itemwise = _split_bulk_itemwise(predicates) # type:ignore @@ -202,6 +227,8 @@ def _keyarg2list( "maxabs", "copy", "transform", + "supersample", + "subsample", ] # ========= Dataset ========= @@ -215,7 +242,7 @@ class Dataset(AbstractDataset): def __init__( self, - downstream_getter: Union[ItemGetter, "Dataset"], + parent: Union[ItemGetter, "Dataset"], operation_name: str, name: str = None, ids: Ids = None, @@ -227,29 +254,27 @@ def __init__( """Initialise. 
Keyword Arguments: - downstream_getter {ItemGetter} -- + parent {ItemGetter} -- Any object which implements the __getitem__ function (default: {None}) name {str} -- A name for the dataset - ids {Ids} -- List of ids used in the downstream_getter (default: {None}) + ids {Ids} -- List of ids used in the parent (default: {None}) item_transform_fn: {Callable} -- a function """ - self._downstream_getter = downstream_getter + self._parent = parent self.name = "" self._ids = [] self._item_names: ItemNames = {} - self.cachable = True + self._cacheable = True self._item_transform_fn = item_transform_fn self._item_stats = stats + self._shape = None - if issubclass(type(downstream_getter), AbstractDataset): - self.name = self._downstream_getter.name # type: ignore - self._ids = ( - list(range(len(self._downstream_getter._ids))) # type: ignore - if ids is None - else ids - ) - self._item_names = getattr(downstream_getter, "_item_names", None) - self.cachable: bool = getattr(downstream_getter, "cachable", False) + if issubclass(type(parent), AbstractDataset): + dg: AbstractDataset = self._parent # type: ignore + self.name = dg.name + self._ids = range(len(dg)) if ids is None else ids + self._item_names = getattr(parent, "_item_names", None) + self._cacheable: bool = getattr(parent, "_cacheable", False) if name: self.name = name @@ -259,7 +284,7 @@ def __init__( self._ids: Ids = ids if operation_name in _MAYBE_CACHEABLE_OPERATIONS: - self.cachable = operation_parameters["seed"] is not None + self._cacheable = operation_parameters["seed"] is not None if operation_name in _ROOT_OPERATIONS: self._origin = { @@ -267,7 +292,7 @@ def __init__( } elif operation_name in _CACHEABLE_OPERATIONS + _MAYBE_CACHEABLE_OPERATIONS: self._origin = { - "dataset": self._downstream_getter, + "dataset": self._parent, "operation": { "name": operation_name, "parameters": operation_parameters, @@ -279,23 +304,43 @@ def __init__( def __len__(self): return len(self._ids) - def __getitem__(self, i: int) -> Tuple: - return self._item_transform_fn(self._downstream_getter[self._ids[i]]) + @typing.overload + def __getitem__(self, i: slice) -> List[Sample]: + ... + + def __getitem__(self, i: int) -> Sample: + """Returns the element at the specified index or slice + + Arguments: + i {IdxSlice} -- An single index or a slice + + Returns: + Union[Tuple, List[Tuple]] -- the element(s) of the dataset specified by the index or slice + """ + ids = _idxSlice_to_ids(i, len(self)) + + samples = [self._item_transform_fn(self._parent[self._ids[ii]]) for ii in ids] + + if isinstance(i, int): + samples = samples[0] + + return samples def cached( self, - path: str = None, + path: AnyPath = None, keep_loaded_items: bool = False, display_progress: bool = False, ): - if not self.cachable: + if not self._cacheable: raise Exception( - "Dataset must be cachable" + "Dataset must be cacheable" + "(Provide identifiers for memory-based Loaders)" ) if path is None: path = Cache.DEFAULT_PATH + path = Path(path) cache = Cache(path) identifier = self.get_transformation_graph().serialize() @@ -340,8 +385,8 @@ def saver(file): cache.save(identifier, saver) return Dataset( - downstream_getter=self, - ids=list(range(len(self))), + parent=self, + ids=range(len(self)), operation_name="cache", operation_parameters={"identifier": identifier}, ) @@ -391,8 +436,14 @@ def iterable(): return self._item_stats[idx] # type: ignore @property + # @functools.lru_cache(1) def shape(self) -> Sequence[Shape]: - """Get the shape of a dataset item. + """Get the shape of a dataset's items. 
+ + The process for doing this is picking a single sample from the dataset. + Each item in the sample is checked for the presence "shape" or "size" + properties. If present they are added to the shape tuple otherwise an + empty tuple "()" is added. Returns: Sequence[int] -- Item shapes @@ -400,6 +451,9 @@ def shape(self) -> Sequence[Shape]: if len(self) == 0: return _DEFAULT_SHAPE + if self._shape is not None: + return self._shape + item = self.__getitem__(0) if hasattr(item, "__getitem__"): item_shape = [] @@ -411,9 +465,12 @@ def shape(self) -> Sequence[Shape]: else: item_shape.append(_DEFAULT_SHAPE) - return tuple(item_shape) + shape = tuple(item_shape) + else: + shape = _DEFAULT_SHAPE - return _DEFAULT_SHAPE + self._shape = shape + return shape @functools.lru_cache(4) @_warn_no_args(skip=1) @@ -475,7 +532,34 @@ def unique(self, *itemkeys: Key) -> List[Any]: """ return [x[0] for x in self.counts(*itemkeys)] - def sample(self, num: int, seed: int = None): + def subsample(self, subsample_func, sampling_ratio: int, cache_method="block"): + """Divide each sample in the dataset into several sub-samples using a user-defined function. + The function must take a single sample as an argument and must return a list of samples. + + Arguments: + subsample_func {Callable} -- function defining how each sample should divided. + sampling_ratio {int} -- the number of sub-samples produced for each sample. + cache_method {Any} -- defines the caching method used by the subsampling operation. Possible options are {None, "block"} + + Returns: + Dataset -- a new dataset containing the subsamples. + """ + return SubsampleDataset(self, subsample_func, sampling_ratio, cache_method) + + def supersample(self, supersample_func, sampling_ratio: int) -> "Dataset": + """Combines several samples into a smaller number of samples using a user-defined function. + The function is invoked with an iterable of and must return a single sample. + + Arguments: + supersample_func {[type]} -- a function used to transform a number of samples into a single supersample + sampling_ratio {int} -- number of samples required to produce each supersample + + Returns: + [Dataset] -- a new dataset containing the supersamples + """ + return SupersampleDataset(self, supersample_func, sampling_ratio) + + def sample(self, num: int, seed: int = None) -> "Dataset": """Sample data randomly from the dataset. Arguments: @@ -499,7 +583,7 @@ def sample(self, num: int, seed: int = None): range(length), num - length ) # Supersample. return Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="sample", operation_parameters={"num": num, "seed": seed}, @@ -511,7 +595,7 @@ def filter( predicates: Optional[ Union[DataPredicate, Sequence[Optional[DataPredicate]]] ] = None, - **kwpredicates: DataPredicate + **kwpredicates: DataPredicate, ): """Filter a dataset using a predicate function. @@ -534,7 +618,7 @@ def filter( filter(lambda i: condition(self.__getitem__(i)), range(len(self._ids))) ) return Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="filter", operation_parameters={ @@ -549,7 +633,7 @@ def split_filter( predicates: Optional[ Union[DataPredicate, Sequence[Optional[DataPredicate]]] ] = None, - **kwpredicates: DataPredicate + **kwpredicates: DataPredicate, ) -> Tuple["Dataset"]: """Split a dataset using a predicate function. 
@@ -577,7 +661,7 @@ def split_filter( return tuple( [ Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="split_filter", operation_parameters={ @@ -590,7 +674,7 @@ def split_filter( ] ) - def shuffle(self, seed: int = None): + def shuffle(self, seed: int = None) -> "Dataset": """Shuffle the items in a dataset. Keyword Arguments: @@ -603,7 +687,7 @@ def shuffle(self, seed: int = None): new_ids = list(range(len(self))) random.shuffle(new_ids) return Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="shuffle", operation_parameters={"seed": seed}, @@ -640,7 +724,7 @@ def split(self, fractions: List[float], seed: int = None) -> Tuple["Dataset", .. fractions = [x if x != -1 else wildcard for x in fractions] # create shuffled list - new_ids = list(range(len(self))) + new_ids = list(list(range(len(self)))) random.shuffle(new_ids) # split according to fractions @@ -658,7 +742,7 @@ def split(self, fractions: List[float], seed: int = None) -> Tuple["Dataset", .. return tuple( [ Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="split", operation_parameters={ @@ -671,7 +755,7 @@ def split(self, fractions: List[float], seed: int = None) -> Tuple["Dataset", .. ] ) - def take(self, num: int): + def take(self, num: int) -> "Dataset": """Take the first elements of a dataset. Arguments: @@ -685,13 +769,13 @@ def take(self, num: int): new_ids = list(range(num)) return Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="take", operation_parameters={"num": num}, ) - def repeat(self, times=1, mode="itemwise"): + def repeat(self, times=1, mode="itemwise") -> "Dataset": """Repeat the dataset elements. Keyword Arguments: @@ -703,20 +787,18 @@ def repeat(self, times=1, mode="itemwise"): [type] -- [description] """ new_ids = { - "whole": lambda: [i for _ in range(times) for i in list(range(len(self)))], - "itemwise": lambda: [ - i for i in list(range(len(self))) for _ in range(times) - ], + "whole": lambda: [i for _ in range(times) for i in range(len(self))], + "itemwise": lambda: [i for i in range(len(self)) for _ in range(times)], }[mode]() return Dataset( - downstream_getter=self, + parent=self, ids=new_ids, operation_name="repeat", operation_parameters={"times": times, "mode": mode}, ) - def reorder(self, *keys: Key): + def reorder(self, *keys: Key) -> "Dataset": """Reorder items in the dataset (similar to numpy.transpose). Arguments: @@ -759,14 +841,14 @@ def item_transform_fn(item: Tuple): } return Dataset( - downstream_getter=self, + parent=self, item_transform_fn=item_transform_fn, item_names=item_names, operation_name="reorder", operation_parameters={"keys": keys}, ) - def named(self, first: Union[str, Sequence[str]], *rest: str): + def named(self, first: Union[str, Sequence[str]], *rest: str) -> "Dataset": """Set the names associated with the elements of an item. Arguments: @@ -806,10 +888,12 @@ def names(self) -> List[str]: def transform( self, fns: Optional[ - Union[ItemTransformFn, Sequence[Union[ItemTransformFn, DatasetTransformFn]]] + Union[ + ItemTransformFn, Sequence[Union[ItemTransformFn, DatasetTransformFn]], + ] ] = None, - **kwfns: DatasetTransformFn - ): + **kwfns: DatasetTransformFn, + ) -> "Dataset": """Transform the items of a dataset according to some function (passed as argument). 
@@ -842,7 +926,7 @@ def transform( if bulk: return Dataset( - downstream_getter=self, + parent=self, item_transform_fn=bulk, operation_name="transform", operation_parameters={"function": bulk.__code__}, @@ -860,7 +944,9 @@ def transform( # ========= Label transforming methods ========= - def categorical(self, key: Key, mapping_fn: Callable[[Any], int] = None): + def categorical( + self, key: Key, mapping_fn: Callable[[Any], int] = None + ) -> "Dataset": """Transform elements into categorical categoricals (int). Arguments: @@ -887,7 +973,7 @@ def one_hot( encoding_size: int = None, mapping_fn: Callable[[Any], int] = None, dtype="bool", - ): + ) -> "Dataset": """Transform elements into a categorical one-hot encoding. Arguments: @@ -921,7 +1007,7 @@ def one_hot( # ========= Conversion methods ========= # TODO: reconsider API - def image(self, *positional_flags: Any): + def image(self, *positional_flags: Any) -> "Dataset": """Transforms item elements that are either numpy arrays or path strings into a PIL.Image.Image. @@ -954,7 +1040,7 @@ def image(self, *positional_flags: Any): return self # TODO: reconsider API - def numpy(self, *positional_flags: Any): + def numpy(self, *positional_flags: Any) -> "Dataset": """Transforms elements into numpy.ndarray. Arguments: @@ -987,18 +1073,20 @@ def numpy(self, *positional_flags: Any): # ========= Composition methods ========= - def zip(self, *datasets): + def zip(self, *datasets) -> "Dataset": return zipped(self, *datasets) - def cartesian_product(self, *datasets): + def cartesian_product(self, *datasets) -> "Dataset": return cartesian_product(self, *datasets) - def concat(self, *datasets): + def concat(self, *datasets) -> "Dataset": return concat(self, *datasets) # ========= Methods relating to numpy data ========= - def reshape(self, *new_shapes: Optional[Shape], **kwshapes: Optional[Shape]): + def reshape( + self, *new_shapes: Optional[Shape], **kwshapes: Optional[Shape] + ) -> "Dataset": return _optional_argument_indexed_transform( self.shape, self.transform, @@ -1011,7 +1099,9 @@ def reshape(self, *new_shapes: Optional[Shape], **kwshapes: Optional[Shape]): # ========= Methods below assume data is an image ========= - def image_resize(self, *new_sizes: Optional[Shape], **kwsizes: Optional[Shape]): + def image_resize( + self, *new_sizes: Optional[Shape], **kwsizes: Optional[Shape] + ) -> "Dataset": return _optional_argument_indexed_transform( self.shape, self.transform, @@ -1027,7 +1117,7 @@ def image_resize(self, *new_sizes: Optional[Shape], **kwsizes: Optional[Shape]): # ========= Dataset scalers ========= - def standardize(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): + def standardize(self, key_or_keys: Union[Key, Sequence[Key]], axis=0) -> "Dataset": """Standardize features by removing the mean and scaling to unit variance Arguments: @@ -1047,7 +1137,7 @@ def standardize(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): args=_keyarg2list(self._item_names, key_or_keys, [axis]), ) - def center(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): + def center(self, key_or_keys: Union[Key, Sequence[Key]], axis=0) -> "Dataset": """Centers features by removing the mean Arguments: @@ -1070,7 +1160,6 @@ def center(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): # When people say normalize, they often mean either minmax or standardize. 
# This implementation follows the scikit-learn terminology # Not included in the library for now, because it is used very seldomly in practice - # def normalize(self, key_or_keys: Union[Key, Sequence[Key]], axis=0, norm="l2"): # """Normalize samples individually to unit norm. @@ -1095,8 +1184,8 @@ def center(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): # ) def minmax( - self, key_or_keys: Union[Key, Sequence[Key]], axis=0, feature_range=(0, 1) - ): + self, key_or_keys: Union[Key, Sequence[Key]], axis=0, feature_range=(0, 1), + ) -> "Dataset": """Transform features by scaling each feature to a given range. Arguments: @@ -1119,7 +1208,7 @@ def minmax( args=_keyarg2list(self._item_names, key_or_keys, [axis, feature_range],), ) - def maxabs(self, key_or_keys: Union[Key, Sequence[Key]], axis=0): + def maxabs(self, key_or_keys: Union[Key, Sequence[Key]], axis=0) -> "Dataset": """Scale each feature by its maximum absolute value. Arguments: @@ -1151,18 +1240,203 @@ def to_pytorch(self): return to_pytorch(self) +class SubsampleDataset(Dataset): + def __init__( + self, + dataset: Dataset, + subsample_func, + sampling_ratio: int, + cache_method: str = None, + ): + """Divide each sample in the dataset into several sub-samples using a user-defined function. + The function must take a single sample as an argument and must return a list of samples. + + Arguments: + dataset {[AbstractDataset]} -- dataset containing the samples which are sub-sampled. + subsample_func {Callable} -- function defining how each sample should divided. + sampling_ratio {int} -- the number of sub-samples produced for each sample. + cache_method {Any} -- defines the caching method used by the subsampling operation. Possible options are {None, "block"} + + Returns: + Dataset -- a new dataset containing the subsamples. + """ + if sampling_ratio < 1: + raise ValueError( + "Unable to perform subsampling, value of sampling_ratio should be greater than one." + ) + + valid_cache_methods = {"block", None} + + if cache_method not in valid_cache_methods: + raise ValueError( + "Unable to perform subsampling, cache method: {cache_methods} is invalid, possible values are {valid_cache_methods}" + ) + + new_ids = range(0, len(dataset) * sampling_ratio) + + super().__init__(dataset, ids=new_ids, operation_name="subsample") + + self._cached = {} + self._subsample_func = subsample_func + self._sampling_ratio = sampling_ratio + self._cache_method = cache_method + self._last_parent_idx = None + + @typing.overload + def __getitem__(self, idx: slice) -> List[Sample]: + ... 
+ + def __getitem__(self, idx: IdxSlice) -> Sample: + """Gets the subsample corresponding to the + + Arguments: + idx {int} -- index of the subsample + + Returns: + [Any] -- the subsample corresponding to the specified index + """ + ids_subsample = _idxSlice_to_ids(idx, len(self)) + + subsamples = [self._get_subsample(i) for i in ids_subsample] + + if isinstance(idx, int): + subsamples = subsamples[0] + + return subsamples + + def _get_subsample(self, ss_idx: int): + ds_idx = self._get_parent_idx(ss_idx) + + if self._is_subsample_cached(ss_idx): + return self._get_cached_subsample(ss_idx) + else: + + ds_sample = self._parent[ds_idx] + ss = self._subsample_func(ds_sample) + n_actual = None + # ensure that subsampling function has returned the correct value of subsamples + try: + n_actual = len(ss) + except Exception: + raise RuntimeError( + f"subsampling function returned: {n_actual}, this should be an iterable" + ) + + if n_actual != self._sampling_ratio: + raise RuntimeError( + f"subsampling function returned {n_actual} subsamples, which is different than the expected: {self._sampling_ratio}" + ) + + self._do_cache_for(ds_idx, ss) + ss_relative_idx = ss_idx % self._sampling_ratio + return ss[ss_relative_idx] + + def _get_parent_idx(self, ss_idx): + return ss_idx // self._sampling_ratio + + def _is_subsample_cached(self, ss_idx): + return self._get_parent_idx(ss_idx) in self._cached + + def _get_cached_subsample(self, ss_idx): + assert self._is_subsample_cached(ss_idx) + + ds_idx = self._get_parent_idx(ss_idx) + ss_relative_idx = ss_idx % self._sampling_ratio + return self._cached[ds_idx][ss_relative_idx] + + def _do_cache_for(self, ds_idx, ss): + """Caches the values read from the specified index of the parent data set. + + Arguments: + ds_idx {Idx} -- index of the last read parent sample + ss {Tuple[Any]} -- the items produced by subsampling the parent dataset at the specified index. + """ + + if self._cache_method is None: + return + elif self._cache_method == "block": + if ds_idx != self._last_parent_idx and self._last_parent_idx is not None: + del self._cached[self._last_parent_idx] + + self._cached[ds_idx] = ss + self._last_parent_idx = ds_idx + + +class SupersampleDataset(Dataset): + def __init__( + self, + dataset, + supersample_func, + sampling_ratio: int, + excess_samples_policy="discard", + ): + """Performs supersampling on the provided dataset. + + Arguments: + dataset {AbstractDataset} -- the dataset which the supersampling is applied to + supersample_func {Callable} -- function used to combine several samples into a single supersample. + sampling_ratio {int} -- the number of samples used to produce a each supersample. + + Keyword Arguments: + excess_samples_policy {str} -- defines how left over samples should be treated. Possible values are {"discard","error"} (default: {"discard"}) + + """ + excess_sample_policy_options = {"discard", "error"} + + if excess_samples_policy not in excess_sample_policy_options: + raise ValueError( + f"Illegal value for argument excess_samples_policy: {excess_samples_policy}, possible options are {excess_sample_policy_options}." + ) + + if sampling_ratio < 1: + raise ValueError( + f"Illegal value for argument sampling_ratio: {sampling_ratio}, this must be 1 or greater." + ) + + excess_samples = len(dataset) % sampling_ratio + if excess_samples_policy == "error" and (excess_samples != 0): + raise ValueError( + f"The specified excess sample policy: {excess_samples} does not permit left over samples, of which: {excess_samples} would exist." 
+ ) + + n_samples = len(dataset) // sampling_ratio + new_ids = range(n_samples) + + super().__init__(dataset, ids=new_ids, operation_name="supersample") + + self._supersample_func = supersample_func + self._sampling_ratio = sampling_ratio + + def __getitem__(self, idx) -> Sample: + def parent_start_stop(idx): + start = idx * self._sampling_ratio + stop = start + self._sampling_ratio + return start, stop + + ids_super = _idxSlice_to_ids(idx, len(self)) + start_stop_parent = [parent_start_stop(i) for i in ids_super] + + samples_parent = [self._parent[start:stop] for start, stop in start_stop_parent] + samples_super = [self._supersample_func(s) for s in samples_parent] + + if isinstance(idx, int): + samples_super = samples_super[0] + + return samples_super + + class StreamDataset(Dataset): def __init__( self, stream: IO, identifier: str, keep_loaded_items: bool = False ) -> None: self._last_accessed_id: int = -1 - self.__loaded_items: List[Tuple] = [] - self.__stream: IO = stream - self.keep_loaded_items: bool = keep_loaded_items + self._loaded_items: List[Tuple] = [] + self._stream: IO = stream + self._keep_loaded_items: bool = keep_loaded_items - length: int = self.__read_once() - names: List[str] = self.__read_once() + length: int = self._read_once() + names: List[str] = self._read_once() super().__init__( self, @@ -1172,65 +1446,65 @@ def __init__( item_names={n: i for i, n in enumerate(names)}, ) - self.cachable = True + self._cacheable = True @property - def allow_random_access(self) -> bool: - return self.keep_loaded_items + def _allow_random_access(self) -> bool: + return self._keep_loaded_items - def __skip_header(self): + def _skip_header(self): for i in range(2): - self.__read_once() + self._read_once() - def __read_once(self): - return dill.load(self.__stream) + def _read_once(self): + return dill.load(self._stream) - def __reset(self, clear_loaded_items: bool = False): + def _reset(self, clear_loaded_items: bool = False): self._last_accessed_id = -1 - self.__stream.seek(0) - self.__skip_header() + self._stream.seek(0) + self._skip_header() if clear_loaded_items: - self.__loaded_items.clear() + self._loaded_items.clear() - def __read_item(self): + def _read_item(self): self._last_accessed_id += 1 - item = self.__read_once() + item = self._read_once() - if self.keep_loaded_items: - self.__loaded_items.append(item) + if self._keep_loaded_items: + self._loaded_items.append(item) if self._last_accessed_id + 1 == len(self): - self.__reset() + self._reset() return item - def __getitem__(self, i: int) -> Tuple: + def __getitem__(self, i: int) -> Sample: - if len(self.__loaded_items) > i: - return self.__loaded_items[i] + if len(self._loaded_items) > i: + return self._loaded_items[i] else: is_next = i == (self._last_accessed_id + 1) if is_next: - item = self.__read_item() + item = self._read_item() return item - elif self.allow_random_access: + elif self._allow_random_access: item = () while (self._last_accessed_id < i) and not ( i == len(self) - 1 and self._last_accessed_id == -1 ): - item = self.__read_item() + item = self._read_item() return item else: raise Exception("Random access is not allowed") def close(self): - self.__stream.close() + self._stream.close() # ========= Handy decorators ========= @@ -1264,7 +1538,7 @@ def item_transform_fn(item: Sequence): ] return Dataset( - downstream_getter=ds, + parent=ds, item_transform_fn=item_transform_fn, operation_name=operation_name, operation_parameters={**operation_parameters, "idx": idx}, @@ -1302,7 +1576,7 @@ def 
item_transform_fn(item: Sequence): ] return Dataset( - downstream_getter=ds, + parent=ds, item_transform_fn=item_transform_fn, operation_name=operation_name, operation_parameters={**operation_parameters, "idx": idx}, @@ -1405,7 +1679,8 @@ def fn(x): def _custom( - elem_transform_fn: Callable[[Any], Any], elem_check_fn: Callable[[Any], None] = None + elem_transform_fn: Callable[[Any], Any], + elem_check_fn: Callable[[Any], None] = None, ) -> DatasetTransformFn: """Create a user defined transform. @@ -1608,7 +1883,6 @@ def make_fn(ds, idx) -> Callable: # When people say normalize, they often mean either minmax or standardize. # This implementation follows the scikit-learn terminology # Not included in the library for now, because it is used very seldomly in practice - # def normalize(axis=0, norm="l2") -> DatasetTransformFn: # """Normalize samples individually to unit norm. @@ -1690,19 +1964,58 @@ def make_fn(ds, idx) -> Callable: @_warn_no_args(skip=1) def zipped(*datasets: AbstractDataset): comp = compose.ZipDataset(*datasets) - return Dataset(downstream_getter=comp, ids=comp._ids, operation_name="copy") + return Dataset(parent=comp, ids=comp._ids, operation_name="copy") @_warn_no_args(skip=1) def cartesian_product(*datasets: AbstractDataset): comp = compose.CartesianProductDataset(*datasets) - return Dataset(downstream_getter=comp, ids=comp._ids, operation_name="copy") + return Dataset(parent=comp, ids=comp._ids, operation_name="copy") @_warn_no_args(skip=1) def concat(*datasets: AbstractDataset): comp = compose.ConcatDataset(*datasets) - return Dataset(downstream_getter=comp, ids=comp._ids, operation_name="copy") + return Dataset(parent=comp, ids=comp._ids, operation_name="copy") + + +# ========= Sampling =================== +def subsample( + dataset, subsample_func, sampling_ratio: int, cache_method="block" +) -> Dataset: + """Divide each sample in the dataset into several sub-samples using a user-defined function. + The function must take a single sample as an argument and must return a list of samples. + + Arguments: + dataset {[type]} -- dataset containing the samples which are sub-sampled. + subsample_func {Callable} -- function defining how each sample should divided. + sampling_ratio {int} -- the number of sub-samples produced for each sample. + cache_method {Any} -- defines the caching method used by the subsampling operation. Possible options are {None, "block"} + + Returns: + Dataset -- a new dataset containing the subsamples. + """ + + return SubsampleDataset(dataset, subsample_func, sampling_ratio, cache_method) + + +def supersample( + dataset, supersample_func, sampling_ratio: int, excess_samples_policy="discard" +) -> Dataset: + """Performs supersampling on the provided dataset. + + Arguments: + dataset {AbstractDataset} -- the dataset which the supersampling is applied to + supersample_func {Callable} -- function used to combine several samples into a single supersample. + sampling_ratio {int} -- the number of samples used to produce a each supersample. + + Keyword Arguments: + excess_samples_policy {str} -- defines how left over samples should be treated. 
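As a rough usage sketch of the two new sampling operations (mirroring the tests added later in this patch, and assuming the `subsample`/`supersample` methods that those tests exercise on `Dataset`, together with the `from_iterable` loader):

    from datasetops.loaders import from_iterable

    # subsample: each parent sample is divided into `sampling_ratio` sub-samples
    ds = from_iterable([1, 2]).subsample(lambda s: (s, s), 2)
    assert list(ds) == [1, 1, 2, 2]   # 2 samples become 4 sub-samples

    # supersample: `sampling_ratio` consecutive samples are combined into one
    ds = from_iterable([1, 2, 3, 4]).supersample(sum, 2)
    assert list(ds) == [3, 7]         # 4 samples become 2 super-samples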
Possible values are {"discard","error"} (default: {"discard"}) + """ + + return SupersampleDataset( + dataset, supersample_func, sampling_ratio, excess_samples_policy + ) # ========= Converters ========= @@ -1749,7 +2062,7 @@ def to_tensorflow(dataset: Dataset): import tensorflow as tf # type:ignore ds = Dataset( - downstream_getter=dataset, + parent=dataset, item_transform_fn=_tf_item_conversion, operation_name="transform", ) diff --git a/src/datasetops/loaders.py b/src/datasetops/loaders.py index 932d50f..e1f39b7 100644 --- a/src/datasetops/loaders.py +++ b/src/datasetops/loaders.py @@ -3,22 +3,24 @@ Additionally, the module provides adapters for the dataset types used by various ML frameworks. """ +from collections import namedtuple +import os from pathlib import Path -from datasetops.dataset import zipped -from datasetops.abstract import ItemGetter -from scipy.io import loadmat -from datasetops.dataset import Dataset -from datasetops.types import AnyPath, Data -from typing import Callable, Any, Optional, Union, List, Dict, Tuple -import numpy as np import re +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple import warnings +from datasetops.dataset import Dataset, zipped +from datasetops.types import AnyPath, ItemGetter +import numpy as np +from scipy.io import loadmat + class Loader(Dataset): def __init__( self, getdata: Callable[[Any], Any], + ids: Sequence[Any], identifier: Optional[str] = None, name: str = None, ): @@ -31,20 +33,49 @@ class Getter(ItemGetter): def __getitem__(self, i: int): return getdata(i) + def __len__(self): + return len(ids) + super().__init__( - downstream_getter=Getter(), + parent=Getter(), name=name, + ids=ids, operation_name="load", operation_parameters={"identifier": identifier}, ) - self.cachable = self.identifier is not None + self._cacheable = self.identifier is not None - def append(self, identifier: Data): - self._ids.append(identifier) - def extend(self, ids: Union[List[Data], np.ndarray]): - self._ids.extend(list(ids)) +def from_iterable(iterable: Iterable, identifier: str = None) -> Dataset: + """Creates a new dataset from the elements of the sequence. + + An iterable must must implement implement at least one of the following + functions: + __next__ or __getitem__ + + Arguments: + iterable {Iterable} -- an iterable containing the samples + identifier {Optional[str]} -- unique identifier + + Returns: + AbstractDataset -- a new dataset containing the elements of the iterable. 
+ """ + + """ + https://nelsonslog.wordpress.com/2016/04/06/python3-no-len-for-iterators/ + https://gist.github.com/NelsonMinar/90212fbbfc6465c8e263341b86aa01a8 + It appears the most effective way of getting length of a iterable is to + convert it to a tuple or list""" + + itr = [i for i in iterable] + + def getter(idx): + nonlocal itr + return itr[idx] + + ldr = Loader(getdata=getter, ids=range(len(itr)), identifier=identifier) + return ldr def from_pytorch(pytorch_dataset, identifier: Optional[str] = None): @@ -64,8 +95,7 @@ def get_data(i) -> Tuple: item = pytorch_dataset[i] return tuple([x.numpy() if hasattr(x, "numpy") else x for x in item]) - ds = Loader(get_data, identifier) - ds.extend(list(range(len(pytorch_dataset)))) + ds = Loader(get_data, ids=range(len(pytorch_dataset)), identifier=identifier) return ds @@ -87,10 +117,12 @@ def from_tensorflow(tf_dataset, identifier: Optional[str] = None): if not tf.executing_eagerly(): raise AssertionError( - "Tensorflow must be executing eagerly for `from_tensorflow` to work" + """Tensorflow must be executing eagerly + for `from_tensorflow` to work""" ) - # We could create a mem which is filled up gradually, when samples are needed. However, then we would the to get the number of samples as a parameter + # We could create a mem which is filled up gradually, when samples are needed. + # However, then we would the to get the number of samples as a parameter # The latency using this solution seems to be acceptable tf_ds = list(tf_dataset) @@ -114,8 +146,7 @@ def get_data(i) -> Tuple: ) return item - ds = Loader(get_data, identifier) - ds.extend(list(range(len(tf_ds)))) + ds = Loader(get_data, ids=range(len(tf_ds)), identifier=identifier) if type(keys[0]) == str: ds = ds.named([str(k) for k in keys]) return ds @@ -124,9 +155,11 @@ def get_data(i) -> Tuple: def from_folder_data(path: AnyPath) -> Dataset: """Load data from a folder with the data structure: - folder - ├ sample1.jpg - ├ sample2.jpg + .. code-block:: + + folder + ├ sample1.jpg + ├ sample2.jpg Arguments: path {AnyPath} -- path to folder @@ -137,31 +170,36 @@ def from_folder_data(path: AnyPath) -> Dataset: """ p = Path(path) + full_p = str(p.absolute()) ids = [str(x.relative_to(p)) for x in p.glob("[!._]*")] - def get_data(i) -> Tuple: + def getdata(i) -> Tuple: nonlocal p return (str(p / i),) ds = Loader( - get_data, str(path), "Data Getter for folder with structure 'root/data'" + getdata, + ids, + full_p, + "Data Getter for folder with structure 'root/data' and path '{}'".format( + full_p + ), ) - ds.extend(ids) return ds def from_folder_class_data(path: AnyPath) -> Dataset: - u"""Load data from a folder with the data structure: + """Load data from a folder with the data structure: - ``` - data - ├── class1 - │ ├── sample1.jpg - │ └── sample2.jpg - └── class2 - ****└── sample3.jpg - ``` + .. code-block:: + + data + ├── class1 + │ ├── sample1.jpg + │ └── sample2.jpg + └── class2 + └── sample3.jpg Arguments: path {AnyPath} -- path to nested folder @@ -171,33 +209,41 @@ def from_folder_class_data(path: AnyPath) -> Dataset: e.g. 
('nested_folder/class1/sample1.jpg', 'class1') """ p = Path(path) + full_p = str(p.absolute()) classes = [x for x in p.glob("[!._]*")] def get_data(i) -> Tuple: nonlocal p return (str(p / i), re.split(r"/|\\", i)[0]) + ids = [] + for c in classes: + ids.extend([str(x.relative_to(p)) for x in c.glob("[!._]*")]) + ds = Loader( - get_data, str(path), "Data Getter for folder with structure 'root/classes/data'" + get_data, + ids, + full_p, + "Data Getter for folder with structure 'root/classes/data' and path '{}'".format( + full_p + ), ) - for c in classes: - ids = [str(x.relative_to(p)) for x in c.glob("[!._]*")] - ds.extend(ids) - return ds def from_folder_group_data(path: AnyPath) -> Dataset: - u"""Load data from a folder with the data structure: + """Load data from a folder with the data structure: + + .. code-block:: - data - ├── group1 - │   ├── sample1.jpg - │   └── sample2.jpg - └── group2 - ....├── sample1.jpg - ....└── sample2.jpg + data + ├── group1 + │   ├── sample1.jpg + │   └── sample2.jpg + └── group2 + ├── sample1.jpg + └── sample2.jpg Arguments: path {AnyPath} -- path to nested folder @@ -225,17 +271,17 @@ def from_folder_group_data(path: AnyPath) -> Dataset: def from_folder_dataset_class_data(path: AnyPath) -> List[Dataset]: """Load data from a folder with the data structure: - ``` - data - ├── dataset1 - │   ├── class1 - │   │   ├── sample1.jpg - │   │   └── sample2.jpg - │   └── class2 - │   └── sample3.jpg - └── dataset2 - ****└── sample3.jpg - ``` + .. code-block:: + + data + ├── dataset1 + │   ├── class1 + │   │   ├── sample1.jpg + │   │   └── sample2.jpg + │   └── class2 + │   └── sample3.jpg + └── dataset2 + └── sample3.jpg Arguments: path {AnyPath} -- path to nested folder @@ -321,9 +367,8 @@ def get_labelled_data(idx: int): get_data = get_labelled_data if label_key else get_unlabelled_data - ds = Loader(get_data, identifier, name=name) - # populate data getter + ids = [] if label_key: unique_labels = np.unique(reshaped_data[label_key]) @@ -332,9 +377,11 @@ def get_labelled_data(idx: int): condition=reshaped_data[label_key].squeeze() == lbl, arr=reshaped_data[label_key].squeeze(), ) - ds.extend(lbl_inds) + ids.extend(lbl_inds) else: - ds.extend(list(range(common_shape))) + ids.extend(list(range(common_shape))) + + ds = Loader(get_data, ids, identifier, name=name) return ds @@ -402,3 +449,176 @@ def from_mat_single_mult_data(path: AnyPath) -> List[Dataset]: ) return sorted(datasets, key=lambda d: d.name) + + +def from_csv( + path: AnyPath, + load_func=lambda path, data: data, + predicate_func=lambda path: True, + data_format="tuple", + **kwargs, +): + """Load data stored as comma-separated values (CSV). + The csv-data can be stored as either a single file or in several smaller + files stored in a tree structure. + + Information from the path of the individual CSV files can be incorporated + through a user-defined function. + The function is invoked with the path to the CSV files and its contents, + and must return a new sample. + + Additionally, specific files may be skipped by supplying a predicate function. + This function is invoked with the path of each file. + + Arguments: + path {AnyPath} -- path to either a single csv file or a directory containing CSV files. + + Keyword Arguments: + load_func {Callable} -- optional user-defined function called with the path and contents of each CSV file. (default: {None}) + predicate_func {Callable} -- optional predicate function used to define files to be skipped. 
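As an illustrative sketch (the paths refer to the `csv/cars` test resources added by this patch, and the `car` field pulled from the file path is purely an example of what `load_func` might add): `load_func` receives each file's path together with its parsed contents, while `predicate_func` decides which files are read at all:

    from datasetops.loaders import from_csv

    def add_car_id(path, data):
        # with the default data_format="tuple", `data` is a tuple of column value lists
        return (path.parent.name, *data)

    ds = from_csv(
        "tests/resources/csv/cars",
        load_func=add_car_id,
        predicate_func=lambda p: p.suffix == ".csv",
    )
    car, speed, vibration = ds[0]  # e.g. ("car_1", [1, 2, 3], [0.5, 1.0, 1.5])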
(default: {None}) + data_format {bool} -- defines how the data read from the csv is formatted. Possible options are {"tuple", "dataframe"} + kwargs {Any} -- additional arguments passed to pandas read_csv function + + Examples: + + Consider the example below: + + .. code-block:: + + cars + ├── car_1 + │   ├── load_1000.csv + │   └── load_2000.csv + └── car_2 + ├── load_1000.csv + └── load_2000.csv + + """ + p = Path(path) + + formats = {"tuple", "numpy", "dataframe"} + + if data_format not in formats: + raise ValueError( + f"Unable to load the dataset from CSV, the specified data fromat : {data_format} is not recognized. Options are: {formats}" + ) + + # read csv using pandas + # if specified the dataframe is converted to a tuple of numpy arrays. + def read_single_csv(path): + nonlocal kwargs + data = _read_single_csv(path, data_format, kwargs) + return load_func(path, data) + + if p.is_file(): + ds = from_files_list([p], read_single_csv) + + elif p.is_dir(): + ds = from_recursive_files(p, read_single_csv, predicate_func) + else: + raise ValueError( + f"Unable to load the dataset from CSV, the supplied path: {p} is neither a file or directory" + ) + + return ds + + +def _read_single_csv(path: Path, data_format, kwargs): + """Read the contents of the specified csv file and format the results + + Arguments: + path {Path} -- path to a csv file + data_format {[str]} -- string defining how the data should be formatted. + + Returns: + [Any] -- data from the csv file in the specified format. + """ + import pandas as pd + + data: pd.DataFrame = pd.read_csv(path, **kwargs) + # convert dataframe to + if data_format == "tuple": + # try to create named tuple, otherwise create plain tuple + try: + Row = namedtuple("Row", data.columns) + data = Row(*data.to_numpy().T.tolist()) + data = tuple(data) + except Exception: + data = tuple(data.to_numpy().T.tolist()) + elif data_format == "numpy": + data = data.to_numpy() + + return data + + +def from_files_list(files: Sequence[AnyPath], load_func: Callable[[AnyPath], Any]): + """Reads a list of files using by invoking a user-defined function on each file. + The function is invoked with the path of each file and must return a new sample. + + Arguments: + files {[Iterable]} -- an list of files to load + load_func {[type]} -- function invoked with the path to each file to produce a sample. + + Returns: + Dataset -- The resulting dataset. + """ + + ids_to_file = {idx: f for idx, f in enumerate(files)} + + def get_data(i): + nonlocal ids_to_file + nonlocal load_func + sample = load_func(ids_to_file[i]) + return sample + + ds = Loader( + get_data, + ids=list(ids_to_file.keys()), + name="files_list", + identifier=str(hash(frozenset(ids_to_file.items()))), + ) + return ds + + +def from_recursive_files( + root: AnyPath, load_func: Callable[[AnyPath], Any], predicate_func=lambda x: True +) -> Dataset: + """Provides functionality to load files stored in a tree structure in a recursively in a generic manner. + A callback function must be specified which is invoked with the path of each file. + When called this function should return a sample corresponding to the contents of the file. + Specific files may be skipped by supplying a predicate function. + + Arguments: + root {AnyPath} -- Path to the root directory + load_func {[type]} -- Function invoked with the path of each file. + predicate_func {[type]} -- Predicate function determining + + Returns: + Dataset -- The resulting dataset. + + Examples: + + Consider the file structure shown below: + + .. 
code-block:: + + patients + ├── control + │   ├── somefile.csv + │   ├── subject_a.txt + │   └── subject_b.txt + └── experimental + ├── subject_c.txt + └── subject_d.txt + """ + root_dir = Path(root) + + # find all files matching predicate function + matches = [] + for root, _, files in os.walk(root_dir): + for f in files: + p = Path(root) / f + if predicate_func(p): + matches.append(p) + + return from_files_list(matches, load_func) diff --git a/src/datasetops/types.py b/src/datasetops/types.py index 0b7318c..ca2fd84 100644 --- a/src/datasetops/types.py +++ b/src/datasetops/types.py @@ -1,15 +1,25 @@ -from typing import Callable, Dict, Sequence, Union, Any, List -from datasetops.abstract import AbstractDataset +from abc import ABC, abstractmethod from pathlib import Path +from typing import Any, Callable, Dict, List, Sequence, Tuple, Union +import typing + +from datasetops.transformation_graph import TransformationGraph Shape = Sequence[int] IdIndex = int Id = int -Ids = List[Id] +Ids = Sequence[Id] + +"""Represents a single index or a slice""" +IdxSlice = Union[int, slice] + +"""Represents a sample from a dataset""" +Sample = Tuple + Data = Any -IdIndexSet = Dict[Any, List[IdIndex]] ItemTransformFn = Callable[[Any], Any] -DatasetTransformFn = Callable[[int, AbstractDataset], AbstractDataset] +DatasetTransformFn = Callable[[int, "AbstractDataset"], "AbstractDataset"] + DatasetTransformFnCreator = Union[ Callable[[], DatasetTransformFn], Callable[[Any], DatasetTransformFn] ] @@ -19,3 +29,68 @@ """Something""" ItemNames = Dict[str, int] + + +class ItemGetter(ABC): + """Abstract base class implemented by classes that implement + an index based get method + """ + + @abstractmethod + def __getitem__(self, i: int) -> Tuple: + pass # pragma: no cover + + @abstractmethod + def __len__(self): + pass # pragma: no cover + + +class AbstractDataset(ItemGetter): + """Abstract base class defining a generic dataset interface.""" + + def __init__(self): + pass # pragma: no cover + + name = "" + shape = None + _cacheable = False + _origin = None + + @abstractmethod + def __len__(self) -> int: + """Return the total number of elements in the dataset.""" + pass # pragma: no cover + + @typing.overload + def __getitem__(self, idx: slice) -> List[Sample]: + ... + + @abstractmethod + def __getitem__(self, idx: int) -> Sample: + """Returns the element at the specified index. + + Arguments: + idx {int} -- the index from which to read the sample. 
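A small sketch of how these access patterns compose in practice (using the `from_iterable` loader from this patch; the values are only illustrative):

    from datasetops.loaders import from_iterable

    ds = from_iterable(["a", "b", "c"])

    ds[0]      # integer index -> a single sample: "a"
    ds[0:2]    # slice -> a list of samples: ["a", "b"]
    list(ds)   # __iter__ visits every index in order

    # `generator` wraps iteration in a zero-argument callable
    g = ds.generator
    assert list(g()) == ["a", "b", "c"]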
+ + Returns: + Tuple -- A tuple representing the sample + """ + pass # pragma: no cover + + def get_transformation_graph(self) -> TransformationGraph: + """Returns TransformationGraph of current dataset + """ + return TransformationGraph(self) + + def __iter__(self): + for i in range(self.__len__()): + + yield self.__getitem__(i) + + @property + def generator(self,): + def g(): + for d in self: + yield d + + return g diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/datasetops_tests/test_caching.py b/tests/datasetops_tests/test_caching.py index 18f9a46..7ef08d9 100644 --- a/tests/datasetops_tests/test_caching.py +++ b/tests/datasetops_tests/test_caching.py @@ -52,34 +52,34 @@ def are_same(a, b): cached2.close() -def test_cachable(): +def test_cacheable(): path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATASET_GROUP_DATA) test, _ = loaders.from_folder_dataset_group_data(path) - assert test.cachable + assert test._cacheable test = test.image(False, True, False) - assert test.cachable + assert test._cacheable test = test.transform((read_text, None, None)) - assert test.cachable + assert test._cacheable test = test.transform((None, None, read_bin)) - assert test.cachable + assert test._cacheable test = test.image_resize(None, (10, 10), None) - assert test.cachable + assert test._cacheable test1, test2 = test.split([0.3, -1]) - assert not test1.cachable - assert not test2.cachable + assert not test1._cacheable + assert not test2._cacheable test3, test4 = test.split([0.3, -1], 2605) - assert test3.cachable - assert test4.cachable + assert test3._cacheable + assert test4._cacheable unidentified = from_dummy_numpy_data() - assert not unidentified.cachable + assert not unidentified._cacheable def test_cache(): diff --git a/tests/datasetops_tests/test_datasets.py b/tests/datasetops_tests/test_datasets.py index 05e9912..de5e1ad 100644 --- a/tests/datasetops_tests/test_datasets.py +++ b/tests/datasetops_tests/test_datasets.py @@ -1,4 +1,10 @@ from typing import Sequence + +import pytest +import numpy as np +from PIL import Image + +from datasetops.loaders import from_csv, from_iterable from datasetops.dataset import ( image_resize, reshape, @@ -9,9 +15,6 @@ _DEFAULT_SHAPE, ) import datasetops.loaders as loaders -import pytest -import numpy as np -from PIL import Image from .testing_utils import ( get_test_dataset_path, from_dummy_data, @@ -32,17 +35,22 @@ def test_generator(): def test_shuffle(): - seed = 42 + + # empty set + ds = loaders.Loader(lambda i: i, []) + ds.shuffle() + assert len(ds) == 0 + ds = from_dummy_data() - expected_items = [i for i in ds] - ds_shuffled = ds.shuffle(seed) - found_items = [i for i in ds_shuffled] - # same data - assert set(expected_items) == set(found_items) + # no seed + ds_shuffled = ds.shuffle() + assert set(ds) == set(ds_shuffled) # same data + assert list(ds) != list(ds_shuffled) # different sequence - # different sequence - assert expected_items != found_items + ds_shuffled = ds.shuffle(seed=42) + assert set(ds) == set(ds_shuffled) # same data + assert list(ds) != list(ds_shuffled) # different sequence def test_sample(): @@ -279,6 +287,162 @@ def test_reorder(): ) # key needs to be unique, but wouldn't be +def test_getitem(): + n = 5 + itr_int = list(range(n)) + itr_tuple = [(i,) for i in itr_int] + itr_str = [str(i) for i in itr_int] + + def do_test(itr): + ds = from_iterable(itr) + + # index access + for i in range(n): + assert ds[i] == itr[i] + + with pytest.raises(IndexError): + ds[n + 1] + + 
# slice access + for i in range(n): + for j in range(n): + for s in range(n): + assert ds[i:j:n] == itr[i:j:n] + + assert ds[:] == itr[:] + + do_test(itr_int) + do_test(itr_tuple) + do_test(itr_str) + + +class TestSubsample: + cars = from_csv(get_test_dataset_path(DATASET_PATHS.CSV + "/cars")) + + def test_subsample(self): + def func(s): + return (s, s) + + assert len(self.cars) == 4 + ds = TestSubsample.cars.subsample(func, 2) + assert len(ds) == 8 + + s = ds[0] + assert len(s[0]) == 3 + assert len(s[1]) == 3 + + def test_invalid_subsample_func(self): + + # incorrect number of subsamples returned + def func1(s): + return (s,) + + with pytest.raises(RuntimeError): + ds = TestSubsample.cars.subsample(func1, 3) + _ = ds[0] + + # invalid subsample returned + def func2(s): + return None + + with pytest.raises(RuntimeError): + ds = TestSubsample.cars.subsample(func2, 2) + _ = ds[0] + + def test_invalid_nsamples(self): + def func(s): + return s + + with pytest.raises(ValueError): + TestSubsample.cars.subsample(func, 0) + + with pytest.raises(ValueError): + TestSubsample.cars.subsample(func, -1) + + def test_caching(self): + + cnt = 0 + + def func(s): + nonlocal cnt + cnt += 1 + return (s, s) + + """no caching, every time a subsample is read + the parent sample is read as well""" + ds = TestSubsample.cars.subsample(func, 2, cache_method=None) + ds[0] + ds[1] + assert cnt == 2 + + """block caching, store the subsamples of produced + from the last read of the parent sample. In this case + each sample produces 2 subsamples. As such reading idx 0 and 1 + should result in one read. Reading beyond this will case another read. + Going back again the sample 0 and 1 should now be cleared + """ + cnt = 0 + ds = TestSubsample.cars.subsample(func, 2, cache_method="block") + ds[0] + ds[1] + assert cnt == 1 + ds[2] + assert cnt == 2 + ds[0] + assert cnt == 3 + ds[1] + assert cnt == 3 + + def test_getitem(self): + def func(s): + return s, s + + ds = from_iterable([1, 2]).subsample(func, 2) + + # index + assert ds[0] == 1 + assert ds[1] == 1 + assert ds[2] == 2 + assert ds[3] == 2 + + with pytest.raises(IndexError): + ds[4] + + # slicing + assert ds[:] == [1, 1, 2, 2] + assert ds[0:1] == [1] + assert ds[1:2] == [1] + assert ds[0:3] == [1, 1, 2] + assert ds[4:] == [] + + +class TestSupersample: + def test_supersample(self): + + ds = from_iterable([1, 2, 3, 4]) + assert len(ds) == 4 + + ds = ds.supersample(sum, 2) + assert len(ds) == 2 + assert ds[0] == 3 + assert ds[1] == 7 + + def test_getitem(self): + + ds = from_iterable([1, 2, 3, 4]).supersample(sum, 2) + + # index + assert ds[0] == 3 + assert ds[1] == 7 + with pytest.raises(IndexError): + ds[2] + + assert ds[0:1] == [3] + assert ds[0:2] == [3, 7] + assert ds[:] == [3, 7] + assert ds[3:] == [] + + # ========= Tests relating to stats ========= @@ -314,11 +478,11 @@ def get_data(i): return i, i # no shape yet - ds = loaders.Loader(get_data) + ds = loaders.Loader(get_data, ids=[]) assert ds.shape == _DEFAULT_SHAPE # shape given - ds.append(1) + ds = loaders.Loader(get_data, ids=[1]) assert ds.shape == (_DEFAULT_SHAPE, _DEFAULT_SHAPE) # numpy data diff --git a/tests/datasetops_tests/test_loaders.py b/tests/datasetops_tests/test_loaders.py index 5fdd674..885c25c 100644 --- a/tests/datasetops_tests/test_loaders.py +++ b/tests/datasetops_tests/test_loaders.py @@ -1,198 +1,298 @@ -import datasetops.loaders as loaders -import random -from pathlib import Path -from .testing_utils import ( - get_test_dataset_path, - DATASET_PATHS, - RESOURCES_PATH, -) -import numpy as 
np -import pytest - -# tests ########################## - - -def test_folder_data(): - path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATA) - - expected_items = [ - str(Path(path) / "frame_000{}.jpg".format(i)) for i in range(1, 7) - ] - - ds = loaders.from_folder_data(path) - found_items = [i[0] for i in ds] - - assert set(expected_items) == set(found_items) - - -def test_folder_class_data(): - path = get_test_dataset_path(DATASET_PATHS.FOLDER_CLASS_DATA) - - expected_items = [str(p) for p in Path(path).glob("*/*.jpg")] - - ds = loaders.from_folder_class_data(path) - found_items = [i[0] for i in ds] - - assert set(expected_items) == set(found_items) - - -def test_folder_group_data(): - path = get_test_dataset_path(DATASET_PATHS.FOLDER_GROUP_DATA) - - expected_items = [str(p) for p in (Path(path)).glob("*/*.*")] - ds = loaders.from_folder_group_data(path) - - assert set(ds.names) == set(["calib", "label_2", "image_2", "velodyne"]) - - found_items = [] - - for i in ds: - for q in i: - found_items.append(q) - - assert set(expected_items) == set(found_items) - - -def test_folder_dataset_class_data(): - path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATASET_CLASS_DATA) - sets = Path(path).glob("[!._]*") - - sets_of_expected_items = [ - set([str(p) for p in Path(s).glob("*/*.jpg")]) for s in sets - ] - - datasets = loaders.from_folder_dataset_class_data(path) - sets_of_found_items = [set([i[0] for i in ds]) for ds in datasets] - - for expected_items_set in sets_of_expected_items: - assert any( - [expected_items_set == found_items for found_items in sets_of_found_items] - ) - - -def test_folder_dataset_group_data(): - path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATASET_GROUP_DATA) - sets = Path(path).glob("[!._]*") - - sets_of_expected_items = [ - set([str(p) for p in Path(s).glob("*/*.*")]) for s in sets - ] - - datasets = loaders.from_folder_dataset_group_data(path) - - assert set(datasets[0].names) == set(["calib", "image_2", "velodyne"]) - assert set(datasets[1].names) == set(["calib", "label_2", "image_2", "velodyne"]) - - def get_data_flat(ds): - found_items = [] - - for i in ds: - for q in i: - found_items.append(q) - - return set(found_items) - - sets_of_found_items = [get_data_flat(ds) for ds in datasets] - - for expected_items_set in sets_of_expected_items: - assert any( - [expected_items_set == found_items for found_items in sets_of_found_items] - ) - - -def test_mat_single_with_multi_data(): - path = get_test_dataset_path(DATASET_PATHS.MAT_SINGLE_WITH_MULTI_DATA) - - datasets = loaders.from_mat_single_mult_data(path) - - for ds in datasets: - # check dataset sizes and names - if ds.name == "src": - assert len(ds) == 2000 - elif ds.name == "tar": - assert len(ds) == 1800 - else: - assert False - - # randomly check some samples for their dimension - ids = random.sample(range(len(ds)), 42) - for i in ids: - data, label = ds[i] - - assert data.shape == (256,) - assert int(label) in range(10) - - -@pytest.mark.slow -def test_pytorch(): - import torchvision - import torch - from torch.utils.data import Dataset as TorchDataset - - mnist = torchvision.datasets.MNIST( - str(RESOURCES_PATH), - train=True, - transform=None, - target_transform=None, - download=True, - ) - mnist_item = mnist[0] - ds_mnist = loaders.from_pytorch(mnist) - ds_mnist_item = ds_mnist[0] - # nothing to convert, items equal - assert mnist_item == ds_mnist_item - - class PyTorchDataset(TorchDataset): - def __len__(self,): - return 5 - - def __getitem__(self, idx): - return (torch.Tensor([idx, idx]), idx) # 
type:ignore - - torch_ds = PyTorchDataset() - ds_torch = loaders.from_pytorch(torch_ds) - - # tensor type in torch dataset - assert torch.all(torch.eq(torch_ds[0][0], torch.Tensor([0, 0]))) # type:ignore - - # numpy type in ours - assert np.array_equal(ds_torch[0][0], (np.array([0, 0]))) # type:ignore - - # labels are the same - assert torch_ds[0][1] == ds_torch[0][1] == 0 - - -@pytest.mark.slow -def test_tfds(): - import tensorflow as tf - import tensorflow_datasets as tfds - - # basic tf.data.Dataset - - tf_123 = tf.data.Dataset.from_tensor_slices([1, 2, 3]) - ds_123 = loaders.from_tensorflow(tf_123) - - for t, d in zip(list(tf_123), list(ds_123)): - assert t.numpy() == d[0] - - # from TFDS - tf_mnist = tfds.load("mnist", split="test") - - ds_mnist = loaders.from_tensorflow(tf_mnist) - - mnist_item = next(iter(tf_mnist)) - ds_mnist_item = ds_mnist[0] - - assert np.array_equal(mnist_item["image"].numpy(), ds_mnist_item[0]) - assert np.array_equal(mnist_item["label"].numpy(), ds_mnist_item[1]) - - # also works for 'as_supervised' - tf_mnist = tfds.load("mnist", split="test", as_supervised=True) - - ds_mnist = loaders.from_tensorflow(tf_mnist) - - mnist_item = next(iter(tf_mnist)) - ds_mnist_item = ds_mnist[0] - - assert np.array_equal(mnist_item[0].numpy(), ds_mnist_item[0]) - assert np.array_equal(mnist_item[1].numpy(), ds_mnist_item[1]) +from collections import namedtuple + +import datasetops.loaders as loaders +from datasetops.loaders import from_iterable +import random +from pathlib import Path +from .testing_utils import ( + get_test_dataset_path, + DATASET_PATHS, + RESOURCES_PATH, +) +import numpy as np +import pytest + + +def test_folder_data(): + path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATA) + + expected_items = [ + str(Path(path) / "frame_000{}.jpg".format(i)) for i in range(1, 7) + ] + + ds = loaders.from_folder_data(path) + found_items = [i[0] for i in ds] + + assert set(expected_items) == set(found_items) + + +def test_folder_class_data(): + path = get_test_dataset_path(DATASET_PATHS.FOLDER_CLASS_DATA) + + expected_items = [str(p) for p in Path(path).glob("*/*.jpg")] + + ds = loaders.from_folder_class_data(path) + found_items = [i[0] for i in ds] + + assert set(expected_items) == set(found_items) + + +def test_folder_group_data(): + path = get_test_dataset_path(DATASET_PATHS.FOLDER_GROUP_DATA) + + expected_items = [str(p) for p in (Path(path)).glob("*/*.*")] + ds = loaders.from_folder_group_data(path) + + assert set(ds.names) == set(["calib", "label_2", "image_2", "velodyne"]) + + found_items = [] + + for i in ds: + for q in i: + found_items.append(q) + + assert set(expected_items) == set(found_items) + + +def test_folder_dataset_class_data(): + path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATASET_CLASS_DATA) + sets = Path(path).glob("[!._]*") + + sets_of_expected_items = [ + set([str(p) for p in Path(s).glob("*/*.jpg")]) for s in sets + ] + + datasets = loaders.from_folder_dataset_class_data(path) + sets_of_found_items = [set([i[0] for i in ds]) for ds in datasets] + + for expected_items_set in sets_of_expected_items: + assert any( + [expected_items_set == found_items for found_items in sets_of_found_items] + ) + + +def test_folder_dataset_group_data(): + path = get_test_dataset_path(DATASET_PATHS.FOLDER_DATASET_GROUP_DATA) + sets = Path(path).glob("[!._]*") + + sets_of_expected_items = [ + set([str(p) for p in Path(s).glob("*/*.*")]) for s in sets + ] + + datasets = loaders.from_folder_dataset_group_data(path) + + assert set(datasets[0].names) == set(["calib", 
"image_2", "velodyne"]) + assert set(datasets[1].names) == set(["calib", "label_2", "image_2", "velodyne"]) + + def get_data_flat(ds): + found_items = [] + + for i in ds: + for q in i: + found_items.append(q) + + return set(found_items) + + sets_of_found_items = [get_data_flat(ds) for ds in datasets] + + for expected_items_set in sets_of_expected_items: + assert any( + [expected_items_set == found_items for found_items in sets_of_found_items] + ) + + +def test_mat_single_with_multi_data(): + path = get_test_dataset_path(DATASET_PATHS.MAT_SINGLE_WITH_MULTI_DATA) + + datasets = loaders.from_mat_single_mult_data(path) + + for ds in datasets: + # check dataset sizes and names + if ds.name == "src": + assert len(ds) == 2000 + elif ds.name == "tar": + assert len(ds) == 1800 + else: + assert False + + # randomly check some samples for their dimension + ids = random.sample(range(len(ds)), 42) + for i in ids: + data, label = ds[i] + + assert data.shape == (256,) + assert int(label) in range(10) + + +@pytest.mark.slow +def test_pytorch(): + import torchvision + import torch + from torch.utils.data import Dataset as TorchDataset + + mnist = torchvision.datasets.MNIST( + str(RESOURCES_PATH), + train=True, + transform=None, + target_transform=None, + download=True, + ) + mnist_item = mnist[0] + ds_mnist = loaders.from_pytorch(mnist) + ds_mnist_item = ds_mnist[0] + # nothing to convert, items equal + assert mnist_item == ds_mnist_item + + class PyTorchDataset(TorchDataset): + def __len__(self,): + return 5 + + def __getitem__(self, idx): + return (torch.Tensor([idx, idx]), idx) # type:ignore + + torch_ds = PyTorchDataset() + ds_torch = loaders.from_pytorch(torch_ds) + + # tensor type in torch dataset + assert torch.all(torch.eq(torch_ds[0][0], torch.Tensor([0, 0]))) # type:ignore + + # numpy type in ours + assert np.array_equal(ds_torch[0][0], (np.array([0, 0]))) # type:ignore + + # labels are the same + assert torch_ds[0][1] == ds_torch[0][1] == 0 + + +@pytest.mark.slow +def test_tfds(): + import tensorflow as tf + import tensorflow_datasets as tfds + + # basic tf.data.Dataset + + tf_123 = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + ds_123 = loaders.from_tensorflow(tf_123) + + for t, d in zip(list(tf_123), list(ds_123)): + assert t.numpy() == d[0] + + # from TFDS + tf_mnist = tfds.load("mnist", split="test") + + ds_mnist = loaders.from_tensorflow(tf_mnist) + + mnist_item = next(iter(tf_mnist)) + ds_mnist_item = ds_mnist[0] + + assert np.array_equal(mnist_item["image"].numpy(), ds_mnist_item[0]) + assert np.array_equal(mnist_item["label"].numpy(), ds_mnist_item[1]) + + # also works for 'as_supervised' + tf_mnist = tfds.load("mnist", split="test", as_supervised=True) + + ds_mnist = loaders.from_tensorflow(tf_mnist) + + mnist_item = next(iter(tf_mnist)) + ds_mnist_item = ds_mnist[0] + + assert np.array_equal(mnist_item[0].numpy(), ds_mnist_item[0]) + assert np.array_equal(mnist_item[1].numpy(), ds_mnist_item[1]) + + +def test_from_recursive_files(): + + Patient = namedtuple("Patient", ["blood_pressure", "control"]) + + root = get_test_dataset_path(DATASET_PATHS.PATIENTS) + + def predicate(path): + return path.suffix == ".txt" + + def func(path): + blood_pressure = np.loadtxt(path) + is_control = path.parent != "control" + return Patient(blood_pressure, is_control) + + ds = loaders.from_recursive_files(root, func, predicate) + + assert len(ds) == 4 + + assert ds[0].blood_pressure.shape == (270,) + + +class TestFromIterable: + def test_list(self): + + ds = from_iterable([0, 1, 2]) + + assert len(ds) == 
3 + + for i in range(len(ds)): + assert i == ds[i] + + ds = from_iterable([]) + assert len(ds) == 0 + + def test_tuple(self): + + ds = from_iterable((0, 1, 2)) + + assert len(ds) == 3 + + for i in range(len(ds)): + assert i == ds[i] + + ds = from_iterable(()) + assert len(ds) == 0 + + def test_generator(self): + def gen(size): + for i in range(size): + yield i + + iterator = gen(3) + ds = from_iterable(iterator) + + assert len(ds) == 3 + + for i in range(len(ds)): + assert i == ds[i] + + iterator = gen(0) + ds = from_iterable(iterator) + + assert len(ds) == 0 + + def test_invalid(self): + + with pytest.raises(TypeError): + _ = from_iterable(None) # type:ignore + + with pytest.raises(TypeError): + _ = from_iterable(10) # type:ignore + + +class TestLoadCSV: + + cars = get_test_dataset_path("csv/cars") + + def test_names_missing(self): + p = get_test_dataset_path("csv/weird/no_names.csv") + ds = loaders.from_csv(p, names=["a", "b", "c"]) + + assert len(ds) == 1 + s = ds[0] + s[0] == [1, 2] + s[1] == [2, 4] + s[2] == [3, 6] + + def test_single_default(self): + ds = loaders.from_csv(TestLoadCSV.cars / "car_1" / "load_1000.csv") + assert len(ds) == 1 + s = ds[0] + assert s[0] == [1, 2, 3] + assert s[1] == [0.5, 1.0, 1.5] + + def test_nested_default(self): + ds = loaders.from_csv(TestLoadCSV.cars) + assert len(ds) == 4 diff --git a/tests/datasetops_tests/test_shuffle.py b/tests/datasetops_tests/test_shuffle.py deleted file mode 100644 index 93ec2c3..0000000 --- a/tests/datasetops_tests/test_shuffle.py +++ /dev/null @@ -1,66 +0,0 @@ -from datasetops.loaders import Loader - - -def _get_data(i): - return i - - -def from_dummy_data() -> Loader: - - a_ids = list(range(5)) - b_ids = list(range(5, 11)) - - ds = Loader(_get_data) - ds.extend(a_ids) - ds.extend(b_ids) - return ds - - -def test_noSeed_valid(): - ds = from_dummy_data() - - ds.shuffle() - - -def test_emptyDataset_valid(): - - ds = Loader(_get_data) - ds.shuffle() - assert len(ds) == 0 - - -def test_shuffleStringIds_valid(): - def _get_data(i): - return i - - ds = Loader(_get_data) - ds.extend(["1", "2"]) - - ds_shuffled = ds.shuffle() - - assert "1" in ds_shuffled - assert "2" in ds_shuffled - - -def test_containsSameElements(): - - ds = from_dummy_data() - - expected_items = [i for i in ds] - ds_shuffled = ds.shuffle() - found_items = [i for i in ds_shuffled] - - assert set(expected_items) == set(found_items) - - -def test_elementsShuffled(): - - seed = 42 - - ds = from_dummy_data() - - expected_items = [i for i in ds] - ds_shuffled = ds.shuffle(seed) - found_items = [i for i in ds_shuffled] - - assert expected_items != found_items diff --git a/tests/datasetops_tests/test_stream_dataset.py b/tests/datasetops_tests/test_stream_dataset.py index 47851d8..af2cba4 100644 --- a/tests/datasetops_tests/test_stream_dataset.py +++ b/tests/datasetops_tests/test_stream_dataset.py @@ -8,7 +8,6 @@ read_bin, DATASET_PATHS, ) -from pathlib import Path import random @@ -27,7 +26,7 @@ def test_read_from_file(): ) cache_path = get_test_dataset_path(DATASET_PATHS.CACHE_ROOT_PATH) - pkl_path = str(Path(cache_path) / "cache_1.pkl") + pkl_path = cache_path / "cache_1.pkl" Cache.clear(cache_path) original.cached(cache_path) diff --git a/tests/datasetops_tests/testing_utils.py b/tests/datasetops_tests/testing_utils.py index 8c34165..a8a4a10 100644 --- a/tests/datasetops_tests/testing_utils.py +++ b/tests/datasetops_tests/testing_utils.py @@ -15,18 +15,29 @@ class DatasetPaths(NamedTuple): CACHE_ROOT_PATH: str = "caching/cache_root" FOLDER_GROUP_DATA: str = 
KITTI_DATASET + "/training" FOLDER_DATASET_GROUP_DATA: str = KITTI_DATASET + PATIENTS: str = "patients" + CSV: str = "csv" DATASET_PATHS = DatasetPaths() -def get_test_dataset_path(dataset_path: str) -> str: - return str((RESOURCES_PATH / dataset_path).absolute()) +def get_test_dataset_path(dataset_path: str) -> Path: + """Returns the path to the dataset relative to the test-resources folder. + + Arguments: + dataset_path {str} -- path to the dataset defined relative to the test-resource folder + + Returns: + Path -- path to the dataset + """ + return Path(__file__).parent.parent / "resources" / dataset_path def from_dummy_data(num_total=11, with_label=False) -> Loader: a_ids = list(range(5)) b_ids = list(range(5, num_total)) + ids = a_ids + b_ids def get_data(i): return (i,) @@ -35,9 +46,7 @@ def get_labelled_data(i): nonlocal a_ids return i, "a" if i < len(a_ids) else "b" - ds = Loader(get_labelled_data if with_label else get_data) - ds.extend(a_ids) - ds.extend(b_ids) + ds = Loader(get_labelled_data if with_label else get_data, ids=ids) return ds @@ -49,6 +58,7 @@ def get_labelled_data(i): def from_dummy_numpy_data() -> Loader: a_ids = list(range(5)) b_ids = list(range(5, 11)) + ids = a_ids + b_ids labels = [*[1 for _ in a_ids], *[2 for _ in b_ids]] num_samples = len(a_ids) + len(b_ids) @@ -60,9 +70,7 @@ def from_dummy_numpy_data() -> Loader: def get_data(idx): return data[idx], labels[idx] - ds = Loader(get_data) - ds.extend(a_ids) - ds.extend(b_ids) + ds = Loader(get_data, ids) return ds @@ -91,6 +99,5 @@ def multi_shape_dataset(SHAPE_1D=(2,), SHAPE_3D=(5, 4, 3)): def get_data(idx): return data[idx] - ds = Loader(get_data) - ds.extend(list(range(len(data)))) + ds = Loader(get_data, range(len(data))) return ds diff --git a/tests/resources/csv/cars/car_1/load_1000.csv b/tests/resources/csv/cars/car_1/load_1000.csv new file mode 100644 index 0000000..adda4e4 --- /dev/null +++ b/tests/resources/csv/cars/car_1/load_1000.csv @@ -0,0 +1,4 @@ +speed,vibration +1,0.5 +2,1 +3,1.5 diff --git a/tests/resources/csv/cars/car_1/load_2000.csv b/tests/resources/csv/cars/car_1/load_2000.csv new file mode 100644 index 0000000..adda4e4 --- /dev/null +++ b/tests/resources/csv/cars/car_1/load_2000.csv @@ -0,0 +1,4 @@ +speed,vibration +1,0.5 +2,1 +3,1.5 diff --git a/tests/resources/csv/cars/car_2/load_1000.csv b/tests/resources/csv/cars/car_2/load_1000.csv new file mode 100644 index 0000000..adda4e4 --- /dev/null +++ b/tests/resources/csv/cars/car_2/load_1000.csv @@ -0,0 +1,4 @@ +speed,vibration +1,0.5 +2,1 +3,1.5 diff --git a/tests/resources/csv/cars/car_2/load_2000.csv b/tests/resources/csv/cars/car_2/load_2000.csv new file mode 100644 index 0000000..adda4e4 --- /dev/null +++ b/tests/resources/csv/cars/car_2/load_2000.csv @@ -0,0 +1,4 @@ +speed,vibration +1,0.5 +2,1 +3,1.5 diff --git a/tests/resources/csv/weird/no_names.csv b/tests/resources/csv/weird/no_names.csv new file mode 100644 index 0000000..ef96ea1 --- /dev/null +++ b/tests/resources/csv/weird/no_names.csv @@ -0,0 +1,2 @@ +1,2,3 +2,4,6 \ No newline at end of file diff --git a/tests/resources/patients/control/somefile.csv b/tests/resources/patients/control/somefile.csv new file mode 100644 index 0000000..e69de29 diff --git a/tests/resources/patients/control/subject_a.txt b/tests/resources/patients/control/subject_a.txt new file mode 100644 index 0000000..3a41f67 --- /dev/null +++ b/tests/resources/patients/control/subject_a.txt @@ -0,0 +1,270 @@ +1.609928030504898118e+02 +1.609820091538698534e+02 +1.609007418496371997e+02 +1.607892395650057153e+02 
+1.608069933564716791e+02 +1.605279603732504086e+02 +1.604294845365553783e+02 +1.600675747717003503e+02 +1.607240394641396506e+02 +1.603111767209769880e+02 +1.606218877489681631e+02 +1.604220624357512861e+02 +1.604018776223522025e+02 +1.601742007791915512e+02 +1.605320450545387985e+02 +1.604143850133977764e+02 +1.606541431302050853e+02 +1.608759219416762392e+02 +1.602861644290477727e+02 +1.606086537526707616e+02 +1.609084701146972805e+02 +1.608884299997424421e+02 +1.603612945823088012e+02 +1.602993375573948072e+02 +1.600398545409738063e+02 +1.601927791627676640e+02 +1.608721136610901965e+02 +1.608397974585913630e+02 +1.602384727461235059e+02 +1.600979037795929116e+02 +1.602372097336569254e+02 +1.608750920477891384e+02 +1.609302789698670608e+02 +1.606548528334024297e+02 +1.609813013381940721e+02 +1.608579566544460988e+02 +1.600282561024647805e+02 +1.602690512189432468e+02 +1.602052509276774970e+02 +1.600351244879777539e+02 +1.601958816425474197e+02 +1.607466805547543061e+02 +1.602950017937748157e+02 +1.600140535738950405e+02 +1.608163912938939859e+02 +1.602335392274820549e+02 +1.609953902391335703e+02 +1.603754807740419892e+02 +1.608619217086679498e+02 +1.601336137081321738e+02 +1.600882156459538237e+02 +1.602064431805631557e+02 +1.606956162665865406e+02 +1.607489406712510345e+02 +1.602910742351654676e+02 +1.609106185612441777e+02 +1.601573517833690232e+02 +1.608715394098497597e+02 +1.604243234354802325e+02 +1.601012708154771929e+02 +1.606424111380955253e+02 +1.608078198347118359e+02 +1.603209354893177760e+02 +1.602620034856508084e+02 +1.602190912954949624e+02 +1.606465010457656888e+02 +1.601186467324753266e+02 +1.601106224965309934e+02 +1.604476938385761855e+02 +1.601984850129408926e+02 +1.608153495022577886e+02 +1.608897017621925727e+02 +1.600445262883657165e+02 +1.601473900717664094e+02 +1.602214670159492584e+02 +1.607550301653001554e+02 +1.606207311104317910e+02 +1.602804973237634272e+02 +1.603873902969734786e+02 +1.604432890333492878e+02 +1.604850455192879224e+02 +1.605306670096843504e+02 +1.602662716398977523e+02 +1.600850347374747287e+02 +1.602554287743434429e+02 +1.602291003194054895e+02 +1.603373769466702754e+02 +1.602273153962027550e+02 +1.600729096874473782e+02 +1.600083256720916438e+02 +1.607391149998461515e+02 +1.606053462034236645e+02 +1.604945050050258430e+02 +1.603891037212525248e+02 +1.606277985427483657e+02 +1.604254460739795434e+02 +1.603475070380764862e+02 +1.604225942079638969e+02 +1.606994450434313251e+02 +1.603215922389671562e+02 +1.603797134473751100e+02 +1.607028612243439625e+02 +1.604403705798749229e+02 +1.608298891248457494e+02 +1.604145984769152449e+02 +1.608178881589600735e+02 +1.600996218236705886e+02 +1.601180985016508203e+02 +1.601489310360370553e+02 +1.602750295053555476e+02 +1.607958942361387926e+02 +1.609955165313894554e+02 +1.603677822583434533e+02 +1.607921475684446762e+02 +1.609755723731760497e+02 +1.600964154458427799e+02 +1.607529123008623344e+02 +1.604947399575378313e+02 +1.600931936688487838e+02 +1.603480868539074606e+02 +1.602446497320082415e+02 +1.603473331307714886e+02 +1.603638485260059383e+02 +1.603898006354307313e+02 +1.603055196693649123e+02 +1.604579426536128608e+02 +1.607611853730787459e+02 +1.608014964628647476e+02 +1.602103888286130484e+02 +1.601598519705629258e+02 +1.607680147683021801e+02 +1.605105480782399070e+02 +1.605197530464235172e+02 +1.603804288423203275e+02 +1.601377310162795027e+02 +1.608274978786087672e+02 +1.605420724525731373e+02 +1.609958734747130222e+02 +1.609412176745726129e+02 +1.605752791881856183e+02 
+1.605569561082335781e+02 +1.602156243719103657e+02 +1.600601526345879790e+02 +1.600350369011066505e+02 +1.601425574731921984e+02 +1.607266198256855034e+02 +1.609423590866153404e+02 +1.600515902820518477e+02 +1.600538151752099054e+02 +1.602087496773169448e+02 +1.600620777534867614e+02 +1.605123880783886818e+02 +1.607127649846372037e+02 +1.609885966473346741e+02 +1.609440351387088697e+02 +1.604907349217284036e+02 +1.608762398605175008e+02 +1.602273329891208675e+02 +1.609500068607545700e+02 +1.602616964958266124e+02 +1.609541329587947871e+02 +1.608864154360488214e+02 +1.607040204525845297e+02 +1.600592063264012950e+02 +1.607380951980876773e+02 +1.604678127747868359e+02 +1.607262531384151885e+02 +1.606676767784505842e+02 +1.609793603003782039e+02 +1.608907526349452439e+02 +1.605150066114870810e+02 +1.603342419590378540e+02 +1.600616856829110475e+02 +1.605705939505769493e+02 +1.609538505499762095e+02 +1.601023416558066970e+02 +1.605642319436292667e+02 +1.602367725801497897e+02 +1.605902363139905162e+02 +1.605639096736787792e+02 +1.604586679653438637e+02 +1.607457944449039360e+02 +1.609503135727506731e+02 +1.601423482490342565e+02 +1.604906444425292023e+02 +1.608430665674106308e+02 +1.606474426811015519e+02 +1.600892196248706512e+02 +1.601941052004565620e+02 +1.601648356483797500e+02 +1.605777065037266027e+02 +1.606483660683673520e+02 +1.606774686949806039e+02 +1.609560585563270934e+02 +1.607064564989038615e+02 +1.601509301929544051e+02 +1.603348270577281198e+02 +1.605505616834624050e+02 +1.603564164400668517e+02 +1.600257362474336276e+02 +1.608834220308249883e+02 +1.609975887743820522e+02 +1.603112769031929190e+02 +1.609813534099181993e+02 +1.600919119620475328e+02 +1.601799805061206712e+02 +1.604887355760818366e+02 +1.603985739763053857e+02 +1.606405200871531633e+02 +1.608980352900648541e+02 +1.608876944831028197e+02 +1.605038377396598719e+02 +1.600068949716390136e+02 +1.603492619963959669e+02 +1.602053315297056884e+02 +1.601192511122176825e+02 +1.604579577172859217e+02 +1.604142558521074591e+02 +1.602993271273907396e+02 +1.603697032461754191e+02 +1.605972119703594103e+02 +1.604699778402491575e+02 +1.602207937899160015e+02 +1.602749316559274178e+02 +1.607840051883783303e+02 +1.609090102805430433e+02 +1.600026135868056940e+02 +1.605589092929419053e+02 +1.600721124056057647e+02 +1.606405839801996080e+02 +1.603038451394447463e+02 +1.601385968092872645e+02 +1.600234248259139918e+02 +1.604412216615865816e+02 +1.602737247931684124e+02 +1.606289410240865720e+02 +1.606151442044511839e+02 +1.604323339045388082e+02 +1.602380542107195822e+02 +1.606940155700958428e+02 +1.606040978551159242e+02 +1.605070115909850301e+02 +1.601862821974703195e+02 +1.604926484347866733e+02 +1.605647537054144607e+02 +1.600490310723048708e+02 +1.600581482169462788e+02 +1.601476710359912943e+02 +1.600186893574955889e+02 +1.605966372965750111e+02 +1.609642918447330544e+02 +1.605888881084839284e+02 +1.608133449082453978e+02 +1.600353989711958889e+02 +1.602829616407862829e+02 +1.603653329155224867e+02 +1.609160585940443013e+02 +1.604267932625167532e+02 +1.602634749759785961e+02 +1.604881044649645219e+02 +1.602854406197362778e+02 +1.601098347470287706e+02 +1.606145595581241423e+02 +1.606438833782976303e+02 +1.602635238649027087e+02 +1.601181470291821540e+02 +1.602164314759603485e+02 +1.605912940080842191e+02 +1.600064229643494400e+02 +1.608735856008944722e+02 diff --git a/tests/resources/patients/control/subject_b.txt b/tests/resources/patients/control/subject_b.txt new file mode 100644 index 0000000..3a41f67 --- /dev/null +++ 
b/tests/resources/patients/control/subject_b.txt @@ -0,0 +1,270 @@ [270 floating-point sample values, one per line, all in the range 160.0–161.0; numeric content omitted for brevity] diff --git a/tests/resources/patients/experimental/subject_c.txt b/tests/resources/patients/experimental/subject_c.txt new file mode 100644 index 0000000..3a41f67 --- /dev/null +++ b/tests/resources/patients/experimental/subject_c.txt @@ -0,0 +1,270 @@ [270 floating-point sample values matching subject_b.txt; numeric content omitted for brevity] diff --git a/tests/resources/patients/experimental/subject_d.txt b/tests/resources/patients/experimental/subject_d.txt new file mode 100644 index 0000000..3a41f67 --- /dev/null +++ b/tests/resources/patients/experimental/subject_d.txt @@ -0,0 +1,270 @@ [270 floating-point sample values identical to subject_c.txt (same blob 3a41f67); numeric content omitted for brevity]