From 6d13a2be59025ca17f8a0913265650a691099f33 Mon Sep 17 00:00:00 2001
From: GbotemiB
Date: Wed, 26 Oct 2022 12:55:51 +0100
Subject: [PATCH 01/15] added extension for ipython

---
 docs/conf.py      |    3 +
 docs/tutorial.rst | 1212 ++++++++++++++++++++++++---------------------
 2 files changed, 653 insertions(+), 562 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 2639f765ee..b1f1ce194a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,6 +34,7 @@
 # If your documentation needs a minimal Sphinx version, state it here.
 #needs_sphinx = '1.0'
 
+#import IPython
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
@@ -45,6 +46,8 @@
     'numpydoc',
     'sphinx_issues',
     "sphinx_copybutton",
+    'IPython.sphinxext.ipython_console_highlighting',
+    'IPython.sphinxext.ipython_directive'
 ]
 
 numpydoc_show_class_members = False
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 411ce0a163..c7c234eef7 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -1,3 +1,4 @@
+.. _ipython_directive:
 .. _tutorial:
 
 Tutorial
@@ -13,12 +14,18 @@ similar functionality, but with some additional flexibility.
 Creating an array
 -----------------
 
-Zarr has several functions for creating arrays. For example::
+Zarr has several functions for creating arrays. For example
 
-    >>> import zarr
-    >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')
-    >>> z
-    <zarr.core.Array (10000, 10000) int32>
+.. ipython::
+
+    In [1]: import zarr
+
+    In [2]: z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')
+
+    In [3]: z
 
 The code above creates a 2-dimensional array of 32-bit integers with 10000 rows
 and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000
@@ -33,35 +40,31 @@ Reading and writing data
 ------------------------
 
 Zarr arrays support a similar interface to NumPy arrays for reading and writing
-data. For example, the entire array can be filled with a scalar value::
+data. For example, the entire array can be filled with a scalar value
 
-    >>> z[:] = 42
+.. ipython::
+
+    In [4]: z[:] = 42
 
-Regions of the array can also be written to, e.g.::
+Regions of the array can also be written to, e.g.
 
-    >>> import numpy as np
-    >>> z[0, :] = np.arange(10000)
-    >>> z[:, 0] = np.arange(10000)
+.. ipython::
+
+    In [5]: import numpy as np
+
+    In [6]: z[0, :] = np.arange(10000)
+
+    In [7]: z[:, 0] = np.arange(10000)
 
 The contents of the array can be retrieved by slicing, which will load the
-requested region into memory as a NumPy array, e.g.::
-
-    >>> z[0, 0]
-    0
-    >>> z[-1, -1]
-    42
-    >>> z[0, :]
-    array([   0,    1,    2, ..., 9997, 9998, 9999], dtype=int32)
-    >>> z[:, 0]
-    array([   0,    1,    2, ..., 9997, 9998, 9999], dtype=int32)
-    >>> z[:]
-    array([[   0,    1,    2, ..., 9997, 9998, 9999],
-           [   1,   42,   42, ...,   42,   42,   42],
-           [   2,   42,   42, ...,   42,   42,   42],
-           ...,
-           [9997,   42,   42, ...,   42,   42,   42],
-           [9998,   42,   42, ...,   42,   42,   42],
-           [9999,   42,   42, ...,   42,   42,   42]], dtype=int32)
+requested region into memory as a NumPy array, e.g.
+
+.. ipython::
+
+    In [8]: z[0, 0]
+
+    In [9]: z[-1, -1]
+
+    In [10]: z[0, :]
+
+    In [11]: z[:, 0]
+
+    In [12]: z[:]
 
 .. _tutorial_persist:
 
@@ -70,10 +73,11 @@ Persistent arrays
 
 In the examples above, compressed data for each chunk of the array was stored in
 main memory. Zarr arrays can also be stored on a file system, enabling
-persistence of data between sessions. For example::
+persistence of data between sessions. For example
 
-    >>> z1 = zarr.open('data/example.zarr', mode='w', shape=(10000, 10000),
-    ...                chunks=(1000, 1000), dtype='i4')
+.. 
ipython::

    In [13]: z1 = zarr.open('data/example.zarr', mode='w', shape=(10000, 10000),
       ...: chunks=(1000, 1000), dtype='i4')

The array above will store its configuration metadata and all compressed chunk
data in a directory called 'data/example.zarr' relative to the current working
directory. The :func:`zarr.convenience.open` function provides a convenient way
to create a new persistent array or continue working with an existing
array. Note that although the function is called "open", there is no need to
close an array: data are automatically flushed to disk, and files are
automatically closed whenever an array is modified.

Persistent arrays support the same interface for reading and writing data,
e.g.

.. ipython::

    In [14]: z1[:] = 42

    In [15]: z1[0, :] = np.arange(10000)

    In [16]: z1[:, 0] = np.arange(10000)

Check that the data have been written and can be read again

.. ipython::

    In [17]: z2 = zarr.open('data/example.zarr', mode='r')

    In [18]: np.all(z1[:] == z2[:])

If you are just looking for a fast and convenient way to save NumPy arrays to
disk then load back into memory later, the functions
:func:`zarr.convenience.save` and :func:`zarr.convenience.load` may be
useful. E.g.

.. ipython::

    In [0]: a = np.arange(10)

    In [0]: zarr.save('data/example.zarr', a)

    In [0]: zarr.load('data/example.zarr')

Please note that there are a number of other options for persistent array
storage, see the section on :ref:`tutorial_storage` below.

.. _tutorial_resize:

Resizing and appending
----------------------

A Zarr array can be resized, which means that any of its dimensions can be
increased or decreased in length. For example

.. ipython::

    In [0]: z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000))

    In [0]: z[:] = 42

    In [0]: z.resize(20000, 10000)

    In [0]: z.shape

Note that when an array is resized, the underlying data are not rearranged in
any way. If one or more dimensions are shrunk, any chunks falling outside the
new array shape will be deleted from the underlying store.

For convenience, Zarr arrays also provide an ``append()`` method, which can be
used to append data to any axis. E.g.

.. ipython::

    In [0]: a = np.arange(10000000, dtype='i4').reshape(10000, 1000)

    In [0]: z = zarr.array(a, chunks=(1000, 100))

    In [0]: z.shape

    In [0]: z.append(a)

    In [0]: z.append(np.vstack([a, a]), axis=1)

    In [0]: z.shape

.. _tutorial_compress:

Compressors
-----------

A number of different compressors can be used with Zarr. A separate package
called NumCodecs_ is available which provides a common interface to various
compressor libraries including Blosc, Zstandard, LZ4, Zlib, BZ2 and LZMA.
Different compressors can be provided via the ``compressor`` keyword
argument accepted by all array creation functions. 
For example + +.. ipython:: + + In [0]: from numcodecs import Blosc + + In [0]: compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) - >>> from numcodecs import Blosc - >>> compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) - >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) - >>> z = zarr.array(data, chunks=(1000, 1000), compressor=compressor) - >>> z.compressor - Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) + In [0]: data = np.arange(100000000, dtype='i4').reshape(10000, 10000) + + In [0]: z = zarr.array(data, chunks=(1000, 1000), compressor=compressor) + + In [0]: z.compressor + This array above will use Blosc as the primary compressor, using the Zstandard algorithm (compression level 3) internally within Blosc, and with the @@ -165,22 +191,12 @@ bit-shuffle filter applied. When using a compressor, it can be useful to get some diagnostics on the compression ratio. Zarr arrays provide a ``info`` property which can be used to -print some diagnostics, e.g.:: +print some diagnostics, e.g. - >>> z.info - Type : zarr.core.Array - Data type : int32 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Compressor : Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, - : blocksize=0) - Store type : zarr.storage.KVStore - No. bytes : 400000000 (381.5M) - No. bytes stored : 3379344 (3.2M) - Storage ratio : 118.4 - Chunks initialized : 100/100 +.. ipython:: + + In [0]: z.info + If you don't specify a compressor, by default Zarr uses the Blosc compressor. Blosc is generally very fast and can be configured in a variety of @@ -189,52 +205,68 @@ fact a "meta-compressor", which means that it can use a number of different compression algorithms internally to compress the data. Blosc also provides highly optimized implementations of byte- and bit-shuffle filters, which can improve compression ratios for some data. A list of the internal compression -libraries available within Blosc can be obtained via:: +libraries available within Blosc can be obtained via - >>> from numcodecs import blosc - >>> blosc.list_compressors() - ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] +.. ipython:: + + In [0]: from numcodecs import blosc + In [0]: blosc.list_compressors() + In addition to Blosc, other compression libraries can also be used. For example, -here is an array using Zstandard compression, level 1:: - - >>> from numcodecs import Zstd - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... chunks=(1000, 1000), compressor=Zstd(level=1)) - >>> z.compressor - Zstd(level=1) - -Here is an example using LZMA with a custom filter pipeline including LZMA's -built-in delta filter:: - - >>> import lzma - >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), - ... dict(id=lzma.FILTER_LZMA2, preset=1)] - >>> from numcodecs import LZMA - >>> compressor = LZMA(filters=lzma_filters) - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... chunks=(1000, 1000), compressor=compressor) - >>> z.compressor - LZMA(format=1, check=-1, preset=None, filters=[{'dist': 4, 'id': 3}, {'id': 33, 'preset': 1}]) +here is an array using Zstandard compression, level 1 + +.. 
ipython::

    In [0]: from numcodecs import Zstd

    In [0]: z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
       ...: chunks=(1000, 1000), compressor=Zstd(level=1))

    In [0]: z.compressor

Here is an example using LZMA with a custom filter pipeline including LZMA's
built-in delta filter

.. ipython::

    In [0]: import lzma

    In [0]: lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4),
       ...: dict(id=lzma.FILTER_LZMA2, preset=1)]

    In [0]: from numcodecs import LZMA

    In [0]: compressor = LZMA(filters=lzma_filters)

    In [0]: z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
       ...: chunks=(1000, 1000), compressor=compressor)

    In [0]: z.compressor

The default compressor can be changed by setting the value of the
``zarr.storage.default_compressor`` variable, e.g.

.. ipython::

    In [0]: import zarr.storage

    In [0]: from numcodecs import Zstd, Blosc

    In [0]: # switch to using Zstandard
       ...: zarr.storage.default_compressor = Zstd(level=1)

    In [0]: z = zarr.zeros(100000000, chunks=1000000)

    In [0]: z.compressor

    In [0]: # switch back to Blosc defaults
       ...: zarr.storage.default_compressor = Blosc()

To disable compression, set ``compressor=None`` when creating an array, e.g.

.. ipython::

    In [0]: z = zarr.zeros(100000000, chunks=1000000, compressor=None)

    In [0]: z.compressor is None

.. _tutorial_filters:

Filters
-------

In some cases, compression can be improved by transforming the data in some
way. For example, if nearby values tend to be correlated, then shuffling the
bytes within each numerical value or storing the difference between adjacent
values may increase compression ratio. Some compressors provide built-in filters
that apply transformations to the data prior to compression. For example, the
Blosc compressor has built-in implementations of byte- and bit-shuffle filters,
and the LZMA compressor has a built-in implementation of a delta
filter. However, to provide additional flexibility for implementing and using
filters in combination with different compressors, Zarr also provides a
mechanism for configuring filters outside of the primary compressor.

Here is an example using a delta filter with the Blosc compressor

.. ipython::

    In [0]: from numcodecs import Blosc, Delta

    In [0]: filters = [Delta(dtype='i4')]

    In [0]: compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)

    In [0]: data = np.arange(100000000, dtype='i4').reshape(10000, 10000)

    In [0]: z = zarr.array(data, chunks=(1000, 1000), filters=filters, compressor=compressor)

    In [0]: z.info

For more information about available filter codecs, see the `Numcodecs
<https://numcodecs.readthedocs.io/>`_ documentation.

.. _tutorial_groups:

Groups
------

Zarr supports hierarchical organization of arrays via groups. As with arrays,
groups can be stored in memory, on disk, or via other storage systems that
support a similar interface.

To create a group, use the :func:`zarr.group` function

.. ipython::

    In [0]: root = zarr.group()

    In [0]: root

Groups have a similar API to the Group class from `h5py
<https://www.h5py.org/>`_. For example, groups can contain other groups

.. 
ipython:: - >>> foo = root.create_group('foo') - >>> bar = foo.create_group('bar') + In [0]: foo = root.create_group('foo') -Groups can also contain arrays, e.g.:: + In [0]: bar = foo.create_group('bar') - >>> z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z1 - +Groups can also contain arrays, e.g. + +.. ipython:: + + In [0]: z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') + In [0]: z1 Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the ``create_dataset()`` and ``require_dataset()`` -methods, e.g.:: +methods, e.g. - >>> z = bar.create_dataset('quux', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - +.. ipython:: + + In [0]: z = bar.create_dataset('quux', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') + In [0]: z -Members of a group can be accessed via the suffix notation, e.g.:: +Members of a group can be accessed via the suffix notation, e.g. - >>> root['foo'] - +.. ipython:: + + In [0]: root['foo'] The '/' character can be used to access multiple levels of the hierarchy in one -call, e.g.:: +call, e.g. + +.. ipython:: - >>> root['foo/bar'] - - >>> root['foo/bar/baz'] - + In [0]: root['foo/bar'] + + In [0]: root['foo/bar/baz'] + The :func:`zarr.hierarchy.Group.tree` method can be used to print a tree -representation of the hierarchy, e.g.:: +representation of the hierarchy, e.g. - >>> root.tree() - / - └── foo - └── bar - ├── baz (10000, 10000) int32 - └── quux (10000, 10000) int32 +.. ipython:: + + In [0]: root.tree() + The :func:`zarr.convenience.open` function provides a convenient way to create or re-open a group stored in a directory on the file-system, with sub-groups stored in -sub-directories, e.g.:: +sub-directories, e.g. - >>> root = zarr.open('data/group.zarr', mode='w') - >>> root - - >>> z = root.zeros('foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - +.. ipython:: + + In [0]: root = zarr.open('data/group.zarr', mode='w') + + In [0]: root + + In [0]: z = root.zeros('foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') + + In [0]: z + Groups can be used as context managers (in a ``with`` statement). If the underlying store has a ``close`` method, it will be called on exit. @@ -358,71 +397,35 @@ Array and group diagnostics --------------------------- Diagnostic information about arrays and groups is available via the ``info`` -property. E.g.:: - - >>> root = zarr.group() - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=1000000, chunks=100000, dtype='i8') - >>> bar[:] = 42 - >>> baz = foo.zeros('baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') - >>> baz[:] = 4.2 - >>> root.info - Name : / - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.MemoryStore - No. members : 1 - No. arrays : 0 - No. groups : 1 - Groups : foo - - >>> foo.info - Name : /foo - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.MemoryStore - No. members : 2 - No. arrays : 2 - No. groups : 0 - Arrays : bar, baz - - >>> bar.info - Name : /foo/bar - Type : zarr.core.Array - Data type : int64 - Shape : (1000000,) - Chunk shape : (100000,) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.MemoryStore - No. bytes : 8000000 (7.6M) - No. 
bytes stored : 33240 (32.5K) - Storage ratio : 240.7 - Chunks initialized : 10/10 - - >>> baz.info - Name : /foo/baz - Type : zarr.core.Array - Data type : float32 - Shape : (1000, 1000) - Chunk shape : (100, 100) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.MemoryStore - No. bytes : 4000000 (3.8M) - No. bytes stored : 23943 (23.4K) - Storage ratio : 167.1 - Chunks initialized : 100/100 +property. E.g. -Groups also have the :func:`zarr.hierarchy.Group.tree` method, e.g.:: +.. ipython:: - >>> root.tree() - / - └── foo - ├── bar (1000000,) int64 - └── baz (1000, 1000) float32 + In [0]: root = zarr.group() + + In [0]: foo = root.create_group('foo') + + In [0]: bar = foo.zeros('bar', shape=1000000, chunks=100000, dtype='i8') + + In [0]: bar[:] = 42 + + In [0]: baz = foo.zeros('baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') + + In [0]: baz[:] = 4.2 + + In [0]: root.info + + In [0]: foo.info + + In [0]: bar.info + + In [0]: baz.info + +Groups also have the :func:`zarr.hierarchy.Group.tree` method, e.g. + +.. ipython:: + + In [0]: root.tree() If you're using Zarr within a Jupyter notebook (requires `ipytree `_), calling ``tree()`` will generate an @@ -436,25 +439,31 @@ User attributes --------------- Zarr arrays and groups support custom key/value attributes, which can be useful for -storing application-specific metadata. For example:: - - >>> root = zarr.group() - >>> root.attrs['foo'] = 'bar' - >>> z = root.zeros('zzz', shape=(10000, 10000)) - >>> z.attrs['baz'] = 42 - >>> z.attrs['qux'] = [1, 4, 7, 12] - >>> sorted(root.attrs) - ['foo'] - >>> 'foo' in root.attrs - True - >>> root.attrs['foo'] - 'bar' - >>> sorted(z.attrs) - ['baz', 'qux'] - >>> z.attrs['baz'] - 42 - >>> z.attrs['qux'] - [1, 4, 7, 12] +storing application-specific metadata. For example + +.. ipython:: + + In [0]: root = zarr.group() + + In [0]: root.attrs['foo'] = 'bar' + + In [0]: z = root.zeros('zzz', shape=(10000, 10000)) + + In [0]: z.attrs['baz'] = 42 + + In [0]: z.attrs['qux'] = [1, 4, 7, 12] + + In [0]: sorted(root.attrs) + + In [0]: 'foo' in root.attrs + + In [0]: root.attrs['foo'] + + In [0]: sorted(z.attrs) + + In [0]: z.attrs['baz'] + + In [0]: z.attrs['qux'] Internally Zarr uses JSON to store array attributes, so attribute values must be JSON serializable. @@ -478,103 +487,118 @@ Indexing with coordinate arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Items from a Zarr array can be extracted by providing an integer array of -coordinates. E.g.:: +coordinates. E.g. - >>> z = zarr.array(np.arange(10)) - >>> z[:] - array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> z.get_coordinate_selection([1, 4]) - array([1, 4]) +.. ipython:: -Coordinate arrays can also be used to update data, e.g.:: + In [0]: z = zarr.array(np.arange(10)) - >>> z.set_coordinate_selection([1, 4], [-1, -2]) - >>> z[:] - array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) + In [0]: z[:] + + In [0]: z.get_coordinate_selection([1, 4]) + + +Coordinate arrays can also be used to update data, e.g. + +.. ipython:: + + In [0]: z.set_coordinate_selection([1, 4], [-1, -2]) + + In [0]: z[:] For multidimensional arrays, coordinates must be provided for each dimension, -e.g.:: +e.g. 
- >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_coordinate_selection(([0, 2], [1, 3])) - array([ 1, 13]) - >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) +.. ipython:: + + In [0]: z = zarr.array(np.arange(15).reshape(3, 5)) + + In [0]: z[:] + + In [0]: z.get_coordinate_selection(([0, 2], [1, 3])) + + In [0]: z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) + + In [0]: z[:] + For convenience, coordinate indexing is also available via the ``vindex`` -property, as well as the square bracket operator, e.g.:: +property, as well as the square bracket operator, e.g. - >>> z.vindex[[0, 2], [1, 3]] - array([-1, -2]) - >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) - >>> z[[0, 2], [1, 3]] - array([-3, -4]) +.. ipython:: + + In [0]: z.vindex[[0, 2], [1, 3]] + + In [0]: z.vindex[[0, 2], [1, 3]] = [-3, -4] + + In [0]: z[:] + + In [0]: z[[0, 2], [1, 3]] + When the indexing arrays have different shapes, they are broadcast together. -That is, the following two calls are equivalent:: +That is, the following two calls are equivalent + +.. ipython:: - >>> z[1, [1, 3]] - array([5, 7]) - >>> z[[1, 1], [1, 3]] - array([5, 7]) + In [0]: z[1, [1, 3]] + + In [0]: z[[1, 1], [1, 3]] + Indexing with a mask array ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Items can also be extracted by providing a Boolean mask. E.g.:: +Items can also be extracted by providing a Boolean mask. E.g. - >>> z = zarr.array(np.arange(10)) - >>> z[:] - array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[1] = True - >>> sel[4] = True - >>> z.get_mask_selection(sel) - array([1, 4]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) +.. ipython:: -Here's a multidimensional example:: + In [0]: z = zarr.array(np.arange(10)) - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[0, 1] = True - >>> sel[2, 3] = True - >>> z.get_mask_selection(sel) - array([ 1, 13]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) + In [0]: z[:] + + In [0]: sel = np.zeros_like(z, dtype=bool) + + In [0]: sel[1] = True + + In [0]: sel[4] = True + + In [0]: z.get_mask_selection(sel) + + In [0]: z.set_mask_selection(sel, [-1, -2]) + + In [0]: z[:] + +Here's a multidimensional example + +.. ipython:: + + In [0]: z = zarr.array(np.arange(15).reshape(3, 5)) + + In [0]: z[:] + + In [0]: sel = np.zeros_like(z, dtype=bool) + + In [0]: sel[0, 1] = True + + In [0]: sel[2, 3] = True + + In [0]: z.get_mask_selection(sel) + + In [0]: z.set_mask_selection(sel, [-1, -2]) + + In [0]: z[:] For convenience, mask indexing is also available via the ``vindex`` property, -e.g.:: +e.g. - >>> z.vindex[sel] - array([-1, -2]) - >>> z.vindex[sel] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) +.. ipython:: + + In [0]: z.vindex[sel] + + In [0]: z.vindex[sel] = [-3, -4] + In [0]: z[:] Mask indexing is conceptually the same as coordinate indexing, and is implemented internally via the same machinery. Both styles of indexing allow selecting arbitrary items from an array, also known as point selection. 
@@ -585,51 +609,47 @@ Orthogonal indexing Zarr arrays also support methods for orthogonal indexing, which allows selections to be made along each dimension of an array independently. For example, this allows selecting a subset of rows and/or columns from a -2-dimensional array. E.g.:: +2-dimensional array. E.g. - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - -Data can also be modified, e.g.:: - - >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - >>> z[:] +.. ipython:: + + In [0]: z = zarr.array(np.arange(15).reshape(3, 5)) + + In [0]: z[:] + + In [0]: z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows + + In [0]: z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns + + In [0]: z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 4] + + +Data can also be modified, e.g. + +.. ipython:: + + In [0]: z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) + In [0]: z[:] array([[ 0, -1, 2, -2, 4], [ 5, 6, 7, 8, 9], [10, -3, 12, -4, 14]]) For convenience, the orthogonal indexing functionality is also available via the -``oindex`` property, e.g.:: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z.oindex[[0, 2], :] # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.oindex[:, [1, 3]] # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] - >>> z[:] - array([[ 0, -1, 2, -2, 4], - [ 5, 6, 7, 8, 9], - [10, -3, 12, -4, 14]]) +``oindex`` property, e.g. + +.. ipython:: + + In [0]: z = zarr.array(np.arange(15).reshape(3, 5)) + + In [0]: z.oindex[[0, 2], :] # select first and third rows + + In [0]: z.oindex[:, [1, 3]] # select second and fourth columns + + In [0]: z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 4] + + In [0]: z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] + + In [0]: z[:] Any combination of integer, slice, 1D integer array and/or 1D Boolean array can be used for orthogonal indexing. @@ -638,23 +658,24 @@ Indexing fields in structured arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ All selection methods support a ``fields`` parameter which allows retrieving or -replacing data for a specific field in an array with a structured dtype. E.g.:: - - >>> a = np.array([(b'aaa', 1, 4.2), - ... (b'bbb', 2, 8.4), - ... (b'ccc', 3, 12.6)], - ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - >>> z = zarr.array(a) - >>> z['foo'] - array([b'aaa', b'bbb', b'ccc'], - dtype='|S3') - >>> z['baz'] - array([ 4.2, 8.4, 12.6]) - >>> z.get_basic_selection(slice(0, 2), fields='bar') - array([1, 2], dtype=int32) - >>> z.get_coordinate_selection([0, 2], fields=['foo', 'baz']) - array([(b'aaa', 4.2), (b'ccc', 12.6)], - dtype=[('foo', 'S3'), ('baz', '>> z = zarr.open('data/example.zarr', mode='w', shape=1000000, dtype='i4') +...is short-hand for -...is short-hand for:: +.. 
ipython:: - >>> store = zarr.DirectoryStore('data/example.zarr') - >>> z = zarr.create(store=store, overwrite=True, shape=1000000, dtype='i4') + In [0]: store = zarr.DirectoryStore('data/example.zarr') -...and the following code:: + In [0]: z = zarr.create(store=store, overwrite=True, shape=1000000, dtype='i4') - >>> root = zarr.open('data/example.zarr', mode='w') +...and the following code -...is short-hand for:: +.. ipython:: - >>> store = zarr.DirectoryStore('data/example.zarr') - >>> root = zarr.group(store=store, overwrite=True) + In [0]: root = zarr.open('data/example.zarr', mode='w') + +...is short-hand for + +.. ipython:: + + In [0]: store = zarr.DirectoryStore('data/example.zarr') + + In [0]: root = zarr.group(store=store, overwrite=True) Any other compatible storage class could be used in place of :class:`zarr.storage.DirectoryStore` in the code examples above. For example, here is an array stored directly into a Zip file, via the -:class:`zarr.storage.ZipStore` class:: +:class:`zarr.storage.ZipStore` class - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> root = zarr.group(store=store) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() +.. ipython:: -Re-open and check that data have been written:: + In [0]: store = zarr.ZipStore('data/example.zip', mode='w') - >>> store = zarr.ZipStore('data/example.zip', mode='r') - >>> root = zarr.group(store=store) - >>> z = root['foo/bar'] - >>> z[:] - array([[42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - ..., - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42]], dtype=int32) - >>> store.close() + In [0]: root = zarr.group(store=store) + + In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + + In [0]: z[:] = 42 + + In [0]: store.close() + +Re-open and check that data have been written + +.. ipython:: + + In [0]: store = zarr.ZipStore('data/example.zip', mode='r') + + In [0]: root = zarr.group(store=store) + + In [0]: z = root['foo/bar'] + + In [0]: z[:] + + In [0]: store.close() Note that there are some limitations on how Zip files can be used, because items within a Zip file cannot be updated in place. This means that data in the array @@ -724,34 +760,53 @@ Another storage alternative is the :class:`zarr.storage.DBMStore` class, added in Zarr version 2.2. This class allows any DBM-style database to be used for storing an array or group. Here is an example using a Berkeley DB B-tree database for storage (requires `bsddb3 -`_ to be installed):: +`_ to be installed) - >>> import bsddb3 - >>> store = zarr.DBMStore('data/example.bdb', open=bsddb3.btopen) - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() +.. 
ipython::

    In [0]: import bsddb3

    In [0]: store = zarr.DBMStore('data/example.bdb', open=bsddb3.btopen)

    In [0]: root = zarr.group(store=store, overwrite=True)

    In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')

    In [0]: z[:] = 42

    In [0]: store.close()

Also added in Zarr version 2.2 is the :class:`zarr.storage.LMDBStore` class which
enables the lightning memory-mapped database (LMDB) to be used for storing an array or
group (requires `lmdb <https://lmdb.readthedocs.io/>`_ to be installed)

.. ipython::
    :okexcept:

    In [0]: store = zarr.LMDBStore('data/example.lmdb')

    In [0]: root = zarr.group(store=store, overwrite=True)

    In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')

    In [0]: z[:] = 42

    In [0]: store.close()

In Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class which
enables the SQLite database to be used for storing an array or group (requires
Python is built with SQLite support)

.. ipython::

    In [0]: store = zarr.SQLiteStore('data/example.sqldb')

    In [0]: root = zarr.group(store=store, overwrite=True)

    In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')

    In [0]: z[:] = 42

    In [0]: store.close()

Also added in Zarr version 2.3 are two storage classes for interfacing with server-client
databases. The :class:`zarr.storage.RedisStore` class interfaces `Redis <https://redis.io/>`_
(an in-memory database) and the :class:`zarr.storage.MongoDBStore` class interfaces
with `MongoDB <https://www.mongodb.com/>`_ (an object oriented NoSQL database). These two
classes respectively require the `redis-py <https://redis-py.readthedocs.io>`_ and
`pymongo <https://api.mongodb.com/python/current/>`_ packages to be installed.

For compatibility with the `N5 <https://github.com/saalfeldlab/n5>`_ data format, Zarr also provides
an N5 backend (this is currently an experimental feature). Similar to the zip storage class, an
:class:`zarr.n5.N5Store` can be instantiated directly

.. ipython::
    :okwarning:

    In [0]: store = zarr.N5Store('data/example.n5')

    In [0]: root = zarr.group(store=store)

    In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')

    In [0]: z[:] = 42

For convenience, the N5 backend will automatically be chosen when the filename
ends with `.n5`

.. ipython::

    In [0]: root = zarr.open('data/example.n5', mode='w')

Distributed/cloud storage
~~~~~~~~~~~~~~~~~~~~~~~~~

It is also possible to use distributed storage systems. The Dask project has
implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map
<https://s3fs.readthedocs.io/>`_), Hadoop Distributed File System (`HDFSMap
<https://hdfs3.readthedocs.io/>`_) and Google Cloud Storage (`GCSMap
<https://gcsfs.readthedocs.io/>`_), which can be used with Zarr. 
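
Because a store only needs to satisfy the ``MutableMapping`` contract, a plain
Python ``dict`` can stand in for any of these remote stores when experimenting
locally. A minimal sketch (no cloud credentials needed; the ``'.zarray'`` and
chunk keys shown are written by Zarr itself)

.. ipython::

    In [0]: store = dict()

    In [0]: z = zarr.zeros((100, 100), chunks=(10, 10), store=store)

    In [0]: z[:] = 1

    In [0]: # the store now holds the array metadata plus one entry per chunk
       ...: sorted(store)[:3]
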

Here is an example using S3Map to read an array created previously

.. ipython::

    In [0]: import s3fs

    In [0]: import zarr

    In [0]: s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))

    In [0]: store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)

    In [0]: root = zarr.group(store=store)

    In [0]: z = root['foo/bar/baz']

    In [0]: z

    In [0]: z.info

    In [0]: z[:]

    In [0]: z[:].tobytes()

Zarr now also has a builtin storage backend for Azure Blob Storage.
The class is :class:`zarr.storage.ABSStore` (requires
`azure-storage-blob <https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python>`_
to be installed)

.. ipython::
    :verbatim:

    In [0]: import azure.storage.blob

    In [0]: container_client = azure.storage.blob.ContainerClient(...)

    In [0]: store = zarr.ABSStore(client=container_client, prefix='zarr-testing')

    In [0]: root = zarr.group(store=store, overwrite=True)

    In [0]: z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')

    In [0]: z[:] = 42

When using an actual storage account, provide ``account_name`` and
``account_key`` arguments to :class:`zarr.storage.ABSStore`, the
above client is just testing against the emulator. Please also note
that this is an experimental feature.

Note that retrieving data from a remote service via the network can be significantly
slower than retrieving data from a local file system, and will depend on network latency
and bandwidth between the client and server systems. If you are experiencing poor
performance, there are several things you can try. One option is to increase the array
chunk size, which will reduce the number of chunks and thereby reduce the number of
network round-trips required to retrieve data for an array (and thereby reduce the
impact of network latency). Another option is to try to increase the compression ratio
by changing compression options or trying a different compressor (which will reduce the
impact of limited network bandwidth).

As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache`
which can be used to implement a local in-memory cache layer over a remote
store. E.g.

.. 
ipython::

    In [0]: import s3fs

    In [0]: s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))

    In [0]: store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)

    In [0]: cache = zarr.LRUStoreCache(store, max_size=2**28)

    In [0]: root = zarr.group(store=cache)

    In [0]: z = root['foo/bar/baz']

    In [0]: from timeit import timeit

    @verbatim
    In [0]: # first data access is relatively slow, retrieved from store
       ...: timeit('print(z[:].tobytes())', number=1, globals=globals())
    b'Hello from the cloud!'
    0.1081731989979744

    @verbatim
    In [0]: # second data access is faster, uses cache
       ...: timeit('print(z[:].tobytes())', number=1, globals=globals())
    b'Hello from the cloud!'
    0.0009490990014455747

If you are still experiencing poor performance with distributed/cloud storage,
please raise an issue on the GitHub issue tracker with any profiling data you
can provide, as there may be opportunities to optimise further either within
Zarr or within the mapping interface to the storage.

IO with ``fsspec``
~~~~~~~~~~~~~~~~~~

As of version 2.5, zarr supports passing URLs directly to `fsspec`_,
and having it create the "mapping" instance automatically. This means, that
for all of the backend storage implementations `supported by fsspec`_,
you can skip importing and configuring the storage explicitly.
For example

.. ipython::
    :verbatim:

    In [0]: g = zarr.open_group("s3://zarr-demo/store", storage_options={'anon': True})

    In [0]: g['foo/bar/baz'][:].tobytes()
    b'Hello from the cloud!'

The provision of the protocol specifier "s3://" will select the correct backend.
Notice the kwargs ``storage_options``, used to pass parameters to that backend.

As of version 2.6, write mode and complex URLs are also supported, such as

.. ipython::
    :verbatim:

    In [0]: g = zarr.open_group("simplecache::s3://zarr-demo/store",
       ...: storage_options={"s3": {'anon': True}})

    In [0]: g['foo/bar/baz'][:].tobytes()  # downloads target file
    b'Hello from the cloud!'

    In [0]: g['foo/bar/baz'][:].tobytes()  # uses cached file
    b'Hello from the cloud!'

The second invocation here will be much faster. Note that the ``storage_options``
have become more complex here, to account for the two parts of the supplied
URL.

It is also possible to initialize the filesystem outside of Zarr and then pass
it through. This requires creating an :class:`zarr.storage.FSStore` object
explicitly. For example

.. ipython::
    :verbatim:

    In [0]: import s3fs

    In [0]: fs = s3fs.S3FileSystem(anon=True)

    In [0]: store = zarr.storage.FSStore('/zarr-demo/store', fs=fs)

    In [0]: g = zarr.open_group(store)

This is useful in cases where you want to also use the same fsspec filesystem
object separately from Zarr. 
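
The same :class:`zarr.storage.FSStore` class also speaks to the local file
system through fsspec's ``file`` protocol, which is a convenient way to exercise
the fsspec-based code path without any network access. A minimal sketch (the
path and array name here are just illustrative, assuming a writable ``data/``
directory)

.. ipython::

    In [0]: # a bare local path selects fsspec's local filesystem backend
       ...: store = zarr.storage.FSStore('data/fsstore-example.zarr')

    In [0]: root = zarr.group(store=store, overwrite=True)

    In [0]: z = root.zeros('foo', shape=(100, 100), chunks=(10, 10))

    In [0]: z[:] = 42
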
@@ -928,7 +1011,7 @@ consolidated into a single one via :func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase the speed of reading the array metadata, e.g.:: - >>> zarr.consolidate_metadata(store) # doctest: +SKIP + >>> zarr.consolidate_metadata(store) # doctest: +SKIP This creates a special key with a copy of all of the metadata from all of the metadata objects in the store. @@ -936,7 +1019,7 @@ metadata objects in the store. Later, to open a Zarr store with consolidated metadata, use :func:`zarr.convenience.open_consolidated`, e.g.:: - >>> root = zarr.open_consolidated(store) # doctest: +SKIP + >>> root = zarr.open_consolidated(store) This uses the special key to read all of the metadata in a single call to the backend storage. @@ -957,54 +1040,59 @@ Copying/migrating data If you have some data in an HDF5 file and would like to copy some or all of it into a Zarr group, or vice-versa, the :func:`zarr.convenience.copy` and :func:`zarr.convenience.copy_all` functions can be used. Here's an example -copying a group named 'foo' from an HDF5 file to a Zarr group:: - - >>> import h5py - >>> import zarr - >>> import numpy as np - >>> source = h5py.File('data/example.h5', mode='w') - >>> foo = source.create_group('foo') - >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> zarr.tree(source) - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> dest = zarr.open_group('data/example.zarr', mode='w') - >>> from sys import stdout - >>> zarr.copy(source['foo'], dest, log=stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - all done: 3 copied, 0 skipped, 800 bytes copied - (3, 0, 800) - >>> dest.tree() # N.B., no spam - / - └── foo - └── bar - └── baz (100,) int64 - >>> source.close() +copying a group named 'foo' from an HDF5 file to a Zarr group + +.. ipython:: + + In [0]: import h5py + + In [0]: import zarr + + In [0]: import numpy as np + + In [0]: import ipytree + + In [0]: source = h5py.File('data/example.h5', mode='w') + + In [0]: foo = source.create_group('foo') + + In [0]: baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) + + In [0]: spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) + +.. ipython:: python + :okexcept: + + zarr.tree(source) + + dest = zarr.open_group('data/example.zarr', mode='w') + + from sys import stdout + + zarr.copy(source['foo'], dest, log=stdout) + + dest.tree() #N.B.,no spam + + source.close() If rather than copying a single group or array you would like to copy all -groups and arrays, use :func:`zarr.convenience.copy_all`, e.g.:: - - >>> source = h5py.File('data/example.h5', mode='r') - >>> dest = zarr.open_group('data/example2.zarr', mode='w') - >>> zarr.copy_all(source, dest, log=stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - copy /spam (100,) int64 - all done: 4 copied, 0 skipped, 1,600 bytes copied - (4, 0, 1600) - >>> dest.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 +groups and arrays, use :func:`zarr.convenience.copy_all`, e.g. + +.. 
ipython:: + + In [0]: import h5py + + In [0]: import zarr + + In [0]: from sys import stdout + + In [0]: source = h5py.File('data/example.h5', mode='r') + + In [0]: dest = zarr.open_group('data/example2.zarr', mode='w') + + In [0]: zarr.copy_all(source, dest, log=stdout) + + In [0]: dest.tree() If you need to copy data between two Zarr groups, the :func:`zarr.convenience.copy` and :func:`zarr.convenience.copy_all` functions can From 68d736ce83aa593d6c1a4a990a09e7acd8dc24b6 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Wed, 26 Oct 2022 13:10:03 +0100 Subject: [PATCH 02/15] converting each runs into a ipython --- docs/tutorial.rst | 66 ++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index c7c234eef7..38d057fa56 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1102,44 +1102,34 @@ the :func:`zarr.convenience.copy_store` function can be used. This function copies data directly between the underlying stores, without any decompression or re-compression, and so should be faster. E.g.:: - >>> import zarr - >>> import numpy as np - >>> store1 = zarr.DirectoryStore('data/example.zarr') - >>> root = zarr.group(store1, overwrite=True) - >>> baz = root.create_dataset('foo/bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = root.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> root.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> from sys import stdout - >>> store2 = zarr.ZipStore('data/example.zip', mode='w') - >>> zarr.copy_store(store1, store2, log=stdout) - copy .zgroup - copy foo/.zgroup - copy foo/bar/.zgroup - copy foo/bar/baz/.zarray - copy foo/bar/baz/0 - copy foo/bar/baz/1 - copy spam/.zarray - copy spam/0 - copy spam/1 - copy spam/2 - copy spam/3 - all done: 11 copied, 0 skipped, 1,138 bytes copied - (11, 0, 1138) - >>> new_root = zarr.group(store2) - >>> new_root.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> new_root['foo/bar/baz'][:] - array([ 0, 1, 2, ..., 97, 98, 99]) - >>> store2.close() # zip stores need to be closed + . ipython:: + + In [0]: import zarr + + In [0]: import numpy as np + + In [0]: store1 = zarr.DirectoryStore('data/example.zarr') + + In [0]: root = zarr.group(store1, overwrite=True) + + In [0]: baz = root.create_dataset('foo/bar/baz', data=np.arange(100), chunks=(50,)) + + In [0]: spam = root.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) + + In [0]: root.tree() + + In [0]: from sys import stdout + + In [0]: store2 = zarr.ZipStore('data/example.zip', mode='w') + + In [0]: zarr.copy_store(store1, store2, log=stdout) + + In [0]: new_root = zarr.group(store2) + + In [0]: new_root.tree() + + In [0]: store2.close() # zip stores need to be closed + .. _tutorial_strings: From 0ff891d8243dac53ac4faf4996ae9e67217357dd Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Thu, 27 Oct 2022 18:28:34 +0100 Subject: [PATCH 03/15] ipython integration --- docs/tutorial.rst | 584 +++++++++++++++++++++++++--------------------- 1 file changed, 318 insertions(+), 266 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 38d057fa56..7607dec1ff 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1100,9 +1100,9 @@ be used and provide the most flexibility. However, if you want to copy data in the most efficient way possible, without changing any configuration options, the :func:`zarr.convenience.copy_store` function can be used. 
This function copies data directly between the underlying stores, without any decompression or -re-compression, and so should be faster. E.g.:: +re-compression, and so should be faster. E.g. - . ipython:: +.. ipython:: In [0]: import zarr @@ -1139,90 +1139,101 @@ String arrays There are several options for storing arrays of strings. If your strings are all ASCII strings, and you know the maximum length of the string in -your array, then you can use an array with a fixed-length bytes dtype. E.g.:: - - >>> z = zarr.zeros(10, dtype='S6') - >>> z - - >>> z[0] = b'Hello' - >>> z[1] = b'world!' - >>> z[:] - array([b'Hello', b'world!', b'', b'', b'', b'', b'', b'', b'', b''], - dtype='|S6') - -A fixed-length unicode dtype is also available, e.g.:: - - >>> greetings = ['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', - ... 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', - ... 'こんにちは世界', '世界,你好!', 'Helló, világ!', 'Zdravo svete!', - ... 'เฮลโลเวิลด์'] - >>> text_data = greetings * 10000 - >>> z = zarr.array(text_data, dtype='U20') - >>> z - - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], - dtype='>> import numcodecs - >>> z = zarr.array(text_data, dtype=object, object_codec=numcodecs.VLenUTF8()) - >>> z - - >>> z.filters - [VLenUTF8()] - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) +and :class:`numcodecs.Pickle`. E.g. using ``VLenUTF8`` + +.. ipython:: + + In [0]: import numcodecs + + In [0]: z = zarr.array(text_data, dtype=object, object_codec=numcodecs.VLenUTF8()) + + In [0]: z + + In [0]: z.filters + + In [0]: z[:] + As a convenience, ``dtype=str`` (or ``dtype=unicode`` on Python 2.7) can be used, which -is a short-hand for ``dtype=object, object_codec=numcodecs.VLenUTF8()``, e.g.:: +is a short-hand for ``dtype=object, object_codec=numcodecs.VLenUTF8()``, e.g. + +.. ipython:: - >>> z = zarr.array(text_data, dtype=str) - >>> z - - >>> z.filters - [VLenUTF8()] - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) + In [0]: z = zarr.array(text_data, dtype=str) + + In [0]: z + + In [0]: z.filters + + In [0]: z[:] + Variable-length byte strings are also supported via ``dtype=object``. Again an ``object_codec`` is required, which can be one of :class:`numcodecs.VLenBytes` or :class:`numcodecs.Pickle`. For convenience, ``dtype=bytes`` (or ``dtype=str`` on Python 2.7) can be used as a short-hand for ``dtype=object, object_codec=numcodecs.VLenBytes()``, -e.g.:: - - >>> bytes_data = [g.encode('utf-8') for g in greetings] * 10000 - >>> z = zarr.array(bytes_data, dtype=bytes) - >>> z - - >>> z.filters - [VLenBytes()] - >>> z[:] - array([b'\xc2\xa1Hola mundo!', b'Hej V\xc3\xa4rlden!', b'Servus Woid!', - ..., b'Hell\xc3\xb3, vil\xc3\xa1g!', b'Zdravo svete!', - b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c'], dtype=object) +e.g. + +.. ipython:: + In [0]: bytes_data = [g.encode('utf-8') for g in greetings] * 10000 + + In [0]: z = zarr.array(bytes_data, dtype=bytes) + + In [0]: z + + In [0]: z.filters + + In [0]: z[:] + If you know ahead of time all the possible string values that can occur, you could also use the :class:`numcodecs.Categorize` codec to encode each unique string value as an -integer. E.g.:: +integer. E.g. + +.. 
ipython::

    In [0]: categorize = numcodecs.Categorize(greetings, dtype=object)

    In [0]: z = zarr.array(text_data, dtype=object, object_codec=categorize)

    In [0]: z

    In [0]: z.filters

    In [0]: z[:]

.. _tutorial_objects:

Object arrays
-------------

Zarr supports arrays with an "object" dtype. This allows arrays to contain any type
of object, such as variable length unicode strings, or variable length arrays of
numbers, or other possibilities. When creating an object array, a codec must be
provided via the ``object_codec`` argument. This codec handles encoding (serialization)
of Python objects. The best codec to use will depend on what type of objects are
present in the array.

At the time of writing there are three codecs available that can serve as a general
purpose object codec and support encoding of a mixture of object types:
:class:`numcodecs.JSON`, :class:`numcodecs.MsgPack`. and :class:`numcodecs.Pickle`.

For example, using the JSON codec

.. ipython::

    In [0]: z = zarr.empty(5, dtype=object, object_codec=numcodecs.JSON())

    In [0]: z[0] = 42

    In [0]: z[1] = 'foo'

    In [0]: z[2] = ['bar', 'baz', 'qux']

    In [0]: z[3] = {'a': 1, 'b': 2.2}

    In [0]: z[:]

Not all codecs support encoding of all object types. The
:class:`numcodecs.Pickle` codec is the most flexible, supporting encoding any type
of Python object. However, if you are sharing data with anyone other than yourself,
then Pickle is not recommended as it is a potential security risk. This is because
malicious code can be embedded within pickled data. The JSON and MsgPack codecs do not
have any security issues and support encoding of unicode strings, lists and
dictionaries. MsgPack is usually faster for both encoding and decoding.

Ragged arrays
~~~~~~~~~~~~~

If you need to store an array of arrays, where each member array can be of any
length and stores the same primitive type (a.k.a. a ragged array), the
:class:`numcodecs.VLenArray` codec can be used, e.g.

.. ipython::

    In [0]: z = zarr.empty(4, dtype=object, object_codec=numcodecs.VLenArray(int))

    In [0]: z

    In [0]: z.filters

    In [0]: z[0] = np.array([1, 3, 5])

    In [0]: z[1] = np.array([4])

    In [0]: z[2] = np.array([7, 9, 14])

    In [0]: z[:]

As a convenience, ``dtype='array:T'`` can be used as a short-hand for
``dtype=object, object_codec=numcodecs.VLenArray('T')``, where 'T' can be any NumPy
primitive dtype such as 'i4' or 'f8'. E.g.

.. ipython::

    In [0]: z = zarr.empty(4, dtype='array:i8')

    In [0]: z

    In [0]: z.filters

    In [0]: z[0] = np.array([1, 3, 5])

    In [0]: z[1] = np.array([4])

    In [0]: z[2] = np.array([7, 9, 14])

    In [0]: z[:]

.. _tutorial_chunks:

Chunk optimizations
-------------------

.. _tutorial_chunks_shape:

Chunk size and shape
~~~~~~~~~~~~~~~~~~~~

In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide
better performance, at least when using the Blosc compression library.

The optimal chunk shape will depend on how you want to access the data. E.g.,
for a 2-dimensional array, if you only ever take slices along the first
dimension, then chunk across the second dimension. 
If you know you want to chunk
across an entire dimension you can use ``None`` or ``-1`` within the ``chunks``
argument, e.g.

.. ipython::

    In [0]: z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4')

    In [0]: z1.chunks

Alternatively, if you only ever take slices along the second dimension, then
chunk across the first dimension, e.g.

.. ipython::

    In [0]: z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4')

    In [0]: z2.chunks

If you require reasonable performance for both access patterns then you need to
find a compromise, e.g.

.. ipython::

    In [0]: z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')

    In [0]: z3.chunks

If you are feeling lazy, you can let Zarr guess a chunk shape for your data by
providing ``chunks=True``, although please note that the algorithm for guessing
a chunk shape is based on simple heuristics and may be far from optimal. E.g.

.. ipython::

    In [0]: z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4')

    In [0]: z4.chunks

If you know you are always going to be loading the entire array into memory, you
can turn off chunks by providing ``chunks=False``, in which case there will be
one single chunk for the array

.. ipython::

    In [0]: z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4')

    In [0]: z5.chunks

.. _tutorial_chunks_order:

Chunk memory layout
~~~~~~~~~~~~~~~~~~~

The order of bytes **within each chunk** of an array can be changed via the
``order`` keyword argument, to use either C or Fortran layout. For
multi-dimensional arrays, these two layouts may provide different compression
ratios, depending on the correlation structure within the data. E.g.

.. ipython::

    In [0]: a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T

    In [0]: c = zarr.array(a, chunks=(1000, 1000))

    In [0]: c.info

    In [0]: f = zarr.array(a, chunks=(1000, 1000), order='F')

    In [0]: f.info

In the above example, Fortran order gives a better compression ratio. 
This is an artificial example but illustrates the general point that changing the order of @@ -1411,52 +1426,50 @@ If you know that your data will form chunks that are almost always non-empty, th In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. The following example illustrates the effect of the ``write_empty_chunks`` flag on -the time required to write an array with different values.:: - - >>> import zarr - >>> import numpy as np - >>> import time - >>> from tempfile import TemporaryDirectory - >>> def timed_write(write_empty_chunks): - ... """ - ... Measure the time required and number of objects created when writing - ... to a Zarr array with random ints or fill value. - ... """ - ... chunks = (8192,) - ... shape = (chunks[0] * 1024,) - ... data = np.random.randint(0, 255, shape) - ... dtype = 'uint8' - ... - ... with TemporaryDirectory() as store: - ... arr = zarr.open(store, - ... shape=shape, - ... chunks=chunks, - ... dtype=dtype, - ... write_empty_chunks=write_empty_chunks, - ... fill_value=0, - ... mode='w') - ... # initialize all chunks - ... arr[:] = 100 - ... result = [] - ... for value in (data, arr.fill_value): - ... start = time.time() - ... arr[:] = value - ... elapsed = time.time() - start - ... result.append((elapsed, arr.nchunks_initialized)) - ... - ... return result - >>> for write_empty_chunks in (True, False): - ... full, empty = timed_write(write_empty_chunks) - ... print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') - - write_empty_chunks=True: - Random Data: 0.1252s, 1024 objects stored - Empty Data: 0.1060s, 1024 objects stored - - - write_empty_chunks=False: - Random Data: 0.1359s, 1024 objects stored - Empty Data: 0.0301s, 0 objects stored +the time required to write an array with different values. + +.. ipython:: + + In [0]: import zarr + + In [0]: import numpy as np + + In [0]: import time + + In [0]: from tempfile import TemporaryDirectory + + In [0]: def timed_write(write_empty_chunks): + ...: """ + ...: Measure the time required and number of objects created when writing + ...: to a Zarr array with random ints or fill value. + ...: """ + ...: chunks = (8192,) + ...: shape = (chunks[0] * 1024,) + ...: data = np.random.randint(0, 255, shape) + ...: dtype = 'uint8' + ...: + ...: with TemporaryDirectory() as store: + ...: arr = zarr.open(store, + ...: shape=shape, + ...: chunks=chunks, + ...: dtype=dtype, + ...: write_empty_chunks=write_empty_chunks, + ...: fill_value=0, + ...: mode='w') + ...: # initialize all chunks + ...: arr[:] = 100 + ...: result = [] + ...: for value in (data, arr.fill_value): + ...: start = time.time() + ...: arr[:] = value + ...: elapsed = time.time() - start + ...: result.append((elapsed, arr.nchunks_initialized)) + ...: + ...: return result + + In [0]: for write_empty_chunks in (True, False): + ...: full, empty = timed_write(write_empty_chunks) + ...: print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') In this example, writing random data is slightly slower with ``write_empty_chunks=True``, but writing empty data is substantially faster and generates far fewer objects in storage. @@ -1472,27 +1485,37 @@ have planned. In such cases it can be advantageous to re-chunk the data. 
For small datasets, or when the mismatch between input and output chunks is small
such that only a few chunks of the input dataset need to be read to create each
chunk in the output array, it is sufficient to simply copy the data to a new array
-with the desired chunking, e.g. ::
+with the desired chunking, e.g.
+
+.. ipython::
+
+    @verbatim
+    In [0]: a = zarr.zeros((10000, 10000), chunks=(100,100), dtype='uint16', store='a.zarr')

-    >>> a = zarr.zeros((10000, 10000), chunks=(100,100), dtype='uint16', store='a.zarr')
-    >>> b = zarr.array(a, chunks=(100, 200), store='b.zarr')
+
+    @verbatim
+    In [0]: b = zarr.array(a, chunks=(100, 200), store='b.zarr')

If the chunk shapes mismatch, however, a simple copy can lead to non-optimal data
access patterns and incur a substantial performance hit when using
file based stores. One of the most pathological examples is
-switching from column-based chunking to row-based chunking e.g. ::
+switching from column-based chunking to row-based chunking e.g.

-    >>> a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
-    >>> b = zarr.array(a, chunks=(1,10000), store='b.zarr')
+.. ipython::
+
+    @verbatim
+    In [0]: a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
+
+    @verbatim
+    In [0]: b = zarr.array(a, chunks=(1,10000), store='b.zarr')

which will require every chunk in the input data set to be repeatedly read when creating
each output chunk.

If the entire array will fit within memory, this is simply resolved by forcing
the entire input array into memory as a numpy array before converting
-back to zarr with the desired chunking. ::
+back to zarr with the desired chunking.
+
+.. ipython::

-    >>> a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
-    >>> b = a[...]
-    >>> c = zarr.array(b, chunks=(1,10000), store='c.zarr')
+    @verbatim
+    In [0]: a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
+
+    @verbatim
+    In [0]: b = a[...]
+
+    @verbatim
+    In [0]: c = zarr.array(b, chunks=(1,10000), store='c.zarr')

For data sets which have mismatched chunks and which do not fit in memory, a
more sophisticated approach to rechunking, such as offered by the

@@ -1536,26 +1559,30 @@ then it is possible both workers will attempt to modify the middle chunk at the
same time, and synchronization is required to prevent data loss. Zarr provides
support for chunk-level synchronization. E.g., create an array
-with thread synchronization::
+with thread synchronization
+
+.. ipython::
+
+    In [0]: z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4',
+       ...:                synchronizer=zarr.ThreadSynchronizer())

-    >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4',
-    ...                synchronizer=zarr.ThreadSynchronizer())
-    >>> z
-
+    In [0]: z

This array is safe to read or write within a multi-threaded program.
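+As a short sketch of what that enables (an editorial illustration, not part of
+the original tutorial; the worker count and slab size are arbitrary), several
+threads can write concurrently to regions that share chunks, with the
+synchronizer guarding each chunk:
+
+.. ipython::
+
+    In [0]: from concurrent.futures import ThreadPoolExecutor
+
+    In [0]: def fill_rows(start):
+       ...:     # each task writes a 500-row slab; two slabs fall into each
+       ...:     # 1000-row chunk, so those writes are serialized per chunk
+       ...:     z[start:start + 500] = start
+
+    In [0]: with ThreadPoolExecutor(max_workers=4) as pool:
+       ...:     _ = list(pool.map(fill_rows, range(0, 10000, 500)))
+
+    In [0]: int(z[9500, 0])  # the slab starting at row 9500 wrote 9500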
Zarr also provides support for process synchronization via file locking, provided
that all processes have access to a shared file system, and provided that the
underlying file system supports file locking (which is not the case for
-some networked file systems). E.g.::
+some networked file systems). E.g.

-    >>> synchronizer = zarr.ProcessSynchronizer('data/example.sync')
-    >>> z = zarr.open_array('data/example', mode='w', shape=(10000, 10000),
-    ...                     chunks=(1000, 1000), dtype='i4',
-    ...                     synchronizer=synchronizer)
-    >>> z
-
+.. ipython::
+
+    In [0]: synchronizer = zarr.ProcessSynchronizer('data/example.sync')
+
+    In [0]: z = zarr.open_array('data/example', mode='w', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4', synchronizer=synchronizer)
+
+    In [0]: z
+

This array is safe to read or write from multiple processes.

@@ -1587,31 +1614,44 @@ store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore
that is pickled is the necessary parameters to allow the store to re-open any
underlying files or databases upon being unpickled.

-E.g., pickle/unpickle an in-memory array::
-
-    >>> import pickle
-    >>> z1 = zarr.array(np.arange(100000))
-    >>> s = pickle.dumps(z1)
-    >>> len(s) > 5000 # relatively large because data have been pickled
-    True
-    >>> z2 = pickle.loads(s)
-    >>> z1 == z2
-    True
-    >>> np.all(z1[:] == z2[:])
-    True
-
-E.g., pickle/unpickle an array stored on disk::
-
-    >>> z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8')
-    >>> z3[:] = np.arange(100000)
-    >>> s = pickle.dumps(z3)
-    >>> len(s) < 200 # small because no data have been pickled
-    True
-    >>> z4 = pickle.loads(s)
-    >>> z3 == z4
-    True
-    >>> np.all(z3[:] == z4[:])
-    True
+E.g., pickle/unpickle an in-memory array
+
+.. ipython::
+
+    In [0]: import pickle
+
+    In [0]: z1 = zarr.array(np.arange(100000))
+
+    In [0]: s = pickle.dumps(z1)
+
+    In [0]: len(s) > 5000 # relatively large because data have been pickled
+
+    In [0]: z2 = pickle.loads(s)
+
+    In [0]: z1 == z2
+
+    In [0]: np.all(z1[:] == z2[:])
+
+E.g., pickle/unpickle an array stored on disk
+
+.. ipython::
+
+    In [0]: z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8')
+
+    In [0]: z3[:] = np.arange(100000)
+
+    In [0]: s = pickle.dumps(z3)
+
+    In [0]: len(s) < 200 # small because no data have been pickled
+
+    In [0]: z4 = pickle.loads(s)
+
+    In [0]: z3 == z4
+
+    In [0]: np.all(z3[:] == z4[:])
+

.. _tutorial_datetime:

Datetimes and timedeltas
------------------------

NumPy's ``datetime64`` ('M8') and ``timedelta64`` ('m8') dtypes are supported for Zarr
-arrays, as long as the units are specified. E.g.::
-
-    >>> z = zarr.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]')
-    >>> z
-
-    >>> z[:]
-    array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')
-    >>> z[0]
-    numpy.datetime64('2007-07-13')
-    >>> z[0] = '1999-12-31'
-    >>> z[:]
-    array(['1999-12-31', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')
+arrays, as long as the units are specified. E.g.
+
+.. ipython::
+
+    In [0]: z = zarr.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]')
+
+    In [0]: z
+
+    In [0]: z[:]
+
+    In [0]: z[0]
+
+    In [0]: z[0] = '1999-12-31'
+
+    In [0]: z[:]
+

.. _tutorial_tips:

Usage tips
----------

.. _tutorial_tips_copy:

Copying large arrays
~~~~~~~~~~~~~~~~~~~~

-Data can be copied between large arrays without needing much memory, e.g.::
+Data can be copied between large arrays without needing much memory, e.g.
+
+.. ipython::

-    >>> z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4')
-    >>> z1[:] = 42
-    >>> z2 = zarr.empty_like(z1)
-    >>> z2[:] = z1
+    In [0]: z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4')
+
+    In [0]: z1[:] = 42
+
+    In [0]: z2 = zarr.empty_like(z1)
+
+    In [0]: z2[:] = z1

Internally the example above works chunk-by-chunk, extracting only the data from
``z1`` required to fill each chunk in ``z2``.
The source of the data (``z1``)

Configuring Blosc
~~~~~~~~~~~~~~~~~

The Blosc compressor is able to use multiple threads internally to
accelerate compression and decompression. By default, Blosc uses up to 8
internal threads. The number of Blosc threads can be changed to increase or
-decrease this number, e.g.::
+decrease this number, e.g.

-    >>> from numcodecs import blosc
-    >>> blosc.set_nthreads(2) # doctest: +SKIP
-    8
+.. ipython::
+
+    In [0]: from numcodecs import blosc
+
+    In [0]: blosc.set_nthreads(2)
+

When a Zarr array is being used within a multi-threaded program, Zarr
automatically switches to using Blosc in a single-threaded

From 1ba23fe2082a460c248382091f667fc79eb77f34 Mon Sep 17 00:00:00 2001
From: GbotemiB
Date: Fri, 28 Oct 2022 13:46:08 +0100
Subject: [PATCH 04/15] import ipython

---
 docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index b1f1ce194a..feebebf457 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,7 +34,7 @@
 # If your documentation needs a minimal Sphinx version, state it here.
 #needs_sphinx = '1.0'
-#import IPython
+import IPython
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.

From 948abb6c28d0f9f53993c5ff2f16f60d4e096baa Mon Sep 17 00:00:00 2001
From: GbotemiB
Date: Fri, 28 Oct 2022 16:34:02 +0100
Subject: [PATCH 05/15] ipython fix

---
 docs/conf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index feebebf457..e969898393 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,7 +34,6 @@
 # If your documentation needs a minimal Sphinx version, state it here.
 #needs_sphinx = '1.0'
-import IPython
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
From fdcb615d730a202552c9907c3cb620a8d90f110c Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Fri, 28 Oct 2022 16:38:26 +0100 Subject: [PATCH 06/15] ipython fix --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index e969898393..68ea41bcfc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,7 +45,7 @@ 'numpydoc', 'sphinx_issues', "sphinx_copybutton", - 'IPython.sphinxext.ipython_console_highlighting', + #'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive' ] From 3acb60fb44562aec334ba225eedf72f9040ed791 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Fri, 28 Oct 2022 16:40:25 +0100 Subject: [PATCH 07/15] ipython fix --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 68ea41bcfc..cf0b5bc485 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,8 +45,8 @@ 'numpydoc', 'sphinx_issues', "sphinx_copybutton", - #'IPython.sphinxext.ipython_console_highlighting', - 'IPython.sphinxext.ipython_directive' + 'IPython.sphinxext.ipython_console_highlighting', + #'IPython.sphinxext.ipython_directive' ] numpydoc_show_class_members = False From 0835ace1e4017ed33cbe569d5355b9ebf72c6349 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 10:35:08 +0100 Subject: [PATCH 08/15] extensions for ipython --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index cf0b5bc485..e969898393 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,7 +46,7 @@ 'sphinx_issues', "sphinx_copybutton", 'IPython.sphinxext.ipython_console_highlighting', - #'IPython.sphinxext.ipython_directive' + 'IPython.sphinxext.ipython_directive' ] numpydoc_show_class_members = False From fafa13f29f044f3c2dee817e93dac28a7283d63b Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 10:35:38 +0100 Subject: [PATCH 09/15] added ipython --- requirements_rtfd.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index 0a7d90358e..ddcdfad7c3 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -7,4 +7,5 @@ sphinx-copybutton sphinx-rtd-theme numpydoc numpy!=1.21.0 -msgpack-python==0.5.6 \ No newline at end of file +msgpack-python==0.5.6 +ipython \ No newline at end of file From 9ea34b804eb976596230670e108da38eec4be0d0 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 13:56:31 +0100 Subject: [PATCH 10/15] ipytree not needed --- docs/tutorial.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7607dec1ff..d005e2ce24 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1049,8 +1049,6 @@ copying a group named 'foo' from an HDF5 file to a Zarr group In [0]: import zarr In [0]: import numpy as np - - In [0]: import ipytree In [0]: source = h5py.File('data/example.h5', mode='w') From 5a3539016a70c614ecfc0491f30c378f5078bbd9 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 14:28:10 +0100 Subject: [PATCH 11/15] install ipytree --- requirements_rtfd.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index ddcdfad7c3..6806af69e1 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -8,4 +8,5 @@ sphinx-rtd-theme numpydoc numpy!=1.21.0 msgpack-python==0.5.6 -ipython \ No newline at end of file +ipython +ipytree \ No newline at end of file From 06b7ca1fef55eae52d4115a1eccfd50b30b48cae Mon Sep 17 00:00:00 
2001 From: GbotemiB Date: Mon, 31 Oct 2022 14:28:43 +0100 Subject: [PATCH 12/15] ipytree included --- docs/tutorial.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d005e2ce24..7607dec1ff 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1049,6 +1049,8 @@ copying a group named 'foo' from an HDF5 file to a Zarr group In [0]: import zarr In [0]: import numpy as np + + In [0]: import ipytree In [0]: source = h5py.File('data/example.h5', mode='w') From 0c4f3375cb978a3418157ea1866ec4411e06bcda Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 14:39:03 +0100 Subject: [PATCH 13/15] added bsddb3 --- requirements_rtfd.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index 6806af69e1..1ba627ff75 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -9,4 +9,5 @@ numpydoc numpy!=1.21.0 msgpack-python==0.5.6 ipython -ipytree \ No newline at end of file +ipytree +bsddb3 From e4c5a492f4619f422cc7b576c45adbf52115ce92 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Mon, 31 Oct 2022 14:46:01 +0100 Subject: [PATCH 14/15] berkeleydb replacing bsddb3 --- requirements_rtfd.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index 1ba627ff75..2e66299f38 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -10,4 +10,4 @@ numpy!=1.21.0 msgpack-python==0.5.6 ipython ipytree -bsddb3 +berkeleydb \ No newline at end of file From ef8eb8dd96fc1f015e7f579c613f57a99cc87f07 Mon Sep 17 00:00:00 2001 From: GbotemiB Date: Tue, 1 Nov 2022 14:34:11 +0100 Subject: [PATCH 15/15] berkeleydb --- requirements_rtfd.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index 2e66299f38..bf813d869d 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -10,4 +10,3 @@ numpy!=1.21.0 msgpack-python==0.5.6 ipython ipytree -berkeleydb \ No newline at end of file