From 02fca542f531901b565116ec893a1b0d1d59cb20 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 12:20:54 +0000 Subject: [PATCH 001/105] Using s3 to get at some real data for testing --- bnl/playing.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index ebd6b01..8666d22 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -1,18 +1,35 @@ import pyfive from pathlib import Path from pyfive.as_dataobjects import ADataObjects +import s3fs +import h5py MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -MYFILE = '../tests/chunked.hdf5' +#MYFILE = '../tests/chunked.hdf5' +MYFILE = 'da193a_25_day__198807-198807.nc' + MYPATH = Path(__file__).parent #f = h5py.File(MYPATH/MYFILE,'r') -f2 = pyfive.File(MYPATH/MYFILE) -path = 'dataset1' -link_target = f2._links[path] -dsref = ADataObjects(f2.file._fh, link_target) -chunk_index = dsref.get_offset_addresses() -print(chunk_index) + +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' +fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) +uri = 'bnl/'+MYFILE + +with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + #f2 = pyfive.File(MYPATH/MYFILE) + path = 'dataset1' + path = 'tos' + path = 'm01s06i247_4' + link_target = f2._links[path] + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + for e in chunk_index: + print(e) +with fs.open(uri,'rb') as s3file2: + f3 = h5py.File(s3file2,'r') + print(f3[path]) #v='tos' From df3669a11438be9b12dcb63e92cf6c4d74598a9f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 12:37:31 +0000 Subject: [PATCH 002/105] Getting the address as well as size into the index --- pyfive/as_dataobjects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index b811fb3..ccf9c38 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -57,7 +57,7 @@ def _as_get_chunk_addresses(self, args): size = node_key['chunk_size'] start = node_key['chunk_offset'][:-1] region = [slice(i, i+j) for i, j in zip(start, self.shape)] - self._as_chunk_index.append([region, start, size]) + self._as_chunk_index.append([region, start, addr, size]) if args is not None: return NotImplementedError From 16c0e813e0f769e5a45c55c1cb4272f09d5edf64 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 15:22:42 +0000 Subject: [PATCH 003/105] With timer --- bnl/playing.py | 55 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 8666d22..56ae5a8 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -3,33 +3,48 @@ from pyfive.as_dataobjects import ADataObjects import s3fs import h5py +import time MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' #MYFILE = '../tests/chunked.hdf5' MYFILE = 'da193a_25_day__198807-198807.nc' +INPUT_OPTIONS = [ + ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), + ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), + ('../tests/chunked.hdf5','dataset1','local'), + ('CMIP6-test.nc','tas', 's3') +] + MYPATH = Path(__file__).parent -#f = h5py.File(MYPATH/MYFILE,'r') - -S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' -fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) -uri = 'bnl/'+MYFILE - -with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - #f2 = pyfive.File(MYPATH/MYFILE) - path = 'dataset1' - path = 'tos' - path = 
'm01s06i247_4' - link_target = f2._links[path] - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() - for e in chunk_index: - print(e) -with fs.open(uri,'rb') as s3file2: - f3 = h5py.File(s3file2,'r') - print(f3[path]) +option = 1 +location = INPUT_OPTIONS[option][2] +MYFILE = INPUT_OPTIONS[option][0] + +if location == 's3': + + S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' + fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + uri = 'bnl/'+MYFILE + + t1 = time.time() + with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + #f2 = pyfive.File(MYPATH/MYFILE) + path = INPUT_OPTIONS[option][1] + link_target = f2._links[path] + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + t2 = time.time() + print(f'Chunk index timer {t2-t1:.2}s') + for e in chunk_index: + print(e) + + + with fs.open(uri,'rb') as s3file2: + f3 = h5py.File(s3file2,'r') + print(f3[path]) #v='tos' From c464be8e881d85770e752e3d7e90b4cda0cf38d2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:12:15 +0000 Subject: [PATCH 004/105] Not working yet. Don't reckon I have the arguments to OrthogonalIndexer right yet. --- bnl/playing.py | 53 ++++++++++++++++++++++++++-------------- pyfive/as_dataobjects.py | 45 ++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 56ae5a8..e379e1b 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -18,9 +18,24 @@ MYPATH = Path(__file__).parent -option = 1 +option = 2 location = INPUT_OPTIONS[option][2] MYFILE = INPUT_OPTIONS[option][0] +path = INPUT_OPTIONS[option][1] + + +def working(f2, path, printindex=True): + link_target = f2._links[path] + t1 = time.time() + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + t2 = time.time() + print(f'Chunk index timer {t2-t1:.2}s') + if printindex: + for e in chunk_index: + print(e) + + return t2 if location == 's3': @@ -31,29 +46,31 @@ t1 = time.time() with fs.open(uri,'rb') as s3file2: f2 = pyfive.File(s3file2) - #f2 = pyfive.File(MYPATH/MYFILE) - path = INPUT_OPTIONS[option][1] - link_target = f2._links[path] - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() - t2 = time.time() - print(f'Chunk index timer {t2-t1:.2}s') - for e in chunk_index: - print(e) - + t2 = working(f2, path) + print(f'Complete chunking timer {t2-t1:.2}s') with fs.open(uri,'rb') as s3file2: f3 = h5py.File(s3file2,'r') print(f3[path]) +elif location == 'local': -#v='tos' -#tos =f2[v] -#v='dataset1' -#print(tos) -#x = tos[2,:] -#print(x) -#print(tos.shape) + + f2 = pyfive.File(MYPATH/MYFILE) + x = f2[path] + y = x[2,:] + print(x.shape) + print(y) + t1 = time.time() + t2 = working(f2, path, printindex=False) + d = ADataObjects(f2.file._fh, f2._links[path]) + r = d[2:] + if len(r) != len(y): + print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") + print(r) + raise ValueError('Busted') +else: + raise ValueError('You stuffed up') diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index ccf9c38..ab5e3da 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -2,6 +2,19 @@ from .datatype_msg import DatatypeMessage import numpy as np from .btree import BTreeV1RawDataChunks +from .indexing import OrthogonalIndexer + + +class ZarrSubstitute: + """ + This mimics the funcationality of the zarr array produced by kerchunk, + but with 
only what is needed for indexing + """ + def __init__(self, chunk_index, shape, chunks): + self.chunk_index = chunk_index + self._chunks = chunks + self._shape = shape + class ADataObjects(DataObjects): """ @@ -16,7 +29,7 @@ def __init__(self,*args,**kwargs): # not yet sure we need our own copy self._as_chunk_index=[] - def get_offset_addresses(self, args=None): + def get_offset_addresses(self): """ Get the offset addresses for the data requested """ @@ -32,17 +45,18 @@ def get_offset_addresses(self, args=None): elif layout_class == 1: # contiguous storage return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage - return self._as_get_chunk_addresses(args) + return self._as_get_chunk_addresses() - def _as_get_chunk_addresses(self, args): + def _as_get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks known to the b-tree of this object """ - self._get_chunk_params() - if self._as_chunk_index == []: + + self._get_chunk_params() + chunk_btree = BTreeV1RawDataChunks( self.fh, self._chunk_address, self._chunk_dims) @@ -59,9 +73,24 @@ def _as_get_chunk_addresses(self, args): region = [slice(i, i+j) for i, j in zip(start, self.shape)] self._as_chunk_index.append([region, start, addr, size]) - if args is not None: - return NotImplementedError - return self._as_chunk_index + def __getitem__(self, args): + + if self._as_chunk_index == []: + self._as_get_chunk_addresses + + array = ZarrSubstitute(self._as_chunk_index, self.shape, self.chunks) + + indexer = OrthogonalIndexer(args, array) + stripped_indexer = [(a, b, c) for a,b,c in indexer] + print(stripped_indexer) + mycoords = [] + for chunk_coords, chunk_selection, out_selection in stripped_indexer: + coord = '.'.join([str(c) for c in chunk_coords]) + mycoords.append((chunk_coords,coord)) + print("This isn't yet doing what you think it is, it's only returning chunk indices for your selection") + return mycoords + + From afaa4f57ae72fe404b184846aa0a3ba4dd6e13a2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:16:19 +0000 Subject: [PATCH 005/105] A few more notes in the code so I can come back to it anon. --- bnl/playing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bnl/playing.py b/bnl/playing.py index e379e1b..fd1f321 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -65,10 +65,14 @@ def working(f2, path, printindex=True): t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) r = d[2:] - if len(r) != len(y): + if len(r) >= len(y): print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") + # as it's stands, r should be a set of indices for chunks containing y, which should have + # length less than or equal to length (y). At the moment it's too long, so that's clearly + # broken print(r) raise ValueError('Busted') + raise ValueError('Busted, but in a better way') else: From 18bc37c464e5d9d3b9335fc3135e20f97adc00d0 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:17:26 +0000 Subject: [PATCH 006/105] Woops. Need this. 
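
The file added below is the orthogonal indexer lifted from zarr. Given a selection over an
array it yields (chunk_coords, chunk_selection, out_selection) triples, which later patches
in this series use to read only the chunks a selection actually touches. What follows is a
minimal illustrative sketch of that consumption pattern, not part of this commit: it assumes
an in-memory dict of numpy chunks in place of chunks read from the HDF5 b-tree, and the
_ArrayStub helper is hypothetical (the series later adds the equivalent ZarrArrayStub).

    # Illustrative sketch only: in-memory chunks standing in for b-tree chunk reads.
    import numpy as np
    from pyfive.indexing import OrthogonalIndexer

    class _ArrayStub:
        # the indexer only reads _shape and _chunks from the array it is given
        def __init__(self, shape, chunks):
            self._shape, self._chunks = list(shape), list(chunks)

    shape, chunks = (4, 6), (2, 3)
    data = np.arange(24).reshape(shape)
    # stand-in chunk store keyed by chunk coordinates, as if read from the b-tree
    store = {(i, j): data[2*i:2*i+2, 3*j:3*j+3] for i in range(2) for j in range(2)}

    indexer = OrthogonalIndexer(np.s_[2, :], _ArrayStub(shape, chunks))
    out = np.empty(indexer.shape, dtype=data.dtype)
    for chunk_coords, chunk_selection, out_selection in indexer:
        # fetch only the chunks the selection touches, copy the selected part across
        out[out_selection] = store[chunk_coords][chunk_selection]
    assert np.array_equal(out, data[2, :])

A later patch wires this same loop into ADataObjects.__getitem__, swapping the dict for
chunk addresses held in the dataset's b-tree.
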
--- pyfive/indexing.py | 1101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1101 insertions(+) create mode 100644 pyfive/indexing.py diff --git a/pyfive/indexing.py b/pyfive/indexing.py new file mode 100644 index 0000000..f1e5e5e --- /dev/null +++ b/pyfive/indexing.py @@ -0,0 +1,1101 @@ +# This file was lifted from zarr 23/02/24 +# lift and shift rather than import to keep installation dependencies down +# and because we are very likely to change the array format for the chunk index +# The code is MIT licensesd. + +import collections +import itertools +import math +import numbers + +import numpy as np + +# in the original zarr source the following error definitions are an import from zarr.errors + +class _BaseZarrIndexError(IndexError): + _msg = "" + + def __init__(self, *args): + super().__init__(self._msg.format(*args)) + +class ArrayIndexError(IndexError): + pass + +def err_too_many_indices(selection, shape): + raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") + +class VindexInvalidSelectionError(_BaseZarrIndexError): + _msg = ( + "unsupported selection type for vectorized indexing; only " + "coordinate selection (tuple of integer arrays) and mask selection " + "(single Boolean array) are supported; got {0!r}" + ) +class BoundsCheckError(_BaseZarrIndexError): + _msg = "index out of bounds for dimension with length {0}" + +class NegativeStepError(IndexError): + def __init__(self): + super().__init__("only slices with step >= 1 are supported") + +# And the rest of the code is the original file. + +def is_integer(x): + """True if x is an integer (both pure Python or NumPy). + + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) + + +def is_integer_list(x): + """True if x is a list of integers. + + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. + """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x, ndim=None): + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_bool_array(x, ndim=None): + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_scalar(value, dtype): + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + +def is_pure_fancy_indexing(selection, ndim): + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). + """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. 
+ no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) + + +def is_pure_orthogonal_indexing(selection, ndim): + if not ndim: + return False + + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, (int, slice)) + for elem in selection + ) + ) + + +def normalize_integer_selection(dim_sel, dim_len): + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +ChunkDimProjection = collections.namedtuple( + "ChunkDimProjection", ("dim_chunk_ix", "dim_chunk_sel", "dim_out_sel") +) +"""A mapping from chunk to output array for a single dimension. + +Parameters +---------- +dim_chunk_ix + Index of chunk. +dim_chunk_sel + Selection of items from chunk array. +dim_out_sel + Selection of items in target (output) array. + +""" + + +class IntDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + dim_sel = normalize_integer_selection(dim_sel, dim_len) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = 1 + + def __iter__(self): + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def ceildiv(a, b): + return math.ceil(a / b) + + +class SliceDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + self.start, self.stop, self.step = dim_sel.indices(dim_len) + if self.step < 1: + raise NegativeStepError() + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = max(0, ceildiv((self.stop - self.start), self.step)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + def __iter__(self): + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) + + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) + + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array + 
dim_out_offset = ceildiv((dim_offset - self.start), self.step) + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection, shape): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection, shape): + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return selection + + +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) + + +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v + + +ChunkProjection = collections.namedtuple( + "ChunkProjection", ("chunk_coords", "chunk_selection", "out_selection") +) +"""A mapping of items from chunk to output array. Can be used to extract items from the +chunk array for loading into an output array. Can also be used to extract items from a +value array for setting/updating in a chunk array. + +Parameters +---------- +chunk_coords + Indices of chunk. +chunk_selection + Selection of items from chunk array. +out_selection + Selection of items in target (output) array. 
+ +""" + + +def is_slice(s): + return isinstance(s, slice) + + +def is_contiguous_slice(s): + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s): + return is_slice(s) and (s.step is None or s.step >= 1) + + +def is_contiguous_selection(selection): + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection): + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +# noinspection PyProtectedMember +class BasicIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BoolArrayDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError( + "Boolean arrays in an orthogonal selection must " "be 1-dimensional only" + ) + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + f"Boolean array has the wrong length for dimension; " + f"expected {dim_len}, got { dim_sel.shape[0]}" + ) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype="i8") + for dim_chunk_ix in range(self.nchunks): + dim_offset = dim_chunk_ix * self.dim_chunk_len + self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( + self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + def __iter__(self): + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + 
yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order: + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a): + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + def __init__( + self, + dim_sel, + dim_len, + dim_chunk_len, + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError( + "integer arrays in an orthogonal selection must be " "1-dimensional only" + ) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + self.nitems = len(dim_sel) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = order + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + + # find chunks that we need to visit + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def __iter__(self): + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, l: int): # noqa: E741 + return range(*s.indices(l)) + + +def ix_(selection, shape): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" + + # normalisation + selection = replace_ellipsis(selection, shape) + + # replace slice and int as these are not supported by numpy.ix_ + selection = 
[ + ( + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] if is_integer(dim_sel) else dim_sel + ) + for dim_sel, dim_len in zip(selection, shape) + ] + + # now get numpy to convert to a coordinate selection + selection = np.ix_(*selection) + + return selection + + +def oindex(a, selection): + """Implementation of orthogonal indexing with slices and ints.""" + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result + + +def oindex_set(a, selection, value): + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) + value_selection = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value_selection = tuple(value_selection) + value = value[value_selection] + a[selection] = value + + +# noinspection PyProtectedMember +class OrthogonalIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_integer_array(dim_sel): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_bool_array(dim_sel): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for orthogonal indexing; " + f"expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.array = array + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.is_advanced = not is_basic_selection(selection) + if self.is_advanced: + self.drop_axes = tuple( + i + for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntDimIndexer) + ) + else: + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # N.B., numpy doesn't support orthogonal indexing directly as yet, + # so need to work around via np.ix_. Also np.ix_ does not support a + # mixture of arrays and slices or integers, so need to convert slices + # and integers into ranges. 
+ chunk_selection = ix_(chunk_selection, self.array._chunks) + + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class OIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_orthogonal_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_orthogonal_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +class BlockIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_size in zip(selection, array._shape, array._chunks): + dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + + if is_integer(dim_sel): + if dim_sel < 0: + dim_sel = dim_numchunks + dim_sel + + start = dim_sel * dim_chunk_size + stop = start + dim_chunk_size + slice_ = slice(start, stop) + + elif is_slice(dim_sel): + start = dim_sel.start if dim_sel.start is not None else 0 + stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks + + if dim_sel.step not in {1, None}: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" + ) + + # Can't reuse wraparound_indices because it expects a numpy array + # We have integers here. 
+ if start < 0: + start = dim_numchunks + start + if stop < 0: + stop = dim_numchunks + stop + + start = start * dim_chunk_size + stop = stop * dim_chunk_size + slice_ = slice(start, stop) + + else: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexers.append(dim_indexer) + + if start >= dim_len or start < 0: + raise BoundsCheckError(dim_len) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BlockIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_block_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_block_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +def is_coordinate_selection(selection, array): + return (len(selection) == len(array._shape)) and all( + is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection + ) + + +# noinspection PyProtectedMember +def is_mask_selection(selection, array): + return ( + len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == array._shape + ) + + +# noinspection PyProtectedMember +class CoordinateIndexer: + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = tuple([i] if is_integer(i) else i for i in selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, array): + raise IndexError( + f"invalid coordinate selection; expected one integer " + f"(coordinate) array per dimension of the target array, " + f"got {selection!r}" + ) + + # handle wraparound, boundscheck + for dim_sel, dim_len in zip(selection, array.shape): + # handle wraparound + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + boundscheck_indices(dim_sel, dim_len) + + # compute chunk index for each point in the selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + ) + + # broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + self.sel_shape = selection[0].shape if selection[0].shape else (1,) + + # flatten selection + selection = [dim_sel.reshape(-1) for dim_sel in selection] + chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + + # ravel chunk indices + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) + + # group points by chunk + if 
np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + else: + sel_sort = None + + # store attributes + self.selection = selection + self.sel_sort = sel_sort + self.shape = selection[0].shape if selection[0].shape else (1,) + self.drop_axes = None + self.array = array + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + # locate the chunks we need to process + self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] + + # unravel chunk indices + self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) + + def __iter__(self): + # iterate over chunks + for i, chunk_rix in enumerate(self.chunk_rixs): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +# noinspection PyProtectedMember +class MaskIndexer(CoordinateIndexer): + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_mask_selection(selection, array): + raise IndexError( + f"invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection!r}" + ) + + # convert to indices + selection = np.nonzero(selection[0]) + + # delegate the rest to superclass + super().__init__(selection, array) + + +class VIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection, fields=fields) + elif is_mask_selection(selection, self.array): + return self.array.get_mask_selection(selection, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + self.array.set_coordinate_selection(selection, value, fields=fields) + elif is_mask_selection(selection, self.array): + self.array.set_mask_selection(selection, value, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + +def check_fields(fields, dtype): + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, (str, list, tuple)): + raise IndexError( + f"'fields' argument must be a string or list of strings; found " f"{type(fields)!r}" + ) + if fields: + if dtype.names is None: + raise IndexError("invalid 'fields' argument, array does not have any fields") + try: + if isinstance(fields, str): + # single 
field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = np.dtype([(f, dtype[f]) for f in fields]) + except KeyError as e: + raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") + else: + return out_dtype + else: + return dtype + + +def check_no_multi_fields(fields): + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError("multiple fields are not supported for this operation") + return fields + + +def pop_fields(selection): + if isinstance(selection, str): + # single field selection + fields = selection + selection = () + elif not isinstance(selection, tuple): + # single selection item, no fields + fields = None + # leave selection as-is + else: + # multiple items, split fields from selection items + fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection = tuple(s for s in selection if not isinstance(s, str)) + selection = selection[0] if len(selection) == 1 else selection + return fields, selection + + +def make_slice_selection(selection): + ls = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) + else: + raise ArrayIndexError() + else: + ls.append(dim_selection) + return ls + + +class PartialChunkIterator: + """Iterator to retrieve the specific coordinates of requested data + from within a compressed chunk. + + Parameters + ---------- + selection : tuple + tuple of slice objects to take from the chunk + arr_shape : shape of chunk to select data from + + Attributes + ----------- + arr_shape + selection + + Returns + ------- + Tuple with 3 elements: + + start: int + elements offset in the chunk to read from + nitems: int + number of elements to read in the chunk from start + partial_out_selection: list of slices + indices of a temporary empty array of size `Array._chunks` to assign + the decompressed data to after the partial read. + + Notes + ----- + An array is flattened when compressed with blosc, so this iterator takes + the wanted selection of an array and determines the wanted coordinates + of the flattened, compressed data to be read and then decompressed. The + decompressed data is then placed in a temporary empty array of size + `Array._chunks` at the indices yielded as partial_out_selection. + Once all the slices yielded by this iterator have been read, decompressed + and written to the temporary array, the wanted slice of the chunk can be + indexed from the temporary array and written to the out_selection slice + of the out array. 
+ + """ + + def __init__(self, selection, arr_shape): + selection = make_slice_selection(selection) + self.arr_shape = arr_shape + + # number of selection dimensions can't be greater than the number of chunk dimensions + if len(selection) > len(self.arr_shape): + raise ValueError( + "Selection has more dimensions then the array:\n" + f"selection dimensions = {len(selection)}\n" + f"array dimensions = {len(self.arr_shape)}" + ) + + # any selection can not be out of the range of the chunk + selection_shape = np.empty(self.arr_shape)[tuple(selection)].shape + if any( + selection_dim < 0 or selection_dim > arr_dim + for selection_dim, arr_dim in zip(selection_shape, self.arr_shape) + ): + raise IndexError( + "a selection index is out of range for the dimension" + ) # pragma: no cover + + for i, dim_size in enumerate(self.arr_shape[::-1]): + index = len(self.arr_shape) - (i + 1) + if index <= len(selection) - 1: + slice_size = selection_shape[index] + if slice_size == dim_size and index > 0: + selection.pop() + else: + break + + chunk_loc_slices = [] + last_dim_slice = None if selection[-1].step > 1 else selection.pop() + for arr_shape_i, sl in zip(arr_shape, selection): + dim_chunk_loc_slices = [] + assert isinstance(sl, slice) + for x in slice_to_range(sl, arr_shape_i): + dim_chunk_loc_slices.append(slice(x, x + 1, 1)) + chunk_loc_slices.append(dim_chunk_loc_slices) + if last_dim_slice: + chunk_loc_slices.append([last_dim_slice]) + self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) + + def __iter__(self): + chunk1 = self.chunk_loc_slices[0] + nitems = (chunk1[-1].stop - chunk1[-1].start) * np.prod( + self.arr_shape[len(chunk1) :], dtype=int + ) + for partial_out_selection in self.chunk_loc_slices: + start = 0 + for i, sl in enumerate(partial_out_selection): + start += sl.start * np.prod(self.arr_shape[i + 1 :], dtype=int) + yield start, nitems, partial_out_selection From 4b0ac08545cad640a8a3204ee2ed6af50352048d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 17:33:53 +0000 Subject: [PATCH 007/105] First working lazy read (only reads chunks needed for selection) --- bnl/playing.py | 17 ++++++----------- bnl/playings3.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 11 deletions(-) create mode 100644 bnl/playings3.py diff --git a/bnl/playing.py b/bnl/playing.py index fd1f321..091e18f 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -4,6 +4,7 @@ import s3fs import h5py import time +import numpy as np MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' #MYFILE = '../tests/chunked.hdf5' @@ -13,7 +14,8 @@ ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), ('../tests/chunked.hdf5','dataset1','local'), - ('CMIP6-test.nc','tas', 's3') + ('CMIP6-test.nc','tas', 's3'), + ('CMIP6-test.nc','tas', 'local') ] MYPATH = Path(__file__).parent @@ -55,7 +57,6 @@ def working(f2, path, printindex=True): elif location == 'local': - f2 = pyfive.File(MYPATH/MYFILE) x = f2[path] y = x[2,:] @@ -64,15 +65,9 @@ def working(f2, path, printindex=True): t1 = time.time() t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) - r = d[2:] - if len(r) >= len(y): - print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") - # as it's stands, r should be a set of indices for chunks containing y, which should have - # length less than or equal to length (y). 
At the moment it's too long, so that's clearly - # broken - print(r) - raise ValueError('Busted') - raise ValueError('Busted, but in a better way') + r = d[2,:] + assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' + print(r) else: diff --git a/bnl/playings3.py b/bnl/playings3.py new file mode 100644 index 0000000..67b2f65 --- /dev/null +++ b/bnl/playings3.py @@ -0,0 +1,21 @@ +import h5py +import pyfive +import s3fs + +MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' + + +uri = 'bnl/'+MYFILE +fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + + +with fs.open(uri,'rb') as s3file: + f = h5py.File(s3file,'r') + tos =f['tos'] + print(tos) +with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + tos2 = f2['tos'] + print(tos2) + From 5356aa0163d53c0140fd95807e8e3ab36c0995cb Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 17:37:14 +0000 Subject: [PATCH 008/105] Woops didnt' commit the real oil --- bnl/playings3.py | 21 ------------- pyfive/as_dataobjects.py | 65 +++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 42 deletions(-) delete mode 100644 bnl/playings3.py diff --git a/bnl/playings3.py b/bnl/playings3.py deleted file mode 100644 index 67b2f65..0000000 --- a/bnl/playings3.py +++ /dev/null @@ -1,21 +0,0 @@ -import h5py -import pyfive -import s3fs - -MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' - - -uri = 'bnl/'+MYFILE -fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) - - -with fs.open(uri,'rb') as s3file: - f = h5py.File(s3file,'r') - tos =f['tos'] - print(tos) -with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - tos2 = f2['tos'] - print(tos2) - diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index ab5e3da..278a912 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -5,15 +5,14 @@ from .indexing import OrthogonalIndexer -class ZarrSubstitute: +class ZarrArrayStub: """ This mimics the funcationality of the zarr array produced by kerchunk, but with only what is needed for indexing """ - def __init__(self, chunk_index, shape, chunks): - self.chunk_index = chunk_index - self._chunks = chunks - self._shape = shape + def __init__(self, shape, chunks): + self._chunks = list(chunks) + self._shape = list(shape) class ADataObjects(DataObjects): @@ -26,8 +25,11 @@ def __init__(self,*args,**kwargs): """ super().__init__(*args,**kwargs) - # not yet sure we need our own copy - self._as_chunk_index=[] + # Need our own copy for now to utilise the zarr indexer. + # An optimisation could be to modify what is returned from OrthogonalIndexer + self._zchunk_index={} + + self.order='C' def get_offset_addresses(self): """ @@ -53,7 +55,7 @@ def _as_get_chunk_addresses(self): Get the offset addresses associated with all the chunks known to the b-tree of this object """ - if self._as_chunk_index == []: + if self._zchunk_index == {}: self._get_chunk_params() @@ -63,6 +65,12 @@ def _as_get_chunk_addresses(self): count = np.prod(self.shape) itemsize = np.dtype(self.dtype).itemsize chunk_buffer_size = count * itemsize + + # The zarr orthogonal indexer returns the position in chunk + # space, whereas pyfive wants the position in array space. + # Here we index the pyfive chunk_index in zarr index space. 
+ + ichunks = [1/c for c in self.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): @@ -70,27 +78,42 @@ def _as_get_chunk_addresses(self): if self.filter_pipeline: size = node_key['chunk_size'] start = node_key['chunk_offset'][:-1] - region = [slice(i, i+j) for i, j in zip(start, self.shape)] - self._as_chunk_index.append([region, start, addr, size]) + key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) + self._zchunk_index[key] = (addr,size) def __getitem__(self, args): - if self._as_chunk_index == []: - self._as_get_chunk_addresses + if self._zchunk_index == {}: + self._as_get_chunk_addresses() + print("Loaded addresses for ", len(self._zchunk_index),' chunks') - array = ZarrSubstitute(self._as_chunk_index, self.shape, self.chunks) + array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) stripped_indexer = [(a, b, c) for a,b,c in indexer] - print(stripped_indexer) - mycoords = [] - for chunk_coords, chunk_selection, out_selection in stripped_indexer: - coord = '.'.join([str(c) for c in chunk_coords]) - mycoords.append((chunk_coords,coord)) - print("This isn't yet doing what you think it is, it's only returning chunk indices for your selection") - return mycoords - + filter_pipeline=None #FIXME, needs to be an argument or grabbed from somewhere + count = np.prod(self.chunks) + itemsize = np.dtype(self.dtype).itemsize + default_chunk_buffer_size = itemsize*count + + out_shape = indexer.shape + out = np.empty(out_shape, dtype=self.dtype, order=self.order) + + for chunk_coords, chunk_selection, out_selection in stripped_indexer: + addr, chunk_buffer_size = self._zchunk_index[chunk_coords] + self.fh.seek(addr) + if filter_pipeline is None: + chunk_buffer = self.fh.read(default_chunk_buffer_size) + else: + raise NotImplementedError + # The plan here would be to take the _filter_chunk method from BTree1RawDataChunks + # pop it out on it's own and make it a class method here as well as wherever else it needs to be + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) + out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] + + return out + From 9fe23943c1c77260df7ebf55ce9843e49c73b042 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 18:54:06 +0000 Subject: [PATCH 009/105] Should now support filtering chunks in the partical chunk loading. Haven't got any tests around this yet. 
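
Filtering a chunk here means undoing the HDF5 filter pipeline on the raw bytes read for that
single chunk before reshaping them. Below is a rough sketch of the common deflate (gzip)
case, using only zlib and numpy and assuming no shuffle or checksum filters are in play
(pyfive's _filter_chunk handles those as well); the decode_deflate_chunk helper is
hypothetical, not pyfive API.

    # Illustrative sketch only: decode one deflate-compressed chunk buffer.
    import zlib
    import numpy as np

    def decode_deflate_chunk(raw_bytes, dtype, chunk_shape, order='C'):
        # reverse the deflate (gzip) filter, then view the bytes as one chunk of the array
        buf = zlib.decompress(raw_bytes)
        return np.frombuffer(buf, dtype=dtype).reshape(chunk_shape, order=order)

The get_one_chunk_buffer method added below does the general version of this: seek to the
chunk's address, read size bytes, and pass them through whatever filter pipeline the dataset
declares.
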
--- pyfive/as_dataobjects.py | 32 ++++++++++++-------------------- pyfive/btree.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 278a912..1f1515b 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -59,7 +59,7 @@ def _as_get_chunk_addresses(self): self._get_chunk_params() - chunk_btree = BTreeV1RawDataChunks( + self.chunk_btree = BTreeV1RawDataChunks( self.fh, self._chunk_address, self._chunk_dims) count = np.prod(self.shape) @@ -72,43 +72,35 @@ def _as_get_chunk_addresses(self): ichunks = [1/c for c in self.chunks] - for node in chunk_btree.all_nodes[0]: + for node in self.chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): - size = chunk_buffer_size - if self.filter_pipeline: - size = node_key['chunk_size'] + size = node_key['chunk_size'] + if self._filter_pipeline: + filter_mask = node_key['filter_mask'] + else: + filter_mask=None start = node_key['chunk_offset'][:-1] key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) - self._zchunk_index[key] = (addr,size) + self._zchunk_index[key] = (addr,size,filter_mask) def __getitem__(self, args): if self._zchunk_index == {}: self._as_get_chunk_addresses() - print("Loaded addresses for ", len(self._zchunk_index),' chunks') array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) stripped_indexer = [(a, b, c) for a,b,c in indexer] - filter_pipeline=None #FIXME, needs to be an argument or grabbed from somewhere - count = np.prod(self.chunks) - itemsize = np.dtype(self.dtype).itemsize - default_chunk_buffer_size = itemsize*count - + itemsize = np.dtype(self.dtype).itemsize out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self.order) for chunk_coords, chunk_selection, out_selection in stripped_indexer: - addr, chunk_buffer_size = self._zchunk_index[chunk_coords] - self.fh.seek(addr) - if filter_pipeline is None: - chunk_buffer = self.fh.read(default_chunk_buffer_size) - else: - raise NotImplementedError - # The plan here would be to take the _filter_chunk method from BTree1RawDataChunks - # pop it out on it's own and make it a class method here as well as wherever else it needs to be + addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] + chunk_buffer = self.chunk_btree.get_one_chunk_buffer( + addr, chunk_buffer_size, itemsize,self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] diff --git a/pyfive/btree.py b/pyfive/btree.py index 38518fc..d4ee3cd 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -199,6 +199,20 @@ def construct_data_from_chunks( non_padded_region = tuple([slice(i) for i in data_shape]) return data[non_padded_region] + + def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mask): + """ + Used when getting data chunk by chunk for reading partial data arrays + All the shaping and positioning is done in the calling function. 
+ """ + self.fh.seek(addr) + chunk_buffer = self.fh.read(size) + if filter_pipeline is not None: + chunk_buffer = self.fh.read(size) + filter_mask = filter_mask + chunk_buffer = self._filter_chunk( + chunk_buffer, filter_mask, filter_pipeline, itemsize) + return chunk_buffer @classmethod def _filter_chunk(cls, chunk_buffer, filter_mask, filter_pipeline, itemsize): From dafb3c9bf25956c9f34abce919c289cddd610198 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 25 Feb 2024 09:05:32 +0000 Subject: [PATCH 010/105] Some additional documentation --- pyfive/as_dataobjects.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 1f1515b..66d8caf 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -8,7 +8,7 @@ class ZarrArrayStub: """ This mimics the funcationality of the zarr array produced by kerchunk, - but with only what is needed for indexing + but with only what is needed for indexing. """ def __init__(self, shape, chunks): self._chunks = list(chunks) @@ -17,7 +17,7 @@ def __init__(self, shape, chunks): class ADataObjects(DataObjects): """ - Subclass of DataObjets which access the chunk addresses for a given slice of data + Subclass of DataObjets which accesses the chunk addresses for a given slice of data """ def __init__(self,*args,**kwargs): """ @@ -26,9 +26,7 @@ def __init__(self,*args,**kwargs): super().__init__(*args,**kwargs) # Need our own copy for now to utilise the zarr indexer. - # An optimisation could be to modify what is returned from OrthogonalIndexer self._zchunk_index={} - self.order='C' def get_offset_addresses(self): @@ -47,13 +45,15 @@ def get_offset_addresses(self): elif layout_class == 1: # contiguous storage return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage - return self._as_get_chunk_addresses() - + self._as_get_chunk_addresses() + return self._zchunk_index + def _as_get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks - known to the b-tree of this object + known to the b-tree of this object, and load them into + an index suitable for use with the zarr indexer. """ if self._zchunk_index == {}: @@ -64,7 +64,6 @@ def _as_get_chunk_addresses(self): count = np.prod(self.shape) itemsize = np.dtype(self.dtype).itemsize - chunk_buffer_size = count * itemsize # The zarr orthogonal indexer returns the position in chunk # space, whereas pyfive wants the position in array space. @@ -76,6 +75,7 @@ def _as_get_chunk_addresses(self): for node_key, addr in zip(node['keys'], node['addresses']): size = node_key['chunk_size'] if self._filter_pipeline: + # I am not sure this varies per chunk, but in case it does filter_mask = node_key['filter_mask'] else: filter_mask=None @@ -84,6 +84,10 @@ def _as_get_chunk_addresses(self): self._zchunk_index[key] = (addr,size,filter_mask) def __getitem__(self, args): + """ + Use the zarr orthongal indexer to extract data for a specfic selection within + the dataset array and in doing so, only load the relevant chunks. + """ if self._zchunk_index == {}: self._as_get_chunk_addresses() @@ -91,16 +95,17 @@ def __getitem__(self, args): array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) - stripped_indexer = [(a, b, c) for a,b,c in indexer] + # FIXME: Need to understand what drop_axes was up to and whether or not + # it is relevant to this or not (I didn't understand it in the zarr implementation). 
itemsize = np.dtype(self.dtype).itemsize out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self.order) - for chunk_coords, chunk_selection, out_selection in stripped_indexer: + for chunk_coords, chunk_selection, out_selection in indexer: addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize,self._filter_pipeline, filter_mask) + addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] From 53e4ebe1715d6ec6f34b0a62ad32752c43f5a6f6 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 29 Feb 2024 10:00:01 +0000 Subject: [PATCH 011/105] Seems to work, prior to re-integration --- bnl/playing.py | 5 ++--- pyfive/as_dataobjects.py | 34 ++++++++++++++-------------------- pyfive/btree.py | 4 ++-- pyfive/indexing.py | 11 +++++++++++ 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 091e18f..aeff19d 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -30,7 +30,7 @@ def working(f2, path, printindex=True): link_target = f2._links[path] t1 = time.time() dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() + chunk_index = dsref._get_chunk_addresses() t2 = time.time() print(f'Chunk index timer {t2-t1:.2}s') if printindex: @@ -66,10 +66,9 @@ def working(f2, path, printindex=True): t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) r = d[2,:] + print(r) assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' print(r) - - else: raise ValueError('You stuffed up') diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 66d8caf..7e32398 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -2,18 +2,7 @@ from .datatype_msg import DatatypeMessage import numpy as np from .btree import BTreeV1RawDataChunks -from .indexing import OrthogonalIndexer - - -class ZarrArrayStub: - """ - This mimics the funcationality of the zarr array produced by kerchunk, - but with only what is needed for indexing. 
- """ - def __init__(self, shape, chunks): - self._chunks = list(chunks) - self._shape = list(shape) - +from .indexing import OrthogonalIndexer, ZarrArrayStub class ADataObjects(DataObjects): """ @@ -29,7 +18,7 @@ def __init__(self,*args,**kwargs): self._zchunk_index={} self.order='C' - def get_offset_addresses(self): + def _get_offset_addresses(self): """ Get the offset addresses for the data requested """ @@ -46,10 +35,18 @@ def get_offset_addresses(self): return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage self._as_get_chunk_addresses() - return self._zchunk_index + + def get_chunk_details(self, chunk_coords): + """ + Returns the chunk details associated with chunk coords + returned by the Zarr orthogonal indexer + """ + if self._zchunk_index == {}: + self._get_chunk_addresses() + return self._zchunk_index[chunk_coords] - def _as_get_chunk_addresses(self): + def _get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks known to the b-tree of this object, and load them into @@ -85,13 +82,10 @@ def _as_get_chunk_addresses(self): def __getitem__(self, args): """ - Use the zarr orthongal indexer to extract data for a specfic selection within + Use the zarr orthogonal indexer to extract data for a specfic selection within the dataset array and in doing so, only load the relevant chunks. """ - if self._zchunk_index == {}: - self._as_get_chunk_addresses() - array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) @@ -103,7 +97,7 @@ def __getitem__(self, args): out = np.empty(out_shape, dtype=self.dtype, order=self.order) for chunk_coords, chunk_selection, out_selection in indexer: - addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] + addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) chunk_buffer = self.chunk_btree.get_one_chunk_buffer( addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) diff --git a/pyfive/btree.py b/pyfive/btree.py index d4ee3cd..fedbc64 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -208,8 +208,8 @@ def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mas self.fh.seek(addr) chunk_buffer = self.fh.read(size) if filter_pipeline is not None: - chunk_buffer = self.fh.read(size) - filter_mask = filter_mask + #chunk_buffer = self.fh.read(size) + #filter_mask = filter_mask chunk_buffer = self._filter_chunk( chunk_buffer, filter_mask, filter_pipeline, itemsize) return chunk_buffer diff --git a/pyfive/indexing.py b/pyfive/indexing.py index f1e5e5e..e43ab9f 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -10,6 +10,17 @@ import numpy as np + +class ZarrArrayStub: + """ + This mimics the functionality of the zarr array produced by kerchunk, + but with only what is needed for indexing. This is the only "new" code + on top of the original Zarr source. 
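+    The orthogonal indexer only touches the _shape and _chunks attributes of the array object it is handed, so that is all the stub needs to carry.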
+ """ + def __init__(self, shape, chunks): + self._chunks = list(chunks) + self._shape = list(shape) + # in the original zarr source the following error definitions are an import from zarr.errors class _BaseZarrIndexError(IndexError): From 9ac0bbdc4cf8c783b17e304ba19bac846bd82e93 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 1 Mar 2024 20:31:19 +0000 Subject: [PATCH 012/105] Moved chunk support into standard API --- pyfive/as_dataobjects.py | 110 ----------------- pyfive/dataobjects.py | 252 ++++++++++++++++++++++++++++----------- pyfive/high_level.py | 4 +- 3 files changed, 185 insertions(+), 181 deletions(-) delete mode 100644 pyfive/as_dataobjects.py diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py deleted file mode 100644 index 7e32398..0000000 --- a/pyfive/as_dataobjects.py +++ /dev/null @@ -1,110 +0,0 @@ -from .dataobjects import DataObjects, DATA_STORAGE_MSG_TYPE -from .datatype_msg import DatatypeMessage -import numpy as np -from .btree import BTreeV1RawDataChunks -from .indexing import OrthogonalIndexer, ZarrArrayStub - -class ADataObjects(DataObjects): - """ - Subclass of DataObjets which accesses the chunk addresses for a given slice of data - """ - def __init__(self,*args,**kwargs): - """ - Initialise via super class - """ - super().__init__(*args,**kwargs) - - # Need our own copy for now to utilise the zarr indexer. - self._zchunk_index={} - self.order='C' - - def _get_offset_addresses(self): - """ - Get the offset addresses for the data requested - """ - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - msg_offset = msg['offset_to_message'] - version, dims, layout_class, property_offset = ( - self._get_data_message_properties(msg_offset)) - - if layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif layout_class == 1: # contiguous storage - return NotImplementedError("Contiguous storage") - if layout_class == 2: # chunked storage - self._as_get_chunk_addresses() - - def get_chunk_details(self, chunk_coords): - """ - Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer - """ - if self._zchunk_index == {}: - self._get_chunk_addresses() - - return self._zchunk_index[chunk_coords] - - def _get_chunk_addresses(self): - """ - Get the offset addresses associated with all the chunks - known to the b-tree of this object, and load them into - an index suitable for use with the zarr indexer. - """ - if self._zchunk_index == {}: - - self._get_chunk_params() - - self.chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - - count = np.prod(self.shape) - itemsize = np.dtype(self.dtype).itemsize - - # The zarr orthogonal indexer returns the position in chunk - # space, whereas pyfive wants the position in array space. - # Here we index the pyfive chunk_index in zarr index space. 
- - ichunks = [1/c for c in self.chunks] - - for node in self.chunk_btree.all_nodes[0]: - for node_key, addr in zip(node['keys'], node['addresses']): - size = node_key['chunk_size'] - if self._filter_pipeline: - # I am not sure this varies per chunk, but in case it does - filter_mask = node_key['filter_mask'] - else: - filter_mask=None - start = node_key['chunk_offset'][:-1] - key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) - self._zchunk_index[key] = (addr,size,filter_mask) - - def __getitem__(self, args): - """ - Use the zarr orthogonal indexer to extract data for a specfic selection within - the dataset array and in doing so, only load the relevant chunks. - """ - - array = ZarrArrayStub(self.shape, self.chunks) - - indexer = OrthogonalIndexer(args, array) - # FIXME: Need to understand what drop_axes was up to and whether or not - # it is relevant to this or not (I didn't understand it in the zarr implementation). - - itemsize = np.dtype(self.dtype).itemsize - out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.order) - - for chunk_coords, chunk_selection, out_selection in indexer: - addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) - chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] - - return out - - - - diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 886b48f..a2fe60b 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -18,6 +18,7 @@ from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from .indexing import OrthogonalIndexer, ZarrArrayStub class DataObjects(object): @@ -25,7 +26,7 @@ class DataObjects(object): HDF5 DataObjects. """ - def __init__(self, fh, offset): + def __init__(self, fh, offset, order='C'): """ initalize. """ fh.seek(offset) version_hint = struct.unpack_from('= 1) and (version <= 4) - return version, dims, layout_class, property_offset - - def _get_contiguous_data(self, property_offset): - data_offset, = struct.unpack_from(' 0 + + +class DatasetDataObject(DataObjects): + """ + Subclass of DataObjects associated with one Dataset, and + which handles actual data access. + """ + def __init__(self,*args,**kwargs): + """ + Initialise via super class + """ + super().__init__(*args,**kwargs) + + # Need our own copy for now to utilise the zarr indexer. + self._zchunk_index={} + self.order='C' + # offset and size from data storage message + msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] + self.msg_offset = msg['offset_to_message'] + version, dims, self.layout_class, self.property_offset = ( + self._get_data_message_properties(self.msg_offset)) + + def get_data(self, args=None): + """ + Return the data pointed to in the DataObject. 
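+        For chunked storage, a selection passed in args means only the chunks which intersect that selection are read.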
+ """ + + if self.layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif self.layout_class == 1: # contiguous storage + if args is None: + return self._get_contiguous_data(self.property_offset) + else: + return self._get_contiguous_data(self.property_offset)[args] + if self.layout_class == 2: # chunked storage + if args is None: + return self._get_chunked_data(self.msg_offset) + else: + return self._get_selection_via_chunks(args) + + def _get_data_message_properties(self, msg_offset): + """ Return the message properties of the DataObject. """ + dims, layout_class, property_offset = None, None, None + version, arg1, arg2 = struct.unpack_from( + '= 1) and (version <= 4) + return version, dims, layout_class, property_offset + + def _get_contiguous_data(self, property_offset): + data_offset, = struct.unpack_from(' Date: Fri, 1 Mar 2024 21:30:57 +0000 Subject: [PATCH 013/105] removing playing code --- bnl/playing.py | 74 -------------------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 bnl/playing.py diff --git a/bnl/playing.py b/bnl/playing.py deleted file mode 100644 index aeff19d..0000000 --- a/bnl/playing.py +++ /dev/null @@ -1,74 +0,0 @@ -import pyfive -from pathlib import Path -from pyfive.as_dataobjects import ADataObjects -import s3fs -import h5py -import time -import numpy as np - -MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -#MYFILE = '../tests/chunked.hdf5' -MYFILE = 'da193a_25_day__198807-198807.nc' - -INPUT_OPTIONS = [ - ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), - ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), - ('../tests/chunked.hdf5','dataset1','local'), - ('CMIP6-test.nc','tas', 's3'), - ('CMIP6-test.nc','tas', 'local') -] - -MYPATH = Path(__file__).parent - -option = 2 -location = INPUT_OPTIONS[option][2] -MYFILE = INPUT_OPTIONS[option][0] -path = INPUT_OPTIONS[option][1] - - -def working(f2, path, printindex=True): - link_target = f2._links[path] - t1 = time.time() - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref._get_chunk_addresses() - t2 = time.time() - print(f'Chunk index timer {t2-t1:.2}s') - if printindex: - for e in chunk_index: - print(e) - - return t2 - -if location == 's3': - - S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' - fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) - uri = 'bnl/'+MYFILE - - t1 = time.time() - with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - t2 = working(f2, path) - print(f'Complete chunking timer {t2-t1:.2}s') - - with fs.open(uri,'rb') as s3file2: - f3 = h5py.File(s3file2,'r') - print(f3[path]) - -elif location == 'local': - - f2 = pyfive.File(MYPATH/MYFILE) - x = f2[path] - y = x[2,:] - print(x.shape) - print(y) - t1 = time.time() - t2 = working(f2, path, printindex=False) - d = ADataObjects(f2.file._fh, f2._links[path]) - r = d[2,:] - print(r) - assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' - print(r) -else: - raise ValueError('You stuffed up') - From 96dc178e989c8eedc20b8268e022bdb9140556ca Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 3 Mar 2024 09:29:15 +0000 Subject: [PATCH 014/105] Fixes bug which stops the selection read from actually occurring and changes to actually using the filter pipeline. At this point is failling test_reference. 
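For reference, the sort of read this is meant to service is just a normal sliced access, along these lines (a sketch only, using the chunked, compressed test file and variable name from the test suite):

    import pyfive

    with pyfive.File('tests/compressed.hdf5') as f:
        dset = f['dataset3']
        # the slice is now passed straight through to get_data(args), so only
        # the chunks which intersect the selection are read, and whatever
        # filter pipeline the variable carries is applied chunk by chunk
        subset = dset[2:4, :]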
--- pyfive/__init__.py | 1 + pyfive/dataobjects.py | 4 ++-- pyfive/high_level.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 9df950e..9fa6180 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -3,5 +3,6 @@ """ from .high_level import File +from .indexing import ZarrArrayStub, OrthogonalIndexer __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 0730e12..90be3c4 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -741,7 +741,7 @@ def _get_chunk_addresses(self): for node in self.chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): size = node_key['chunk_size'] - if self._filter_pipeline: + if self.filter_pipeline: # I am not sure this varies per chunk, but in case it does filter_mask = node_key['filter_mask'] else: @@ -768,7 +768,7 @@ def _get_selection_via_chunks(self, args): for chunk_coords, chunk_selection, out_selection in indexer: addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) + addr, chunk_buffer_size, itemsize, self.filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 6614f3a..c033217 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -276,7 +276,7 @@ def __repr__(self): return '' % info def __getitem__(self, args): - data = self._dataobjects.get_data()[args] + data = self._dataobjects.get_data(args) if self._astype is None: return data return data.astype(self._astype) From eb44c15f3bd8fc0c7defc3ea91d80037d7136153 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 3 Mar 2024 09:45:00 +0000 Subject: [PATCH 015/105] Hack to avoid reference datatypes in chunk by chunk selections. --- pyfive/dataobjects.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 90be3c4..e427d08 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -629,8 +629,12 @@ def get_data(self, args=None): else: return self._get_contiguous_data(self.property_offset)[args] if self.layout_class == 2: # chunked storage + # If reading all chunks, use the (hopefully faster) "do it one go" method. 
+ # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if args is None: return self._get_chunked_data(self.msg_offset) + elif isinstance(self.dtype, tuple): + return self._get_chunked_data(self.msg_offset)[args] else: return self._get_selection_via_chunks(args) From 51f7cca261e03aa3dabae93350c6a3589a6c8847 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 07:31:31 +0000 Subject: [PATCH 016/105] Remove obsolete function --- pyfive/dataobjects.py | 19 ------------------- pyfive/high_level.py | 1 + 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index e427d08..7b0c1a8 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -690,25 +690,6 @@ def _get_chunked_data(self, offset): self.fh, self._chunk_address, self._chunk_dims) return chunk_btree.construct_data_from_chunks( self.chunks, self.shape, self.dtype, self.filter_pipeline) - - - def _get_offset_addresses(self): - """ - Get the offset addresses for the data requested - """ - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - msg_offset = msg['offset_to_message'] - version, dims, layout_class, property_offset = ( - self._get_data_message_properties(msg_offset)) - - if layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif layout_class == 1: # contiguous storage - return NotImplementedError("Contiguous storage") - if layout_class == 2: # chunked storage - self._as_get_chunk_addresses() def get_chunk_details(self, chunk_coords): """ diff --git a/pyfive/high_level.py b/pyfive/high_level.py index c033217..8bf0544 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -323,6 +323,7 @@ def ndim(self): @property def dtype(self): """ dtype attribute. """ + # In the HDF5 implementation this is a numpy dtype return self._dataobjects.dtype @property From 1f61d6c1e03ec21ba9bc2991d17473f311d141a1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 08:05:38 +0000 Subject: [PATCH 017/105] Support for third party access to contiguous data address and size. Also remove list definition which breaks references. --- pyfive/dataobjects.py | 21 +++++++++++++++++---- pyfive/indexing.py | 7 +++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 7b0c1a8..100210c 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -694,12 +694,25 @@ def _get_chunked_data(self, offset): def get_chunk_details(self, chunk_coords): """ Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer + returned by the Zarr orthogonal indexer. The special case + is that if the data is contiguous, we still want to return + the offset and size, as the point of this entry point is + to provide third party applications an address to the data. """ - if self._zchunk_index == {}: - self._get_chunk_addresses() + if self.layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif self.layout_class == 1: # contiguous storage + # This option never used by pyfive itself as we use the memory map for + # access to contiguous data, but third parties may need it. + # Ignore coordinates, just give the location and size of entire array + data_offset, = struct.unpack_from(' Date: Thu, 7 Mar 2024 17:35:41 +0000 Subject: [PATCH 018/105] First cut, fails references and classic, even with new stuff turned off? 
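The new controls are plain attributes on DatasetDataObject, so they can be flipped from outside for testing, roughly like this (a sketch; the file and variable names are purely illustrative, and access goes via the private _dataobjects attribute):

    import pyfive

    f = pyfive.File('somefile_with_contiguous_data.nc')   # illustrative name
    dset = f['a_contiguous_variable']                      # illustrative name
    dset._dataobjects.avoid_mmap = True            # never attempt np.memmap
    dset._dataobjects.pseudo_chunking = True       # read contiguous data in blocks
    dset._dataobjects.pseudo_block_size_kib = 64   # block size for those reads
    data = dset[:]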
--- pyfive/dataobjects.py | 105 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 9 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 100210c..d221698 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -5,6 +5,7 @@ from collections import OrderedDict import struct import warnings +from io import UnsupportedOperation import numpy as np @@ -610,6 +611,18 @@ def __init__(self,*args,**kwargs): self._zchunk_index={} self.order='C' + ########################################################################## + # pseudo chunking control. + #these can be changed from outside for testing purposes + # pseudo chunk blocksize: this is a size below which we don't bother + # pseudo chunking for contiguous data and just load the lot at data + # access time: units are kibibytes + self.pseudo_chunking = False + self.pseudo_block_size_kib = 0 + # We can't use mmaps on S3 + self.avoid_mmap = False + ########################################################################## + # offset and size from data storage message msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] self.msg_offset = msg['offset_to_message'] @@ -624,10 +637,9 @@ def get_data(self, args=None): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - if args is None: - return self._get_contiguous_data(self.property_offset) - else: - return self._get_contiguous_data(self.property_offset)[args] + if self.avoid_mmap: + return self._get_selection_from_contiguous(args) + return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage # If reading all chunks, use the (hopefully faster) "do it one go" method. # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version @@ -659,17 +671,20 @@ def _get_data_message_properties(self, msg_offset): assert (version >= 1) and (version <= 4) return version, dims, layout_class, property_offset - def _get_contiguous_data(self, property_offset): + def _get_contiguous_data(self, property_offset, args): data_offset, = struct.unpack_from(' Date: Thu, 7 Mar 2024 18:07:52 +0000 Subject: [PATCH 019/105] This version appears to now support failing over from a memory map to a pseudo chunked read. Lots of things to do around optimising that read, but let's test this more widely first. 
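The case this is really aimed at is S3, where the file object cannot be memory mapped at all. Essentially the recipe from the (now removed) bnl/playing.py, with the same test endpoint and object names:

    import pyfive
    import s3fs

    S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/'
    fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL})
    with fs.open('bnl/da193a_25_day__198807-198807.nc', 'rb') as s3file:
        f = pyfive.File(s3file)
        v = f['m01s06i247_4']
        print(v.shape)
        # no memmap is possible on an s3fs file object, so contiguously stored
        # variables now fail over to the seek/read path instead of raising
        data = v[:]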
--- pyfive/dataobjects.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index d221698..dcdef0a 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -617,10 +617,10 @@ def __init__(self,*args,**kwargs): # pseudo chunk blocksize: this is a size below which we don't bother # pseudo chunking for contiguous data and just load the lot at data # access time: units are kibibytes - self.pseudo_chunking = False + self.pseudo_chunking = True self.pseudo_block_size_kib = 0 # We can't use mmaps on S3 - self.avoid_mmap = False + self.avoid_mmap = True ########################################################################## # offset and size from data storage message @@ -637,8 +637,6 @@ def get_data(self, args=None): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - if self.avoid_mmap: - return self._get_selection_from_contiguous(args) return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage # If reading all chunks, use the (hopefully faster) "do it one go" method. @@ -676,15 +674,18 @@ def _get_contiguous_data(self, property_offset, args): if data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - result = np.zeros(self.shape, dtype=self.dtype) + return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - try: - # return a memory-map to the stored array with copy-on-write - result = np.memmap(self.fh, dtype=self.dtype, mode='c', - offset=data_offset, shape=self.shape, order=self.order) - except UnsupportedOperation: + if self.avoid_mmap: return self._get_selection_from_contiguous(args) + else: + try: + # return a memory-map to the stored array with copy-on-write + return np.memmap(self.fh, dtype=self.dtype, mode='c', + offset=data_offset, shape=self.shape, order=self.order)[args] + except UnsupportedOperation: + return self._get_selection_from_contiguous(args) else: dtype_class = self.dtype[0] if dtype_class == 'REFERENCE': @@ -694,13 +695,9 @@ def _get_contiguous_data(self, property_offset, args): ref_addresses = np.memmap( self.fh, dtype=(' Date: Sun, 21 Apr 2024 17:00:00 +0100 Subject: [PATCH 020/105] First cut, no tests yet --- pyfive/dataobjects.py | 125 +++++++++-------------------------- pyfive/h5d.py | 148 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 94 deletions(-) create mode 100644 pyfive/h5d.py diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index dcdef0a..7031d40 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,8 @@ from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from .indexing import OrthogonalIndexer, ZarrArrayStub +from .h5d import H5Dataset + # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -598,27 +599,27 @@ def is_dataset(self): class DatasetDataObject(DataObjects): """ - Subclass of DataObjects associated with one Dataset, and - which handles actual data access. + Subclass of DataObjects associated with one Dataset, + handles actual data access. 
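+    Chunk level access is delegated to an H5Dataset instance, exposed via the id property.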
""" def __init__(self,*args,**kwargs): """ Initialise via super class """ super().__init__(*args,**kwargs) + self._id = None - # Need our own copy for now to utilise the zarr indexer. - self._zchunk_index={} + # make this explicit, but controllable self.order='C' ########################################################################## # pseudo chunking control. - #these can be changed from outside for testing purposes + # these can be changed from outside for optimisation # pseudo chunk blocksize: this is a size below which we don't bother # pseudo chunking for contiguous data and just load the lot at data # access time: units are kibibytes self.pseudo_chunking = True - self.pseudo_block_size_kib = 0 + self.pseudo_block_size_kib = 1024 # We can't use mmaps on S3 self.avoid_mmap = True ########################################################################## @@ -629,6 +630,16 @@ def __init__(self,*args,**kwargs): version, dims, self.layout_class, self.property_offset = ( self._get_data_message_properties(self.msg_offset)) + @property + def id(self): + """ + Represents a PyFive approximation of an HDF5 dataset identifier. + Objects of this class provides methods for working directly with chunked data. + """ + if self._id is None: + self._id = H5Dataset(self) + return self._id + def get_data(self, args=None): """ Return the data pointed to in the DataObject. @@ -646,7 +657,7 @@ def get_data(self, args=None): elif isinstance(self.dtype, tuple): return self._get_chunked_data(self.msg_offset)[args] else: - return self._get_selection_via_chunks(args) + return self.id._get_selection_via_chunks(args) def _get_data_message_properties(self, msg_offset): """ Return the message properties of the DataObject. """ @@ -699,95 +710,21 @@ def _get_contiguous_data(self, property_offset, args): else: raise NotImplementedError('datatype not implemented') - def _get_chunked_data(self, offset): - """ Return data which is chunked. """ - self._get_chunk_params() - chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - return chunk_btree.construct_data_from_chunks( - self.chunks, self.shape, self.dtype, self.filter_pipeline) - - def get_chunk_details(self, chunk_coords): - """ - Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer. The special case - is that if the data is contiguous, we still want to return - the offset and size, as the point of this entry point is - to provide third party applications an address to the data. - """ - if self.layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif self.layout_class == 1: # contiguous storage - # This option never used by pyfive itself as we use the memory map for - # access to contiguous data, but third parties may need it. 
- # Ignore coordinates, just give the location and size of entire array - data_offset, = struct.unpack_from(' Date: Mon, 22 Apr 2024 11:50:41 +0100 Subject: [PATCH 021/105] Improvements --- pyfive/dataobjects.py | 3 ++- pyfive/h5d.py | 62 +++++++++++++++++++++++-------------------- pyfive/high_level.py | 11 +++++++- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 7031d40..67edb07 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -20,7 +20,7 @@ from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap from .h5d import H5Dataset - +from .indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -637,6 +637,7 @@ def id(self): Objects of this class provides methods for working directly with chunked data. """ if self._id is None: + self._get_chunk_params() self._id = H5Dataset(self) return self._id diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 2d294af..3673131 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -20,14 +20,14 @@ def __init__(self, dataobject): """ self.parent_object = dataobject + self.index = None # Should we read this at instantiation? # I figure yes, given folks will likely only - # go this low if they want to manipulate chunks - # Otherwise we'd have to instantiate it as None and - # call the build routine on every chunk manipulation. - # Even if that's just a return, it's a lot of empty function calls - # on an iteration over chunks. - self.index = self.__build_index() + # go this low if they want to manipulate chunks. + # Otherwise we'd call the (cached) build routine on + # each chunk manipulation. That could be a lot of + # empty function calls, even if they are cheap cf I/O. + self.__build_index() def __hash__(self): """ @@ -59,7 +59,10 @@ def get_chunk_info(self, index): return self.index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): - return self.index(coordinate_index) + return self.index[coordinate_index] + + def get_num_chunks(self): + return len(self.index) def read_direct_chunk(self, chunk_position, **kwargs): """ @@ -78,15 +81,14 @@ def __build_index(self): """ Build the chunk index if it doesn't exist """ + if self.index is not None: return + chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - count = np.prod(self.shape) - itemsize = np.dtype(self.dtype).itemsize + self.parent_object.fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) self.index = {} - # we do this to avoid either using an iterator or many # temporary list creations if there are repeated chunk accesses. self._nthindex = [] @@ -95,28 +97,31 @@ def __build_index(self): # space, whereas pyfive wants the position in array space. # Here we index the pyfive chunk_index in zarr index space. 
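+        # e.g. with chunks=(20, 100), a chunk whose first element sits at array offset (40, 300) gets the key (2, 3)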
- ichunks = [1/c for c in self.chunks] + # Can't help myself optimising to remove excessive divides + ichunks = [1/c for c in self.parent_object.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): + start = node_key['chunk_offset'][:-1] key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) size = node_key['chunk_size'] filter_mask = node_key['filter_mask'] - start = node_key['chunk_offset'][:-1] self._nthindex.append(key) - self.index[key] = StoreInfo(key, filter_mask, start, size) + self.index[key] = StoreInfo(key, filter_mask, addr, size) + - def _iter_chunks(self, sel=None): + def _iter_chunks(self, args): """ Provides internal support for iter_chunks method on parent. Errors should be trapped there. """ - - if sel is None: - yield from self.index.values() - else: - raise NotImplementedError - + raise NotImplementedError + # FIXME: This isn't it! + array = ZarrArrayStub(self.shape, self.parent_object.chunks) + indexer = OrthogonalIndexer(args, array) + for chunk_coords, chunk_selection, out_selection in indexer: + yield out_selection + def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. @@ -131,18 +136,17 @@ def _get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ - array = ZarrArrayStub(self.shape, self.chunks) + array = ZarrArrayStub(self.shape, self.parent_object.chunks) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.order) + out = np.empty(out_shape, dtype=self.dtype, order=self.parent_object.order) + filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: - chunk_info = self.get_chunk_info_by_coord(chunk_coords) - filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords.chunk_offset) - if self.filter_pipeline is not None: - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.itemsize) - chunk_buffer = self._unpack_chunk(chunk_buffer, chunk_info) + filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) + if filter_pipeline is not None: + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] + out[out_selection] = chunk_data.reshape(self.parent_object.chunks, order=self.parent_object.order)[chunk_selection] return out diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 8bf0544..e02a944 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -304,12 +304,21 @@ def astype(self, dtype): def len(self): """ Return the size of the first axis. """ return self.shape[0] + + def iter_chunks(self, sel=None): + if sel == None: + sel = self.shape + return self._dataobjects.id._iter_chunks(sel) + + @property + def id(self): + return self._dataobjects.id @property def shape(self): """ shape attribute. """ return self._dataobjects.shape - + @property def maxshape(self): """ maxshape attribute. 
(None for unlimited dimensions) """ From 9ffb5b23daa6327b53229e865c3bb9dc2bdb611f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 13:28:19 +0100 Subject: [PATCH 022/105] With some failing tests --- pyfive/h5d.py | 1 + tests/test_h5d.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/test_h5d.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 3673131..d77f492 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -70,6 +70,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): Additional arugments supported by H5Py are not supported here. """ storeinfo = self.index[chunk_position] + print(storeinfo) return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) ###### diff --git a/tests/test_h5d.py b/tests/test_h5d.py new file mode 100644 index 0000000..0ef4936 --- /dev/null +++ b/tests/test_h5d.py @@ -0,0 +1,62 @@ +import h5py +import pyfive +from pathlib import Path +import pytest + +mypath = Path(__file__).parent + +filename = 'compressed.hdf5' +variable_name = 'dataset3' + +def chunk_down(ff, vv): + """ + Test the chunking stuff + """ + var = ff[vv] + varid = var.id + n = varid.get_num_chunks() + c = varid.get_chunk_info(4) + with pytest.raises(OSError): + # This isn't on the chunk boundary, so should fail + address = (2,0) + d = varid.read_direct_chunk(address) + address = c.chunk_offset + d = varid.read_direct_chunk(address) + return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d + + +def get_chunks(ff, vv, view=3): + var = ff[vv] + x = var[:,2] + y = var[:,:] + chunks = list(var.iter_chunks()) + for i in range(view): + print('Chunk ',i) + print(chunks[i]) + return list(var.iter_chunks()) + + +def test_h5d_chunking_details(): + + with h5py.File(mypath/filename) as f: + h5detail = chunk_down(f, variable_name) + + with pyfive.File(mypath/filename) as g: + p5detail = chunk_down(g, variable_name) + + assert h5detail == p5detail + +def test_iter_chunks(): + + with h5py.File(mypath/filename) as f: + h5chunks = get_chunks(f, variable_name) + + with pyfive.File(mypath/filename) as g: + p5chunks = get_chunks(f, variable_name) + + assert (h5chunks == p5chunks).all() + + +if __name__ == "__main__": + test_h5d_chunking_details() + From 223a931a5c3aaa9aaf19b3cb6a3f9d81d99525da Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 14:32:21 +0100 Subject: [PATCH 023/105] Fixed one test --- pyfive/h5d.py | 21 +++++++++++++-------- tests/test_h5d.py | 18 +++++++++++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index d77f492..2e9d5ef 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -19,6 +19,9 @@ def __init__(self, dataobject): Instantiated with the pyfive datasetdataobject """ self.parent_object = dataobject + self._chunks = self.parent_object.chunks + self._ichunks = [1/c for c in self._chunks] + self._order = self.parent_object.order self.index = None # Should we read this at instantiation? @@ -29,6 +32,8 @@ def __init__(self, dataobject): # empty function calls, even if they are cheap cf I/O. self.__build_index() + + def __hash__(self): """ H5py says this is hasable, we haven't implemented that. @@ -55,6 +60,7 @@ def dtype(self): def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. + Our index is in chunk space, but H5Py wants it in coordinate space. 
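+        Here index is just the integer position of the chunk in b-tree order; the returned StoreInfo carries the chunk's array-space offset.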
""" return self.index[self._nthindex[index]] @@ -69,8 +75,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): Returns a tuple containing the filter_mask and the raw data storing this chunk as bytes. Additional arugments supported by H5Py are not supported here. """ + if chunk_position not in self.index: + raise OSError("Chunk coordinates must lie on chunk boundaries") storeinfo = self.index[chunk_position] - print(storeinfo) return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) ###### @@ -97,14 +104,11 @@ def __build_index(self): # The zarr orthogonal indexer returns the position in chunk # space, whereas pyfive wants the position in array space. # Here we index the pyfive chunk_index in zarr index space. - - # Can't help myself optimising to remove excessive divides - ichunks = [1/c for c in self.parent_object.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): start = node_key['chunk_offset'][:-1] - key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) + key = start size = node_key['chunk_size'] filter_mask = node_key['filter_mask'] self._nthindex.append(key) @@ -137,17 +141,18 @@ def _get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ - array = ZarrArrayStub(self.shape, self.parent_object.chunks) + array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.parent_object.order) + out = np.empty(out_shape, dtype=self.dtype, order=self._order) filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: + chunk_coords = tuple([int(i*d) for i,d in zip(list(chunk_coords),self._chunks)]) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if filter_pipeline is not None: chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.parent_object.chunks, order=self.parent_object.order)[chunk_selection] + out[out_selection] = chunk_data.reshape(self._chunks, order=self._order)[chunk_selection] return out diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 0ef4936..a7075aa 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -4,31 +4,39 @@ import pytest mypath = Path(__file__).parent - filename = 'compressed.hdf5' variable_name = 'dataset3' +breaking_address=(2,0) + +#mypath = mypath.parent/'bnl/' +#filename = 'da193o_25_day__grid_T_198807-198807.nc' +#variable_name = 'tos' +# breaking_address=(2,0,3) def chunk_down(ff, vv): """ Test the chunking stuff """ var = ff[vv] + v = var[2,2] + print(v) varid = var.id n = varid.get_num_chunks() c = varid.get_chunk_info(4) + for i in range(varid.get_num_chunks()): + print(varid.get_chunk_info(i)) with pytest.raises(OSError): # This isn't on the chunk boundary, so should fail - address = (2,0) + address = breaking_address d = varid.read_direct_chunk(address) address = c.chunk_offset d = varid.read_direct_chunk(address) - return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d + + return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d, v def get_chunks(ff, vv, view=3): var = ff[vv] - x = var[:,2] - y = var[:,:] chunks = list(var.iter_chunks()) for i in range(view): print('Chunk ',i) From 3a256ab4f6b163dee03c979c9b5918df7fd2e832 Mon Sep 17 00:00:00 2001 From: Bryan 
Lawrence Date: Mon, 22 Apr 2024 17:18:37 +0100 Subject: [PATCH 024/105] All tests for new functionality pass, but I've broken something old --- pyfive/h5d.py | 22 ++++++++++++++-------- pyfive/high_level.py | 6 ++---- tests/test_h5d.py | 9 +++++---- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 2e9d5ef..fc5f0a9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -1,5 +1,6 @@ import numpy as np from collections import namedtuple +from operator import mul from .indexing import OrthogonalIndexer, ZarrArrayStub from .btree import BTreeV1RawDataChunks @@ -20,7 +21,6 @@ def __init__(self, dataobject): """ self.parent_object = dataobject self._chunks = self.parent_object.chunks - self._ichunks = [1/c for c in self._chunks] self._order = self.parent_object.order self.index = None @@ -32,8 +32,6 @@ def __init__(self, dataobject): # empty function calls, even if they are cheap cf I/O. self.__build_index() - - def __hash__(self): """ H5py says this is hasable, we haven't implemented that. @@ -120,13 +118,21 @@ def _iter_chunks(self, args): Provides internal support for iter_chunks method on parent. Errors should be trapped there. """ - raise NotImplementedError - # FIXME: This isn't it! + def convert_selection(tuple_of_slices): + # while a slice of the form slice(a,b,None) is equivalent + # in funtion to a slice of form (a,b,1) it is not the same. + # For compatability I've gone for "the same" + def convert_slice(aslice): + if aslice.step is None: + return slice(aslice.start,aslice.stop,1) + return aslice + return tuple([convert_slice(a) for a in tuple_of_slices]) + array = ZarrArrayStub(self.shape, self.parent_object.chunks) indexer = OrthogonalIndexer(args, array) for chunk_coords, chunk_selection, out_selection in indexer: - yield out_selection - + yield convert_selection(out_selection) + def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. @@ -148,7 +154,7 @@ def _get_selection_via_chunks(self, args): filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: - chunk_coords = tuple([int(i*d) for i,d in zip(list(chunk_coords),self._chunks)]) + chunk_coords = tuple(map(mul,chunk_coords,self._chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if filter_pipeline is not None: chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) diff --git a/pyfive/high_level.py b/pyfive/high_level.py index e02a944..6db9159 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -305,10 +305,8 @@ def len(self): """ Return the size of the first axis. 
""" return self.shape[0] - def iter_chunks(self, sel=None): - if sel == None: - sel = self.shape - return self._dataobjects.id._iter_chunks(sel) + def iter_chunks(self, *args): + return self._dataobjects.id._iter_chunks(args) @property def id(self): diff --git a/tests/test_h5d.py b/tests/test_h5d.py index a7075aa..2a9d1db 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -41,7 +41,7 @@ def get_chunks(ff, vv, view=3): for i in range(view): print('Chunk ',i) print(chunks[i]) - return list(var.iter_chunks()) + return chunks def test_h5d_chunking_details(): @@ -60,11 +60,12 @@ def test_iter_chunks(): h5chunks = get_chunks(f, variable_name) with pyfive.File(mypath/filename) as g: - p5chunks = get_chunks(f, variable_name) + p5chunks = get_chunks(g, variable_name) - assert (h5chunks == p5chunks).all() + assert h5chunks == p5chunks if __name__ == "__main__": - test_h5d_chunking_details() + #test_h5d_chunking_details() + test_iter_chunks() From 32d83dd8871ed6f62014832c6cf73cc09bf2e0a7 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:23:15 +0100 Subject: [PATCH 025/105] Now passing all tests --- pyfive/dataobjects.py | 9 +++------ pyfive/h5d.py | 35 ++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 67edb07..5508da7 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -641,7 +641,7 @@ def id(self): self._id = H5Dataset(self) return self._id - def get_data(self, args=None): + def get_data(self, args): """ Return the data pointed to in the DataObject. """ @@ -651,12 +651,9 @@ def get_data(self, args=None): elif self.layout_class == 1: # contiguous storage return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage - # If reading all chunks, use the (hopefully faster) "do it one go" method. # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version - if args is None: - return self._get_chunked_data(self.msg_offset) - elif isinstance(self.dtype, tuple): - return self._get_chunked_data(self.msg_offset)[args] + if isinstance(self.dtype, tuple): + return self.id._get_reference_chunks(self.msg_offset)[args] else: return self.id._get_selection_via_chunks(args) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index fc5f0a9..7f2b08c 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -22,6 +22,8 @@ def __init__(self, dataobject): self.parent_object = dataobject self._chunks = self.parent_object.chunks self._order = self.parent_object.order + self._fh = self.parent_object.fh + self.filter_pipeline = self.parent_object.filter_pipeline self.index = None # Should we read this at instantiation? @@ -53,7 +55,12 @@ def rank(self): return self.parent_object.rank @property def dtype(self): - return np.dtype(self.parent_object.dtype) + # FIXME: Not sure what H5Py is doing here need to find out, + # but I'm sure it's not this. 
+ if self.parent_object.dtype == ('REFERENCE',8): + return self.parent_object.dtype + else: + return np.dtype(self.parent_object.dtype) def get_chunk_info(self, index): """ @@ -92,7 +99,7 @@ def __build_index(self): return chunk_btree = BTreeV1RawDataChunks( - self.parent_object.fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) self.index = {} # we do this to avoid either using an iterator or many @@ -128,7 +135,7 @@ def convert_slice(aslice): return aslice return tuple([convert_slice(a) for a in tuple_of_slices]) - array = ZarrArrayStub(self.shape, self.parent_object.chunks) + array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) for chunk_coords, chunk_selection, out_selection in indexer: yield convert_selection(out_selection) @@ -138,8 +145,19 @@ def _get_raw_chunk(self, storeinfo): Obtain the bytes associated with a chunk. """ - self.parent_object.fh.seek(storeinfo.byte_offset) - return self.parent_object.fh.read(storeinfo.size) + self._fh.seek(storeinfo.byte_offset) + return self._fh.read(storeinfo.size) + + def _get_reference_chunks(self, offset): + """ + Return reference data which is chunked. At the moment + we re-read the b-tree to do this, since we didn't cache + it at index construction. #FIXME + """ + chunk_btree = BTreeV1RawDataChunks( + self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + return chunk_btree.construct_data_from_chunks( + self._chunks, self.shape, self.dtype, self.filter_pipeline) def _get_selection_via_chunks(self, args): """ @@ -151,14 +169,13 @@ def _get_selection_via_chunks(self, args): indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self._order) - filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: chunk_coords = tuple(map(mul,chunk_coords,self._chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) - if filter_pipeline is not None: - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) + if self.filter_pipeline is not None: + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self._chunks, order=self._order)[chunk_selection] - return out + return out \ No newline at end of file From f5f89c53874dd669edb1abdb14d4425913dbaf96 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:34:35 +0100 Subject: [PATCH 026/105] Checking coverage of get_chunk_info_by_coord(method) --- tests/test_h5d.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 2a9d1db..730f379 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -19,23 +19,21 @@ def chunk_down(ff, vv): """ var = ff[vv] v = var[2,2] - print(v) varid = var.id n = varid.get_num_chunks() c = varid.get_chunk_info(4) - for i in range(varid.get_num_chunks()): - print(varid.get_chunk_info(i)) with pytest.raises(OSError): # This isn't on the chunk boundary, so should fail address = breaking_address d = varid.read_direct_chunk(address) address = c.chunk_offset d = varid.read_direct_chunk(address) - + dd = varid.get_chunk_info_by_coord(address) + return n, c.chunk_offset, c.filter_mask, c.byte_offset, 
c.size, d, v -def get_chunks(ff, vv, view=3): +def get_chunks(ff, vv, view=0): var = ff[vv] chunks = list(var.iter_chunks()) for i in range(view): @@ -66,6 +64,6 @@ def test_iter_chunks(): if __name__ == "__main__": - #test_h5d_chunking_details() + test_h5d_chunking_details() test_iter_chunks() From 2c8f59c6e68a199d6908b872d07121c844b78b00 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:37:06 +0100 Subject: [PATCH 027/105] Missing docstring --- pyfive/h5d.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7f2b08c..bde427b 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -70,9 +70,16 @@ def get_chunk_info(self, index): return self.index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): + """ + Retrieve information about a chunk specified by the array address of the chunk’s + first element in each dimension. + """ return self.index[coordinate_index] def get_num_chunks(self): + """ + Return total number of chunks in dataset + """ return len(self.index) def read_direct_chunk(self, chunk_position, **kwargs): From 013ce6252a2f82fd8e1ec07802eb3a00bf657ad9 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 26 Apr 2024 11:42:47 +0100 Subject: [PATCH 028/105] Cleaning up --- pyfive/btree.py | 14 -------------- pyfive/h5d.py | 7 +++---- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/pyfive/btree.py b/pyfive/btree.py index fedbc64..38518fc 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -199,20 +199,6 @@ def construct_data_from_chunks( non_padded_region = tuple([slice(i) for i in data_shape]) return data[non_padded_region] - - def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mask): - """ - Used when getting data chunk by chunk for reading partial data arrays - All the shaping and positioning is done in the calling function. - """ - self.fh.seek(addr) - chunk_buffer = self.fh.read(size) - if filter_pipeline is not None: - #chunk_buffer = self.fh.read(size) - #filter_mask = filter_mask - chunk_buffer = self._filter_chunk( - chunk_buffer, filter_mask, filter_pipeline, itemsize) - return chunk_buffer @classmethod def _filter_chunk(cls, chunk_buffer, filter_mask, filter_pipeline, itemsize): diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bde427b..7c9f838 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -12,8 +12,8 @@ class H5Dataset: Also, many H5D* functions which take a dataset instance as their first argument are presented as methods of this class. This is a subset of those supported - by H5Py's module H5D. - + by H5Py's module H5D, but includes all the low level methods for working with + chunked data. """ def __init__(self, dataobject): """ @@ -36,7 +36,7 @@ def __init__(self, dataobject): def __hash__(self): """ - H5py says this is hasable, we haven't implemented that. + H5py says this is hashable, we haven't implemented that. """ raise NotImplementedError @@ -151,7 +151,6 @@ def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. 
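        (No filters are applied here; callers push the returned bytes through the filter pipeline when one is defined.)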
""" - self._fh.seek(storeinfo.byte_offset) return self._fh.read(storeinfo.size) From c21ee637857320cab6e1ea3fd511c9c72ff23b81 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 9 Jul 2024 09:50:33 +0100 Subject: [PATCH 029/105] Ok, these were the pull request fixes that I thought I'd merged --- pyfive/__init__.py | 2 +- pyfive/dataobjects.py | 55 ++++++++++++++++++++++++------------------- pyfive/h5d.py | 10 ++++---- tests/test_h5d.py | 5 ---- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 9fa6180..c11dfb0 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -3,6 +3,6 @@ """ from .high_level import File -from .indexing import ZarrArrayStub, OrthogonalIndexer + __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 5508da7..4be83ea 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -9,18 +9,18 @@ import numpy as np -from .datatype_msg import DatatypeMessage -from .core import _padded_size, _structure_size -from .core import _unpack_struct_from, _unpack_struct_from_file -from .core import InvalidHDF5File -from .core import Reference -from .core import UNDEFINED_ADDRESS -from .btree import BTreeV1Groups, BTreeV1RawDataChunks -from .btree import BTreeV2GroupNames, BTreeV2GroupOrders -from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER -from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from .h5d import H5Dataset -from .indexing import OrthogonalIndexer, ZarrArrayStub +from pyfive.datatype_msg import DatatypeMessage +from pyfive.core import _padded_size, _structure_size +from pyfive.core import _unpack_struct_from, _unpack_struct_from_file +from pyfive.core import InvalidHDF5File +from pyfive.core import Reference +from pyfive.core import UNDEFINED_ADDRESS +from pyfive.btree import BTreeV1Groups, BTreeV1RawDataChunks +from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders +from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER +from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from pyfive.h5d import H5Dataset +from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -602,15 +602,15 @@ class DatasetDataObject(DataObjects): Subclass of DataObjects associated with one Dataset, handles actual data access. """ - def __init__(self,*args,**kwargs): + def __init__(self, *args, **kwargs): """ Initialise via super class """ - super().__init__(*args,**kwargs) + super().__init__(*args, **kwargs) self._id = None # make this explicit, but controllable - self.order='C' + self.order = 'C' ########################################################################## # pseudo chunking control. @@ -636,6 +636,8 @@ def id(self): Represents a PyFive approximation of an HDF5 dataset identifier. Objects of this class provides methods for working directly with chunked data. 
""" + # When instantiated self._id is None, this property is called when the + # class instance is first used in anger to actually get the chunk indices etc if self._id is None: self._get_chunk_params() self._id = H5Dataset(self) @@ -649,7 +651,7 @@ def get_data(self, args): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - return self._get_contiguous_data(self.property_offset,args) + return self._get_contiguous_data(self.property_offset, args) if self.layout_class == 2: # chunked storage # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if isinstance(self.dtype, tuple): @@ -700,13 +702,13 @@ def _get_contiguous_data(self, property_offset, args): if dtype_class == 'REFERENCE': size = self.dtype[1] if size != 8: - raise NotImplementedError('Unsupported Reference type') + raise NotImplementedError('Unsupported Reference type - size {size}') ref_addresses = np.memmap( self.fh, dtype=(' Date: Wed, 10 Jul 2024 11:39:59 +0100 Subject: [PATCH 030/105] Adding Datatype and check_enum_dtype in a minimal manner - closes #8 --- pyfive/__init__.py | 6 ++++-- pyfive/h5t.py | 15 +++++++++++++++ pyfive/high_level.py | 22 ++++++++++++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 pyfive/h5t.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index c11dfb0..6cba821 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -1,8 +1,10 @@ """ pyfive : a pure python HDF5 file reader. +This is the public API exposed by pyfive, +which is a small subset of the H5PY API. """ -from .high_level import File - +from pyfive.high_level import File, Group, Dataset, Datatype +from pyfive.h5t import check_enum_dtype __version__ = '0.4.0.dev' diff --git a/pyfive/h5t.py b/pyfive/h5t.py new file mode 100644 index 0000000..9eae216 --- /dev/null +++ b/pyfive/h5t.py @@ -0,0 +1,15 @@ +def check_enum_dtype(dt): + """ + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. + Returns None if the dtype does not represent an HDF5 enumerated type. + + Note that currently pyfive does not support HDF5 enumerated types, + so this will always return None (see datatype_msg), and AFIK, should + never get called in anger. It is only included so h5netcdf wont + barf at its absence when pyfive is used as a backend. + """ + try: + return dt.metadata.get('enum', None) + except AttributeError: + return None \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 6db9159..7561ab1 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -7,9 +7,10 @@ import numpy as np -from .core import Reference -from .dataobjects import DataObjects, DatasetDataObject -from .misc_low_level import SuperBlock +from pyfive.core import Reference +from pyfive.dataobjects import DataObjects, DatasetDataObject +from pyfive.misc_low_level import SuperBlock +from pyfive.datatype_msg import DatatypeMessage class Group(Mapping): @@ -172,8 +173,10 @@ class File(Group): """ - def __init__(self, filename): + def __init__(self, filename, mode='r'): """ initalize. 
""" + if mode != 'r': + raise NotImplementedError('pyfive only provides support for reading and treats all reads as binary') self._close = False if hasattr(filename, 'read'): if not hasattr(filename, 'seek'): @@ -443,3 +446,14 @@ def __enter__(self): def __exit__(self, *args): self._dset._astype = None + + +class Datatype(DatatypeMessage): + """ + Class provided for compatbility with the H5PY API. + It's not yet clear where and how this might be used + by that name (if at all), but the existence of a + class with this name is required by h5netcdf. + """ + def __init__(self, *args, **kw): + super().__init__(self, *args, **kw) \ No newline at end of file From 99945989d215cd9fbd893d62dd2f0d06939631ab Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 12 Jul 2024 10:33:45 +0100 Subject: [PATCH 031/105] Basic support for elements of h5netcdf and what it expects to be able to import from it's backend (which it expects to be h5py). Also added some better handling for situations where we find datatypes which pyfive doesn't understand. --- pyfive/__init__.py | 3 ++- pyfive/dataobjects.py | 12 ++++++++++++ pyfive/h5py.py | 38 ++++++++++++++++++++++++++++++++++++++ pyfive/high_level.py | 32 +++++++++++++++++++------------- 4 files changed, 71 insertions(+), 14 deletions(-) create mode 100644 pyfive/h5py.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 6cba821..8e43221 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -4,7 +4,8 @@ which is a small subset of the H5PY API. """ -from pyfive.high_level import File, Group, Dataset, Datatype +from pyfive.high_level import File, Group, Dataset from pyfive.h5t import check_enum_dtype +from pyfive.h5py import Datatype, Empty __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 4be83ea..318cdce 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -596,6 +596,18 @@ def is_dataset(self): """ True when DataObjects points to a dataset, False for a group. """ return len(self.find_msg_type(DATASPACE_MSG_TYPE)) > 0 + @property + def is_datatype(self): + """ Is this a standalone datatype definition?""" + if self.msgs[0]['type'] == DATATYPE_MSG_TYPE: + #I'm thinking that for the moment, this almost certainly means + #an unimplemented user datatype. If so, let's tell the higher + #level now, as the following will raise a NotImplementedError + x = DatatypeMessage(self.msg_data, self.msgs[0]['offset_to_message']) + return True + else: + return False + class DatasetDataObject(DataObjects): """ diff --git a/pyfive/h5py.py b/pyfive/h5py.py new file mode 100644 index 0000000..62c2076 --- /dev/null +++ b/pyfive/h5py.py @@ -0,0 +1,38 @@ +### This file contains H5Py classes which are not used by +### pyfive, but which are included in the public API for +### htnetcdf which expects to see these H5PY classes. + + +from pyfive.datatype_msg import DatatypeMessage +import numpy as np + +class Datatype: + """ + Class provided for compatability with the H5PY API, + to allow applications such as h5netcdf to import it, + but not use it. + """ + def __init__(self,*args,**kw): + raise NotImplementedError + +class Empty: + + """ + Proxy object to represent empty/null dataspaces (a.k.a H5S_NULL). + This can have an associated dtype, but has no shape or data. This is not + the same as an array with shape (0,). This class provided for compatibility + with the H5Py API to support h5netcdf. It is not used by pyfive. 
+ """ + shape = None + size = None + + def __init__(self, dtype): + self.dtype = np.dtype(dtype) + + def __eq__(self, other): + if isinstance(other, Empty) and self.dtype == other.dtype: + return True + return False + + def __repr__(self): + return "Empty(dtype={0!r})".format(self.dtype) \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 7561ab1..fbb2f95 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -4,13 +4,14 @@ from collections.abc import Mapping, Sequence import os import posixpath +import warnings import numpy as np from pyfive.core import Reference from pyfive.dataobjects import DataObjects, DatasetDataObject from pyfive.misc_low_level import SuperBlock -from pyfive.datatype_msg import DatatypeMessage + class Group(Mapping): @@ -91,7 +92,19 @@ def __getitem__(self, y): if additional_obj != '.': raise KeyError('%s is a dataset, not a group' % (obj_name)) return Dataset(obj_name, DatasetDataObject(self.file._fh, link_target), self) - return Group(obj_name, dataobjs, self)[additional_obj] + + try: + # if true, this may well raise a NotImplementedError, if so, we need + # to warn the user, who may be able to use other parts of the data. + is_datatype = dataobjs.is_datatype + except NotImplementedError as e: + warnings.warn(f'Found datatype {obj_name} but pyfive cannot read this data: {e}') + is_datatype = True + + if is_datatype: + pass + else: + return Group(obj_name, dataobjs, self)[additional_obj] def __iter__(self): for k in self._links.keys(): @@ -334,7 +347,10 @@ def ndim(self): def dtype(self): """ dtype attribute. """ # In the HDF5 implementation this is a numpy dtype - return self._dataobjects.dtype + try: + return self._dataobjects.dtype + except NotImplementedError as e: + raise NotImplementedError(f'{e} (for {self.name})') @property def value(self): @@ -447,13 +463,3 @@ def __enter__(self): def __exit__(self, *args): self._dset._astype = None - -class Datatype(DatatypeMessage): - """ - Class provided for compatbility with the H5PY API. - It's not yet clear where and how this might be used - by that name (if at all), but the existence of a - class with this name is required by h5netcdf. 
- """ - def __init__(self, *args, **kw): - super().__init__(self, *args, **kw) \ No newline at end of file From c12b5b30fd1099cca7b4d677f9ee023a4da0227f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 12 Jul 2024 10:56:19 +0100 Subject: [PATCH 032/105] Test support for graceful enum failure --- tests/make_enum_file.py | 13 +++++++++++++ tests/test_enum_var.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 tests/make_enum_file.py create mode 100644 tests/test_enum_var.py diff --git a/tests/make_enum_file.py b/tests/make_enum_file.py new file mode 100644 index 0000000..cdba322 --- /dev/null +++ b/tests/make_enum_file.py @@ -0,0 +1,13 @@ +""" Create an HDF5 files with an enum datatype using the netcdf interface """ +from netCDF4 import Dataset +import numpy as np + +ncd = Dataset('enum_variable.hdf5','w') +enum_dict = dict(stratus=1, cumulus=2, nimbus=3, missing=255) +enum_type = ncd.createEnumType(np.uint8,'enum_t', enum_dict) + +dim = ncd.createDimension('axis',5) +enum_var = ncd.createVariable('enum_var',enum_type,'axis', + fill_value=enum_dict['missing']) +enum_var[:] = [enum_dict[k] for k in ['stratus','stratus','missing','nimbus','cumulus']] +ncd.close() diff --git a/tests/test_enum_var.py b/tests/test_enum_var.py new file mode 100644 index 0000000..e5851be --- /dev/null +++ b/tests/test_enum_var.py @@ -0,0 +1,23 @@ +""" Unit tests for pyfive dealing with an enum variable """ + +import os +import pytest + +import pyfive + +DIRNAME = os.path.dirname(__file__) +ENUMVAR_HDF5_FILE = os.path.join(DIRNAME, 'enum_variable.hdf5') + +def test_read_enum_variable(): + + with pyfive.File(ENUMVAR_HDF5_FILE) as hfile: + + for x in hfile: + if x == 'enum_t': + with pytest.warns(UserWarning,match='^Found '): + print(x, hfile[x]) + elif x == 'enum_var': + with pytest.raises(NotImplementedError): + print(x, hfile[x]) + else: + print(x, hfile[x]) \ No newline at end of file From c80ed92fdb11f4f8b2dec924ec0e5187ec31ec1b Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 14 Jul 2024 08:59:06 +0100 Subject: [PATCH 033/105] Committing to the dtype returned as a numpy dtype, and the extra h5t interface. --- pyfive/__init__.py | 2 +- pyfive/dataobjects.py | 2 +- pyfive/h5t.py | 66 ++++++++++++++++++++++++++++++++++++++++++- pyfive/high_level.py | 3 +- tests/test_repr.py | 2 +- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 8e43221..8e67261 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -5,7 +5,7 @@ """ from pyfive.high_level import File, Group, Dataset -from pyfive.h5t import check_enum_dtype +from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype from pyfive.h5py import Datatype, Empty __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 318cdce..774d942 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -188,7 +188,7 @@ def unpack_attribute(self, offset): # read in the dataspace information shape, maxshape = determine_data_shape(self.msg_data, offset) - items = int(np.product(shape)) + items = int(np.prod(shape)) offset += _padded_size(attr_dict['dataspace_size'], padding_multiple) # read in the value(s) diff --git a/pyfive/h5t.py b/pyfive/h5t.py index 9eae216..3379e4c 100644 --- a/pyfive/h5t.py +++ b/pyfive/h5t.py @@ -1,9 +1,15 @@ +# +# These are provided to support h5netcdf, and are not used +# by the pyfive package itself. 
+# + + def check_enum_dtype(dt): """ If the dtype represents an HDF5 enumerated type, returns the dictionary mapping string names to integer values. Returns None if the dtype does not represent an HDF5 enumerated type. - + --- Note that currently pyfive does not support HDF5 enumerated types, so this will always return None (see datatype_msg), and AFIK, should never get called in anger. It is only included so h5netcdf wont @@ -12,4 +18,62 @@ def check_enum_dtype(dt): try: return dt.metadata.get('enum', None) except AttributeError: + return None + + +def check_string_dtype(dt): + """ + If the dtype represents an HDF5 string, returns a string_info object. + The returned string_info object holds the encoding and the length. + The encoding can only be 'utf-8' or 'ascii'. The length may be None + for a variable-length string, or a fixed length in bytes. + Returns None if the dtype does not represent an HDF5 string. + --- + It's not obvious what this is used for yet, so we just return None + for now. + """ + #vlen_kind = check_vlen_dtype(dt) + # return string_info('utf-8', None) + ##if vlen_kind is unicode: + #elif vlen_kind is bytes: + # return string_info('ascii', None) + #elif dt.kind == 'S': + # enc = (dt.metadata or {}).get('h5py_encoding', 'ascii') + # return string_info(enc, dt.itemsize) + #else: + # return None + return None +def check_dtype(**kwds): + """ Check a dtype for h5py special type "hint" information. Only one + keyword may be given. + + vlen = dtype + If the dtype represents an HDF5 vlen, returns the Python base class. + Currently only built-in string vlens (str) are supported. Returns + None if the dtype does not represent an HDF5 vlen. + + enum = dtype + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. Returns None if the dtype does + not represent an HDF5 enumerated type. + + ref = dtype + If the dtype represents an HDF5 reference type, returns the reference + class (either Reference or RegionReference). Returns None if the dtype + does not represent an HDF5 reference type. + """ + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, dt = kwds.popitem() + + if name not in ('vlen', 'enum', 'ref'): + raise TypeError('Unknown special type "%s"' % name) + + try: + return dt.metadata[name] + except TypeError: + return None + except KeyError: return None \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index fbb2f95..486a8a3 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -346,9 +346,8 @@ def ndim(self): @property def dtype(self): """ dtype attribute. 
""" - # In the HDF5 implementation this is a numpy dtype try: - return self._dataobjects.dtype + return np.dtype(self._dataobjects.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') diff --git a/tests/test_repr.py b/tests/test_repr.py index d4895a4..cad532f 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -19,7 +19,7 @@ def test_repr(): assert str(group1) == group1_str dataset1 = hfile['dataset1'] - dataset1_str = '' + dataset1_str = '' assert str(dataset1) == dataset1_str subgroup1 = group1['subgroup1'] From 04bbef69a3fd99c9586541635309f2d2d35fe50c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 19 Jul 2024 11:36:39 +0100 Subject: [PATCH 034/105] Test for reference_list --- tests/test_reference_list.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/test_reference_list.py diff --git a/tests/test_reference_list.py b/tests/test_reference_list.py new file mode 100644 index 0000000..a1aef3c --- /dev/null +++ b/tests/test_reference_list.py @@ -0,0 +1,23 @@ +""" Unit tests for pyfive's ability to deal with reference lists """ +import os +import warnings + +import numpy as np +from numpy.testing import assert_array_equal, assert_almost_equal + +import pyfive + +DIRNAME = os.path.dirname(__file__) +NETCDF4_CLASSIC_FILE = os.path.join(DIRNAME, 'netcdf4_classic.nc') + + +def test_read_netcdf4_dims(): + + with pyfive.File(NETCDF4_CLASSIC_FILE) as hfile: + + with warnings.catch_warnings(record=True) as caught_warnings: + dimensions_x = hfile['x'].dims + if caught_warnings: + for warning in caught_warnings: + print('Caught warning ', warning) + raise NotImplementedError('We need to fix this warning!') \ No newline at end of file From 552c463336d71205bd81bb3725ed408aee53a167 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 19 Dec 2024 17:30:45 +0000 Subject: [PATCH 035/105] (New reference list test still broken) H5D has been disconnected from the parent dataobjects so once instantiated it does not need to touch the file metadata. --- bnl/opening_speed.py | 30 ++++++++++ pyfive/dataobjects.py | 9 +-- pyfive/h5d.py | 124 +++++++++++++++++++++++------------------- pyfive/high_level.py | 6 +- 4 files changed, 109 insertions(+), 60 deletions(-) create mode 100644 bnl/opening_speed.py diff --git a/bnl/opening_speed.py b/bnl/opening_speed.py new file mode 100644 index 0000000..db38d96 --- /dev/null +++ b/bnl/opening_speed.py @@ -0,0 +1,30 @@ +import h5py +import pyfive +from pathlib import Path +import time + +mypath = Path(__file__).parent +fname = 'da193o_25_day__grid_T_198807-198807.nc' +p = mypath/fname + +h1 = time.time() +f1 = h5py.File(p,'r') +h2 = time.time() +f2 = pyfive.File(p) +h3 = time.time() + + + +v = f2['tos'] +d = v._dataobjects +h4 = time.time() +d._get_chunk_addresses() +h5 = time.time() + + +print("File Opening Time Comparison") +print(f'h5py: {h2-h1:9.6f}') +print(f'pyfive: {h3-h2:9.6f}') +print(f'Additional times: {h4-h3:9.6f}, {h5-h4:9.6f}') +print(f'Total times: H5 {h4-h3:9.6f}, P5 {h5-h4:9.6f}') + diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 774d942..8c68bde 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -648,8 +648,7 @@ def id(self): Represents a PyFive approximation of an HDF5 dataset identifier. Objects of this class provides methods for working directly with chunked data. 
""" - # When instantiated self._id is None, this property is called when the - # class instance is first used in anger to actually get the chunk indices etc + # We want to make sure that this is lazy and cached if self._id is None: self._get_chunk_params() self._id = H5Dataset(self) @@ -667,10 +666,12 @@ def get_data(self, args): if self.layout_class == 2: # chunked storage # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if isinstance(self.dtype, tuple): - return self.id._get_reference_chunks(self.msg_offset)[args] + # references need to read all the chunks for now + return self.id._get_selection_via_chunks(())[args] else: + # this is lazily reading only the chunks we need return self.id._get_selection_via_chunks(args) - + def _get_data_message_properties(self, msg_offset): """ Return the message properties of the DataObject. """ dims, layout_class, property_offset = None, None, None diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 0d57f92..532d1af 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -3,6 +3,7 @@ from operator import mul from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks +from pyfive.core import Reference StoreInfo = namedtuple('StoreInfo',"chunk_offset filter_mask byte_offset size") @@ -13,55 +14,54 @@ class H5Dataset: Also, many H5D* functions which take a dataset instance as their first argument are presented as methods of this class. This is a subset of those supported by H5Py's module H5D, but includes all the low level methods for working with - chunked data. + chunked data, lazily or not. This class has been deliberately implemented in + such as way so that once you have an instance, it is completely independent + of the parent file, and it can be used efficiently in threads without rereading + the btree etc. """ def __init__(self, dataobject): """ - Instantiated with the pyfive datasetdataobject - """ - self.parent_object = dataobject - self._chunks = self.parent_object.chunks - self._order = self.parent_object.order - self._fh = self.parent_object.fh - self.filter_pipeline = self.parent_object.filter_pipeline + Instantiated with the pyfive datasetdataobject, we copy and cache everything + we want so it can be used after the parent file is closed, without needing + to go back to storage. + """ + self._chunks = dataobject.chunks + self._order = dataobject.order + self._filename = dataobject.fh.name + self.filter_pipeline = dataobject.filter_pipeline + self.shape = dataobject.shape + self.rank = len(self.shape) + self._msg_offset = dataobject.msg_offset + self._unique = (self._filename, self.shape, self._msg_offset) + + if dataobject.dtype == ('REFERENCE', 8): + # this may not behave the same as h5py, do we care? #FIXME + self.dtype = dataobject.dtype + else: + self.dtype = np.dtype(dataobject.dtype) self.index = None - # Should we read this at instantiation? - # I figure yes, given folks will likely only - # go this low if they want to manipulate chunks. - # Otherwise we'd call the (cached) build routine on - # each chunk manipulation. That could be a lot of - # empty function calls, even if they are cheap cf I/O. - self.__build_index() + + # This reads the b-tree and caches it in a form suitable for use with + # the zarr indexer we use to lazily get chunks. + + self.__build_index(dataobject) def __hash__(self): - """ - H5py says this is hashable, we haven't implemented that. 
+ """ The hash is based on assuming the file path, the location + of the data in the file, and the data shape are a unique + combination. """ - raise NotImplementedError + return hash(self.unique) def __eq__(self, other): """ - H5Py says that equality is determined by true HDF5 identity. - """ - # We kick that upstairs. - return self.parent_object == other.parent_object - - @property - def shape(self): - return self.parent_object.shape - @property - def rank(self): - return self.parent_object.rank - @property - def dtype(self): - # FIXME: Not sure what H5Py is doing here need to find out, - # but I'm sure it's not this. - if self.parent_object.dtype == ('REFERENCE', 8): - return self.parent_object.dtype - else: - return np.dtype(self.parent_object.dtype) + Equality is based on the filename, location of the data in the file + and the shape of the data. + """ + return self._unique == other._unique + def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. @@ -97,7 +97,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): # third parties to use them. They are not H5Py methods. ###### - def __build_index(self): + def __build_index(self, dataobject): """ Build the chunk index if it doesn't exist """ @@ -106,7 +106,7 @@ def __build_index(self): return chunk_btree = BTreeV1RawDataChunks( - self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) self.index = {} # we do this to avoid either using an iterator or many @@ -144,44 +144,58 @@ def convert_slice(aslice): array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) - for chunk_coords, chunk_selection, out_selection in indexer: + for _, _, out_selection in indexer: yield convert_selection(out_selection) def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. """ - self._fh.seek(storeinfo.byte_offset) - return self._fh.read(storeinfo.size) - - def _get_reference_chunks(self, offset): - """ - Return reference data which is chunked. At the moment - we re-read the b-tree to do this, since we didn't cache - it at index construction. #FIXME - """ - chunk_btree = BTreeV1RawDataChunks( - self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) - return chunk_btree.construct_data_from_chunks( - self._chunks, self.shape, self.dtype, self.filter_pipeline) + with open(self._filename,'rb') as open_file: + open_file.seek(storeinfo.byte_offset) + return open_file.read(storeinfo.size) def _get_selection_via_chunks(self, args): """ Use the zarr orthogonal indexer to extract data for a specfic selection within the dataset array and in doing so, only load the relevant chunks. """ + # need a local dtype as we may override it for a reference read. + dtype = self.dtype + + if isinstance(self.dtype, tuple): + # this is a reference and we're returning that + true_dtype = tuple(dtype) + dtype_class = dtype[0] + if dtype_class == 'REFERENCE': + size = dtype[1] + if size != 8: + raise NotImplementedError('Unsupported Reference type') + dtype = ' Date: Fri, 20 Dec 2024 14:09:49 +0000 Subject: [PATCH 036/105] Interim commit so we have something to point to in a discussion around the problem with references and caching everything we need in a variable. 
--- .gitignore | 1 + pyfive/dataobjects.py | 170 ++++++++++++---------------------- pyfive/h5d.py | 206 +++++++++++++++++++++++++++++++++--------- pyfive/high_level.py | 42 ++++----- 4 files changed, 241 insertions(+), 178 deletions(-) diff --git a/.gitignore b/.gitignore index 737af19..bcc99b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .coverage +.pyc build diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 8c68bde..8a8c371 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,7 @@ from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from pyfive.h5d import H5Dataset +from pyfive.h5d import DatasetID from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... @@ -247,6 +247,7 @@ def _vlen_size_and_data(self, buf, offset): gheap = self._global_heaps[gheap_address] vlen_data = gheap.objects[gheap_id['object_index']] return vlen_size, vlen_data + @property def shape(self): @@ -575,6 +576,28 @@ def _iter_links_btree_v2(self, name_btree_address, order_btree_address, heap_add for creationorder, value in sorted(adict.items()): yield value + + def _get_data_message_properties(self, msg_offset): + """ Return the message properties of the DataObject. """ + dims, layout_class, property_offset = None, None, None + version, arg1, arg2 = struct.unpack_from( + '= 1) and (version <= 4) + return version, dims, layout_class, property_offset + @staticmethod def _decode_link_info_msg(data, offset): version, flags = struct.unpack_from('= 1) and (version <= 4) - return version, dims, layout_class, property_offset + def _get_contiguous_data(self, property_offset, args): data_offset, = struct.unpack_from('' % info def __getitem__(self, args): - data = self._dataobjects.get_data(args) + data = self.id.get_data(args) if self._astype is None: return data return data.astype(self._astype) @@ -325,22 +324,16 @@ def len(self): def iter_chunks(self, *args): return self._dataobjects.id._iter_chunks(args) - @property - def id(self): - # we want to make sure that this is lazy and cached - if self._id is None: - self._id = self._dataobjects.id - return self._id @property def shape(self): """ shape attribute. """ - return self._dataobjects.shape + return self.id.shape @property def maxshape(self): """ maxshape attribute. (None for unlimited dimensions) """ - return self._dataobjects.maxshape + return self.id._meta.maxshape @property def ndim(self): @@ -351,7 +344,7 @@ def ndim(self): def dtype(self): """ dtype attribute. """ try: - return np.dtype(self._dataobjects.dtype) + return np.dtype(self.id.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') @@ -370,17 +363,17 @@ def size(self): @property def chunks(self): """ chunks attribute. """ - return self._dataobjects.chunks + return self.id.chunks @property def compression(self): """ compression attribute. """ - return self._dataobjects.compression + return self.id._meta.compression @property def compression_opts(self): """ compression_opts attribute. """ - return self._dataobjects.compression_opts + return self.id._meta.compression_opts @property def scaleoffset(self): @@ -390,17 +383,17 @@ def scaleoffset(self): @property def shuffle(self): """ shuffle attribute. 
""" - return self._dataobjects.shuffle + return self.id._meta.shuffle @property def fletcher32(self): """ fletcher32 attribute. """ - return self._dataobjects.fletcher32 + return self.id._meta.fletcher32 @property def fillvalue(self): """ fillvalue attribute. """ - return self._dataobjects.fillvalue + return self.id._meta.fillvalue @property def dims(self): @@ -410,9 +403,8 @@ def dims(self): @property def attrs(self): """ attrs attribute. """ - if self._attrs is None: - self._attrs = self._dataobjects.get_attributes() - return self._attrs + return self.id._meta.attributes + class DimensionManager(Sequence): From 2d881014963a578ec5134828d0964163b7998e83 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 21 Dec 2024 09:01:26 +0000 Subject: [PATCH 037/105] Transition to H5D cached backend is complete, though we still have the test_reference_list fail --- pyfive/dataobjects.py | 4 +++- pyfive/h5d.py | 18 +++++++++++------- pyfive/high_level.py | 9 ++++++++- tests/test_filter_pipeline_v2.py | 8 ++++++++ 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 8a8c371..fc43e99 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -326,7 +326,9 @@ def compression_opts(self): if GZIP_DEFLATE_FILTER in self._filter_ids: gzip_entry = [d for d in self.filter_pipeline if d['filter_id'] == GZIP_DEFLATE_FILTER][0] - return gzip_entry['client_data'][0] + #raise ValueError(f'gzip_entry {gzip_entry.keys()}') + key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] + return gzip_entry[key][0] return None @property diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ada759c..f0a4be8 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -19,7 +19,7 @@ class DatasetID: chunked data, lazily or not. This class has been deliberately implemented in such as way so as to cache all the relevant metadata, so that once you have an instance, it is completely independent of the parent file, and it can be used - efficiently in distributed threads without rereading the btree etc. + efficiently in distributed threads without thread contention to the b-tree etc. """ def __init__(self, dataobject): """ @@ -34,7 +34,7 @@ def __init__(self, dataobject): self.shape = dataobject.shape self.rank = len(self.shape) self.chunks = dataobject.chunks - + self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) @@ -45,7 +45,7 @@ def __init__(self, dataobject): # not a posix file on a posix filesystem self.avoid_mmap = True - if dataobject.dtype == ('REFERENCE', 8): + if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? 
#FIXME self.dtype = dataobject.dtype else: @@ -221,11 +221,10 @@ def _get_contiguous_data(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') with open(self._filename,'rb') as open_file: - view = np.memmap( - self.fh, dtype=('' % info @@ -322,7 +329,7 @@ def len(self): return self.shape[0] def iter_chunks(self, *args): - return self._dataobjects.id._iter_chunks(args) + return self.id.iter_chunks(args) @property diff --git a/tests/test_filter_pipeline_v2.py b/tests/test_filter_pipeline_v2.py index 9eeb9a3..b2e3af6 100644 --- a/tests/test_filter_pipeline_v2.py +++ b/tests/test_filter_pipeline_v2.py @@ -17,3 +17,11 @@ def test_filter_pipeline_descr_v2(): d = hfile['data'] assert d.shape == (10,10,10) assert_almost_equal(d[0,0,0], 1.0) + +def test_filter_pipeline_compression_opts_v2(): + + with pyfive.File(FILTER_PIPELINE_V2_FILE) as hfile: + assert 'data' in hfile + d = hfile['data'] + # the point of this test is to ensure we can actually retrieve the compression opts + x = d.compression_opts \ No newline at end of file From 26780222c8565192f53b58cd97b3cab3f51047d3 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 21 Dec 2024 09:10:26 +0000 Subject: [PATCH 038/105] Removed obsolete DatasetDataObjects --- pyfive/dataobjects.py | 120 ------------------------------------------ pyfive/high_level.py | 3 +- 2 files changed, 1 insertion(+), 122 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index fc43e99..3c56490 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -641,127 +641,7 @@ def is_datatype(self): else: return False -class DatasetDataObject(DataObjects): - """ - Subclass of DataObjects associated with one Dataset, - handles actual data access. - """ - def __init__(self, *args, **kwargs): - """ - Initialise via super class - """ - super().__init__(*args, **kwargs) - self._id = None - - # make this explicit, but controllable - self.order = 'C' - - try: - self.fh.fileno() - self.avoid_mmap = False - except (AttributeError, OSError): - # We can't use mmaps on S3 - self.avoid_mmap = True - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - self.msg_offset = msg['offset_to_message'] - version, dims, self.layout_class, self.property_offset = ( - self._get_data_message_properties(self.msg_offset)) - - @property - def id(self): - """ - Represents a PyFive approximation of an HDF5 dataset identifier. - Objects of this class provides methods for working directly with chunked data. - """ - # We want to make sure that this is lazy and cached - if self._id is None: - self._get_chunk_params() - self._id = DatasetID(self) - return self._id - - def get_data(self, args): - """ - Return the data pointed to in the DataObject. 
- """ - if self.layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif self.layout_class == 1: # contiguous storage - return self._get_contiguous_data(self.property_offset, args) - if self.layout_class == 2: # chunked storage - # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version - if isinstance(self.dtype, tuple): - # references need to read all the chunks for now - return self.id._get_selection_via_chunks(())[args] - else: - # this is lazily reading only the chunks we need - return self.id._get_selection_via_chunks(args) - - - - def _get_contiguous_data(self, property_offset, args): - data_offset, = struct.unpack_from(' Date: Sun, 22 Dec 2024 07:45:34 +0000 Subject: [PATCH 039/105] Expose package version in code, and separate testing requirements out in package metadata. --- pyfive/__init__.py | 3 ++- setup.cfg | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 8e67261..0fb5696 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -7,5 +7,6 @@ from pyfive.high_level import File, Group, Dataset from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype from pyfive.h5py import Datatype, Empty +from importlib.metadata import version -__version__ = '0.4.0.dev' +__version__ = version diff --git a/setup.cfg b/setup.cfg index 3c6e79c..d942621 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,13 @@ [bdist_wheel] universal=1 + +[metadata] +name = pyfive +version = '0.4.0.dev_h5nc' + +[options.extras_require] +testing = + pytest + h5py + netCDF4 + From 503cb45648094dd7d412f0019cbe157d6fff05ff Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 22 Dec 2024 07:46:00 +0000 Subject: [PATCH 040/105] Attempt to get b-tree logging in h5d --- pyfive/h5d.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index f0a4be8..3ea20e4 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -6,6 +6,8 @@ from pyfive.core import Reference, UNDEFINED_ADDRESS from io import UnsupportedOperation import struct +import logging +from importlib.metadata import version StoreInfo = namedtuple('StoreInfo',"chunk_offset filter_mask byte_offset size") @@ -167,12 +169,19 @@ def index(self): def _build_index(self, dataobject): """ - Build the chunk index if it doesn't exist + Build the chunk index if it doesn't exist. This is only + called for chunk data, and only when the variable is accessed. + That is, it is not called when we an open a file, or when + we list the variables in a file, but only when we do + v = open_file['var_name'] where 'var_name' is chunked. 
+ """ if self._index is not None: return + logging.info(f'Building chunk index in pyfive {version}') + chunk_btree = BTreeV1RawDataChunks( dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) From ac96f46194686b564dd876d5554e2c36361b1ce5 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 28 Dec 2024 18:04:13 +0000 Subject: [PATCH 041/105] Cleared a few bugs and misunderstandings which arose from workign with h5netcdf (closed #15) --- pyfive/__init__.py | 2 +- pyfive/h5d.py | 9 +++++- pyfive/high_level.py | 2 ++ setup.cfg | 28 ++++++++++++++++- setup.py | 40 ++---------------------- tests/h5netcdf_test.hdf5 | Bin 0 -> 19528 bytes tests/netcdf4_classic.nc | Bin 6796 -> 8330 bytes tests/test_h5netcdf.py | 58 +++++++++++++++++++++++++++++++++++ tests/test_reference_list.py | 33 +++++++++++++++----- 9 files changed, 124 insertions(+), 48 deletions(-) create mode 100644 tests/h5netcdf_test.hdf5 create mode 100644 tests/test_h5netcdf.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 0fb5696..8b1b869 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -9,4 +9,4 @@ from pyfive.h5py import Datatype, Empty from importlib.metadata import version -__version__ = version +__version__ = version('pyfive') diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 3ea20e4..62044af 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -180,7 +180,12 @@ def _build_index(self, dataobject): if self._index is not None: return - logging.info(f'Building chunk index in pyfive {version}') + # look out for an empty dataset, which will have no btree + if np.prod(self.shape) == 0: + self._index = {} + return + + logging.info(f'Building chunk index in pyfive {version("pyfive")}') chunk_btree = BTreeV1RawDataChunks( dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) @@ -287,6 +292,8 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') else: true_dtype = None + if np.prod(self.shape) == 0: + return np.zeros(self.shape) array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 9b174e3..23b30f9 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -353,6 +353,8 @@ def dtype(self): return np.dtype(self.id.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') + except TypeError as e: + return self.id.dtype @property def value(self): diff --git a/setup.cfg b/setup.cfg index d942621..7d2df77 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,11 +3,37 @@ universal=1 [metadata] name = pyfive -version = '0.4.0.dev_h5nc' +version = 0.4.0+h5nc +description = A pure python HDF5 reader +long_descriptino = file: README.rst +long_description_content_type = text/x-rst +url = https://github.com/ncas-cms/pyfive +author = Jonathan J. 
Helmus +author_email = jjhelmus@gmail.com (with additions from Bryan Lawrence) +maintainer = Valeriu Predoi +maintainer_email = V.Predoi@ncas.ac.uk +license = BSD +classifiers = + Development Status :: 4 - Beta + Intended Audience :: Developer + Intended Audience :: Science/Research + License :: OSI Approved :: BSD License + Operating System :: OS Independent + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Topic :: Scientific/Engineering + +[options] +packages = pyfive +python_requires >= 3.10 +install_requires = + numpy [options.extras_require] testing = pytest h5py netCDF4 + h5netcdf diff --git a/setup.py b/setup.py index 30701a8..deaace8 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,3 @@ """ Setup script for pyfive. """ -from setuptools import setup, find_packages - -# get the long descriptions from the README.rst file -with open('README.rst') as f: - long_description = f.read() - -# get the version from the __init__.py file -with open('pyfive/__init__.py') as f: - for line in f: - if line.startswith('__version__'): - version = line.strip().split()[-1][1:-1] - break - -setup( - name='pyfive', - version=version, - description='A pure python HDF5 file reader', - long_description=long_description, - url='https://github.com/jjhelmus/pyfive', - author='Jonathan J. Helmus', - author_email='jjhelmus@gmail.com', - license='BSD', - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Topic :: Scientific/Engineering', - ], - packages=['pyfive'], - install_requires=['numpy'], -) +from setuptools import setup +setup() diff --git a/tests/h5netcdf_test.hdf5 b/tests/h5netcdf_test.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..e6cdbfb98ea64ebcbf0f7574655b8a0d4cdfced4 GIT binary patch literal 19528 zcmeHPeQZ?65ubM-=8O;H0I|(S=+UGqMZ~hP9YKPL@!5w>jqUhcO4_RDv;E9nbG~!# zo&i&peyF74kBFp7+K5O|l}2iyfs)p!5TztlqozS^N))83ZB!Cesc9mmDE*@lRk$-d zJJ09C#`eWHpuUxT`|)Py?f!OVKW2B|f#$~0g89ql138}$JW!&7@-wPkb1SDvT%LDJ zU=qK*xv|r=_JtoE?9cvx+x2Nh?_Q*{{kE0^f1bzwj=R{O?*fBDC8|_AU7Xs;9{4LM zV8A@^gAaBlQa#ZG()zF_ilScqDh3!@qhbL5X3Aqg5iCKvlod16k*H;*`SDW2P!P}y zIVbZHE^Z>@B=ww2wl>JQ``Cl?&v<{Ms=fdWqnr{A!T+wmrG^j_xw~N@Q~(m>k5_4?E`6;T;E?qqy`i#j3RBO)!Wx!(-VEURE<;se+XT|sQ|0%gJA3Ca2Qh;`5WD@ra@5M^uK`0c!nieKo$&iA-8*##tdR0l=HgCgXa7pXY*F?}2 zi)RA(7Y!s$E7%wc^u-6vWG0?U26~39Kr&^GJ=zsb$D=(7vmzjlU@Q()u9&!0z4(-z z_-;SFHd_+8{c3Jwy0_Q^kD?=Sl6Lbcm2eyXUd<%hgQ?NWucZkcF58ji{Kie}>C|B@ zut}YYKr8pBZSLPoQa(PlrXPEOiQA1{BJDN|{`DlKq_L#ukx)F5*d9#`nf%^tlqp5> z38D`lId_Izt<*3SoCd2DN)kJz(-_Q+qgo-e#rRJ=y@lkFK2mIl$4gNaWc*fEGq-m3 zu3L){HvGmdy1>#Kkj4_8Gt^jU=sJB!7dlT*wPpvwVW1NGp&X>A38qr%zIZZfnHk0V zsvbx8i(0Hiyp&JcL^9$=u2<`Jwv6nBUw`0=OstzyC6R|Idt7=QI>xwO_1*yA(`-Z! 
z-L}RGU-9$PLz2PS>Tx$Lhu0grni~-`1gQu)_pLswN7NE~fM0&-R~KMQu)URQ%k@)X z42nt&>t>VYp{|PDmo=f#<)+IpC+St1P&!CawE@#I(?B>{;tT&m zO(*F>&l4Pb^J`$NtQ*V`^)uxP?x?S!gE15eLfr>-s9I+5iE+<^T zceTA;EbE-QNOOFG8T0S+yMOji)vS4m$>NT}r&aE5;%2}?Gy#@pZQQiBgu4Ki;xn>J zQPrp#vSB5bKk=2}j|k9_buK$j$;A~f4$|ASnKf{)MzbDjilAl`4PpKc$~9YtVk(DH zAryHse$QOw2k&{eEffwn!9HyxHE_S-cT}%4drrMc{Kq@0owtN4PMZN#XX*I-C)rrN z5G+KvvnkZn+0+(niZH^Z2q7Wd7qy}p(;Clof|#^~y-%fSH24$b;3Rqn>qI~$NK2}g z!W8kJ(jA<{OOK8Wvg*%%3(-5sFrHrvU3uj1zOK%uCXyP#TKrp79Vh>}uzLKn+}>g; zG{@(5Vh#ch0uBNW0uBNW0uBNW0yjDWd%oJ+U0#0CC_4VC3kF-uj!J=L+A9ki*`k+; zWfY(Xpcer5?0cf=p6Fi8SufPYg*Z^0a;n^VE-9P-m^^J~xZI=DCUyuN1RMk$1RMk$ z1RMmejldMP$a=Qe6{yPqL&>;B$5f}f1fWhU>cO7OzD8-|CEHf5P7sRCO(qq#KcnOR zPpsp*$pK$6EzkDhQ>w#k9X>sG>dw>3+Kkq|{I?*8gB*pI37MS9oQrLxYS{Vhg%`6+ zvHWw~7IJ09k8wVfPS+!H52eeYahYW9IA8eA0b4|r=*S~6luva7MBunn7eox1fcrH! zmpgLmj`+te@g!hgb@5{SH5vi2N`q)ye@8xcfj%JycP+bkBGlR3p*CY&_UUdcjpz;T zW+*4y8*0r$DcNfojMBcrbQ}tvos)$)fSTUCRq~#Fud+lPKQACUS`6Xwb6tqB=@sK)5`9vYa7k(b-fISevC_nM(OEh}>0>p25VC3w;b1 zg8MMF9KQ0P=V^EngUjIAzZ^LYk0HTQc;mj-m*F!QTm>V+$6to`kRSjjK6~L6cpZZ) z;0LjT@4$WxmV(h1z5u(Bb|w5||8pM!iQy7B{>SgijUbFWE`J16`^#kl(D2IOt6S0T zI9UK_@KcUk$j;32BNq^Kv)lazi&UZHWvP){2D;tla)Fiz8Nak#$yEX^I`)BomQdS< zz`|ndCX1ZC>BWNkg@fxtQj-5~-yRiK6Arm21_T?C0O=ul>wWtQe*SY!rbiBaV!+<> zmrRNYYBGt$G$v-<#C0||-$~f;^;`h-l(dPcZ!~{+D0}uIg;ua2@y)eUUD81H9CP6h zo;^MmFib=mW!TSfdwf65D<RG_~B_16Q_^uK2g}s=pYAeO&do1v}EIK{IW|)je5S^;aM@YaqZbyxWT37pPme zHdc!t9l$Ry)NR=pp=HbP$dblu4h&kuf6%N9j|gAkP%?p^F|y1) z`DlZIBE-&(XEvmohDU5T^D_A5e*mr~j}~=*3J_gp09A!c0c6iqc+&N941>Ob!i1tmF4eHnAW1 P)C0_$+onu7{onrqKFWoL literal 0 HcmV?d00001 diff --git a/tests/netcdf4_classic.nc b/tests/netcdf4_classic.nc index 89d6cdda5ab01b889e280f064c3cbbc6c0b86bac..260b2ee37d90547e13d81c7caba72b16bab37171 100644 GIT binary patch literal 8330 zcmeI1O-vI}5P+xMwlsAS5DaP*R}RDgX`obyF$74v{Ag_rMU7mVLYJ^cT5K0Wh>3>y z6Y!=d<3SUnM-C<)QEwXcU}A_zebRz*9hICd0xjRDnusJ@Q9yljtJfbTIQ2HX?l_a9~hG%svqBj5D~>PkEJ zQ`bnWE185!tRiMaEZTS0j3@gN@l-O>6*C3Qivs2-_8_5Y+&Il%1c=k5;+8X<83{-2 zF+0OpNV~;2z@ER!(`Vfe5|Q3B(rRr>tNCu<3kT>}GgIB=!6uqRALNJP<@<_voi*9E zIr*|ye@bAL{Xf8NFnqwy|8F;p2pm$L}4gHn*tVABk7ZsWIWT!(RS2E^47d7S`#uyM1-UoDcdbrWzA{eAQHNd2qK(JczUT zRnZRsvdfod*^$Ngk(HtEVI-_NUaT)U00(g}`6SIoQMuWE*b+Qs2`(&8w0@+SC1mOJ z>^{JDG8lcxz#_^VR{$4eVdwSK;`wi@sJ@6RNII}?$|z#Nn2JZv<#Xd!-m&p8DJX04 zL*yo{Jh^Ins1dyv1FbzAJQ)l%j%G$qv^NKv!@(9Z{l()qHyK!*pMukeSjFwu0u@F_ z@pcREf$$1~*o^6X&)4ay`Qx4X5n@%mDCmj|Gn>dB`t;fhrKfK1WK5L*-9KoL*`b|V4$%}>AYQyeCQ45Ao84x=8IPDLmi24JuLhJH$fTWg!E#p9f5mhy_^F-eR}7ZHlGy^O T*XA0@R+(d=GRfS;{g=N0G}BJp delta 518 zcmeBj>@l4nqtYYA00ssSieZz{m+k%@Ed_Gbz>cXo3Wo8F+w@ zg@GYHFWET0xTGjExr9N0g8?K21PTmHK#~!N1q49+4rUO^z+exdprTA5MLS?#Xbqyd#atMKxGcy961P0*n0*dhD0Npp4Q^9U>BD1_KOc%`P{mnnu zfV~I|QO3zRtS*d9lQ*);vFx#MT)jDvV>9a{4h5n2Fx61{20uuOfAR(vc^;6nK`sG> FBmhZnS@!?{ diff --git a/tests/test_h5netcdf.py b/tests/test_h5netcdf.py new file mode 100644 index 0000000..140f11d --- /dev/null +++ b/tests/test_h5netcdf.py @@ -0,0 +1,58 @@ +# tests the variables found in the file h5netcdf_test.hdf5, +# which is produced by the write_h5netcdf test routine in the h5netcdf package +# +import pyfive +import h5py +import warnings + + +def test_file_contents(): + p5file = pyfive.File('h5netcdf_test.hdf5') + h5file = h5py.File('h5netcdf_test.hdf5') + + expected_variables = [ + "foo", + "z", + "intscalar", + "scalar", + "mismatched_dim", + "foo_unlimited", + "var_len_str", + "enum_var", + ] + + cannot_handle = ['var_len_str', 'enum_var'] + + p5contents = set([a for a in p5file]) + h5contents = set([a for a in h5file]) + + assert 
p5contents == h5contents + + for x in list(set(expected_variables) - set(cannot_handle)): + try: + # check we can get the variable + p5x, h5x = p5file[x], h5file[x] + if p5x is None: + warnings.warn(f'Had to skip {x}') + + if isinstance(h5x,h5py.Dataset): + # check the dtype + assert p5x.dtype == h5x.dtype + # check the shape + assert p5x.shape == h5x.shape + # now look into the details + if h5x.shape != (): + # do the values match + sh5x = str(h5x[:]) + sp5x = str(p5x[:]) + assert sh5x == sp5x + # what about the dimensions? + dh5x = h5x.dims + dp5x = p5x.dims + assert len(dh5x) == len(dp5x) + print(p5x) + except: + print('Attempting to compare ',x) + print(h5file[x]) + print(p5file[x]) + raise \ No newline at end of file diff --git a/tests/test_reference_list.py b/tests/test_reference_list.py index a1aef3c..9080184 100644 --- a/tests/test_reference_list.py +++ b/tests/test_reference_list.py @@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal, assert_almost_equal import pyfive +import h5py DIRNAME = os.path.dirname(__file__) NETCDF4_CLASSIC_FILE = os.path.join(DIRNAME, 'netcdf4_classic.nc') @@ -13,11 +14,29 @@ def test_read_netcdf4_dims(): - with pyfive.File(NETCDF4_CLASSIC_FILE) as hfile: + # We want to know about this warning and deal with it + #warnings.simplefilter('error', UserWarning) - with warnings.catch_warnings(record=True) as caught_warnings: - dimensions_x = hfile['x'].dims - if caught_warnings: - for warning in caught_warnings: - print('Caught warning ', warning) - raise NotImplementedError('We need to fix this warning!') \ No newline at end of file + # not using a context manager so we can compare and contrast in debugging + hfile1 = h5py.File(NETCDF4_CLASSIC_FILE) + dimensions_x1 = hfile1['x'].dims + dimensions_v1 = hfile1['var1'].dims + + hfile2 = pyfive.File(NETCDF4_CLASSIC_FILE) + dimensions_x2 = hfile2['x'].dims + dimensions_v2 = hfile2['var1'].dims + + # as we created no explicit data for this dimension, this is the case where + # getitme goes to storage and finds an UNDEFINED_ADDRESS and returns zeros. + y = hfile2['x'][:] + + assert len(dimensions_v1) == len(dimensions_v2) + assert len(dimensions_x1) == len(dimensions_x2) + + # The dimension scale spec is here: https://support.hdfgroup.org/documentation/hdf5-docs/hdf5_topics/H5DS_Spec.pdf + # The issue is that we don't support reference_lists. + # But we don't know if this matters or not, given this is failing on the dimensions of a dimension + # ChatGPT says: + # + # - NetCDF4 Classic files often abstract away dimensions, so their behavior in HDF5 tools + # like h5py may not always align with expectations for standard HDF5 datasets. 
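
The comment block above describes the dimension-scale / reference-list gap without showing what is actually stored in the file, so here is a minimal sketch of how one might inspect the attributes involved. It is illustrative only and not part of any patch in this series: it assumes h5py is installed, that it is run from the repository root so the relative path to tests/netcdf4_classic.nc resolves, and the helper name show_dimension_scale_attrs is invented for this example.

import h5py

def show_dimension_scale_attrs(path, varname):
    # DIMENSION_LIST lives on the data variable; REFERENCE_LIST lives on
    # each dimension-scale dataset (see the HDF5 dimension scale spec
    # linked in the comment above).
    with h5py.File(path, 'r') as f:
        var = f[varname]
        print('DIMENSION_LIST:', var.attrs.get('DIMENSION_LIST'))
        for dim in var.dims:
            for _, scale in dim.items():
                print(scale.name, '->', scale.attrs.get('REFERENCE_LIST'))

if __name__ == '__main__':
    # 'var1' and netcdf4_classic.nc are the same objects exercised by
    # test_read_netcdf4_dims above.
    show_dimension_scale_attrs('tests/netcdf4_classic.nc', 'var1')
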
From b586db0b8710323f1fdefb535c752a758b0d887c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 30 Dec 2024 17:04:07 +0000 Subject: [PATCH 042/105] Continue to use open file in h5d, closes #18 --- MANIFEST.in | 1 + pyfive/h5d.py | 52 +++++++++++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index e02cfac..9ae33e4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ # Include the license file include LICENSE.txt +include README.rst diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 62044af..39932c9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -26,12 +26,19 @@ class DatasetID: def __init__(self, dataobject): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything - we want so it can be used after the parent file is closed, without needing - to go back to storage. + we want so that the only file operations are now data accesses. """ self._order = dataobject.order - self._filename = dataobject.fh.name + self._fh = dataobject.fh + try: + self._filename = self._fh.name + dataobject.fh.fileno() + self.avoid_mmap = False + except (AttributeError, OSError): + # maybe this is an S3File instance? + self._filename = self._fh.path + self.avoid_mmap = True self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -40,13 +47,6 @@ def __init__(self, dataobject): self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) - try: - dataobject.fh.fileno() - self.avoid_mmap = False - except (AttributeError,OSError): - # not a posix file on a posix filesystem - self.avoid_mmap = True - if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? #FIXME self.dtype = dataobject.dtype @@ -219,11 +219,10 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - with open(self._filename,'rb') as open_file: - # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(open_file, dtype=self.dtype, mode='c', - offset=self.data_offset, shape=self.shape, order=self._order) + # return a memory-map to the stored array + # I think this would mean that we only move the sub-array corresponding to result! 
+ view = np.memmap(self._fh, dtype=self.dtype, mode='c', + offset=self.data_offset, shape=self.shape, order=self._order) result = view[args] return result except UnsupportedOperation: @@ -234,11 +233,11 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - with open(self._filename,'rb') as open_file: - ref_addresses = np.memmap( - open_file, dtype=(' Date: Wed, 1 Jan 2025 16:04:46 +0000 Subject: [PATCH 043/105] Test for true bytes-io testing (needed for h5netcdf test compatability) --- tests/test_file_like.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_file_like.py b/tests/test_file_like.py index 0933fc0..6d46970 100644 --- a/tests/test_file_like.py +++ b/tests/test_file_like.py @@ -2,6 +2,7 @@ import io import os +import h5py import numpy as np from numpy.testing import assert_array_equal, assert_almost_equal @@ -57,3 +58,25 @@ def test_read_latest_fileobj(): ref_attr6 = u'Test' + b'\xc2\xa7'.decode('utf-8') assert dset3.attrs['attr6'] == ref_attr6 assert isinstance(dset3.attrs['attr6'], string_type) + + +def write_compressed_tobytes(file_like): + """ Make an HDF file for testing """ + + f = h5py.File(file_like, 'w', libver='earliest') + + # gzip compressed dataset + f.create_dataset('dataset1', shape=(21, 16), chunks=(2, 2), dtype=' Date: Thu, 2 Jan 2025 08:38:48 +0000 Subject: [PATCH 044/105] Deals with filename issues (closes #19) (and deals with another iobytes isseue around scalars on contigous storage). --- bnl/opening_speed.py | 118 ++++++++++++++++++++++++++++++++++++------- pyfive/h5d.py | 11 ++-- pyfive/high_level.py | 2 +- 3 files changed, 108 insertions(+), 23 deletions(-) diff --git a/bnl/opening_speed.py b/bnl/opening_speed.py index db38d96..9928bc8 100644 --- a/bnl/opening_speed.py +++ b/bnl/opening_speed.py @@ -2,29 +2,111 @@ import pyfive from pathlib import Path import time +import s3fs -mypath = Path(__file__).parent -fname = 'da193o_25_day__grid_T_198807-198807.nc' -p = mypath/fname +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' +S3_BUCKET = 'bnl' -h1 = time.time() -f1 = h5py.File(p,'r') -h2 = time.time() -f2 = pyfive.File(p) -h3 = time.time() +def test_speed(s3=False): + mypath = Path(__file__).parent + fname1 = 'da193o_25_day__grid_T_198807-198807.nc' + vname1 = 'tos' + p1 = mypath/fname1 + fname2 = 'ch330a.pc19790301-def-short.nc' + vname2 = 'UM_m01s16i202_vn1106' + p2 = Path.home()/'Repositories/h5netcdf/h5netcdf/tests/'/fname2 + + do_run(p1, fname1, vname1, s3) + + do_run(p2, fname2, vname2, s3) + + +def do_s3(package, fname, vname): + + fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + uri = S3_BUCKET + '/' + fname + with fs.open(uri,'rb') as p: + t_opening, t_var, t_calc, t_tot = do_inner(package, p, vname) + + return t_opening, t_var, t_calc, t_tot + +def do_inner(package, p, vname, withdask=False): + h0 = time.time() + pf1 = package.File(p) + h3 = time.time() + t_opening = 1000* (h3-h0) + + h5a = time.time() + vp = pf1[vname] + h5 = time.time() + t_var = 1000* (h5-h5a) + + h6a = time.time() + sh = sum(vp) + h6 = time.time() + t_calc = 1000* (h6-h6a) + + t_tot = t_calc+t_var+t_opening + + pf1.close() + return t_opening, t_var, t_calc, t_tot + + + +def do_run(p, fname, vname, s3): + + if s3: + import s3fs + + + # for posix force this to be a comparison from memory + # by ensuring file is in disk cache and ignore first access + # but we then do an even number of accesses to make sure we are not + # 
biased by caching. + n = 0 + datanames = ['h_opening','p_opening','h_var','p_var','h_calc','p_calc','h_tot','p_tot'] + results = {x:0.0 for x in datanames} + while n <2: + n+=1 + + if s3: + h_opening, h_var, h_calc, h_tot = do_s3(h5py, fname, vname) + p_opening, p_var, p_calc, p_tot = do_s3(pyfive, fname, vname) + + else: + h_opening, h_var, h_calc, h_tot = do_inner(h5py, p, vname) + p_opening, p_var, p_calc, p_tot = do_inner(pyfive, p, vname) + + if n>1: + for x,r in zip(datanames,[h_opening,p_opening,h_var,p_var,h_calc,p_calc,h_tot,p_tot]): + results[x] += r + + for v in results.values(): + v = v/(n-1) + + + print("File Opening Time Comparison ", fname, f' (ms, S3={s3})') + print(f"h5py: {results['h_opening']:9.6f}") + print(f"pyfive: {results['p_opening']:9.6f}") + + print(f'Variable instantiation for [{vname}]') + print(f"h5py: {results['h_var']:9.6f}") + print(f"pyfive: {results['p_var']:9.6f}") + + print('Access and calculation time for summation') + print(f"h5py: {results['h_calc']:9.6f}") + print(f"pyfive: {results['p_calc']:9.6f}") + + print('Total times') + print(f"h5py: {results['h_tot']:9.6f}") + print(f"pyfive: {results['p_tot']:9.6f}") + +if __name__=="__main__": + test_speed() + test_speed(s3=True) -v = f2['tos'] -d = v._dataobjects -h4 = time.time() -d._get_chunk_addresses() -h5 = time.time() -print("File Opening Time Comparison") -print(f'h5py: {h2-h1:9.6f}') -print(f'pyfive: {h3-h2:9.6f}') -print(f'Additional times: {h4-h3:9.6f}, {h5-h4:9.6f}') -print(f'Total times: H5 {h4-h3:9.6f}, P5 {h5-h4:9.6f}') diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 39932c9..7920ffa 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -32,12 +32,12 @@ def __init__(self, dataobject): self._order = dataobject.order self._fh = dataobject.fh try: - self._filename = self._fh.name + self._filename = dataobject.filename dataobject.fh.fileno() self.avoid_mmap = False except (AttributeError, OSError): # maybe this is an S3File instance? - self._filename = self._fh.path + self._filename = getattr(self._fh,'path','None') self.avoid_mmap = True self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape @@ -247,14 +247,16 @@ def _get_direct_from_contiguous(self, args=None): We read the entire contiguous array, and pull out the selection (args) from that. This is a fallback situation if we can't use a memory map which would otherwise be lazy. This will normally be when we don't have a true Posix file. + # FIXME: We can probably make this lazy by using the indexer to work out which bytes + # are where ... """ itemsize = np.dtype(self.dtype).itemsize - num_elements = np.prod(self.shape) + # need to impose type in case self.shape is () in which case numpy would return a float + num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize # we need it all, let's get it all (i.e. this really does read the lot) - self._fh.seek(self.data_offset) chunk_buffer = self._fh.read(num_bytes) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) @@ -302,6 +304,7 @@ def _get_selection_via_chunks(self, args): chunk_coords = tuple(map(mul, chunk_coords, self.chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: + # FIXME: Why do I assume it's always a V1 Btree? 
chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 23b30f9..d1350dd 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -196,7 +196,7 @@ def __init__(self, filename, mode='r'): raise ValueError( 'File like object must have a seek method') self._fh = filename - self.filename = getattr(filename, 'name', None) + self.filename = getattr(filename, 'name', "None") else: self._fh = open(filename, 'rb') self._close = True From 32ad75de01eeb7f12a095f1fb5e5a8474b9e24be Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 2 Jan 2025 15:57:00 +0000 Subject: [PATCH 045/105] Addressing, I think, upstream issue 53, and includes a test case I should have included previously --- pyfive/h5d.py | 4 ++-- tests/test_file_like.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7920ffa..35ef508 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -259,7 +259,7 @@ def _get_direct_from_contiguous(self, args=None): # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) chunk_buffer = self._fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() chunk_data = chunk_data.reshape(self.shape, order=self._order) return chunk_data[args] @@ -306,7 +306,7 @@ def _get_selection_via_chunks(self, args): if self.filter_pipeline is not None: # FIXME: Why do I assume it's always a V1 Btree? chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) - chunk_data = np.frombuffer(chunk_buffer, dtype=dtype) + chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] if true_dtype is not None: diff --git a/tests/test_file_like.py b/tests/test_file_like.py index 6d46970..6c0f0cf 100644 --- a/tests/test_file_like.py +++ b/tests/test_file_like.py @@ -69,6 +69,9 @@ def write_compressed_tobytes(file_like): f.create_dataset('dataset1', shape=(21, 16), chunks=(2, 2), dtype=' Date: Fri, 3 Jan 2025 10:31:54 +0000 Subject: [PATCH 046/105] Fix location of files so tests run properly from parent directory. 
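
For reference, the idiom applied in this patch resolves test data relative
to the test module itself rather than the current working directory, so the
tests still pass when pytest is invoked from the repository root. A minimal
sketch of the pattern (names as used in the diff below):

    from pathlib import Path
    import pyfive

    DIRNAME = Path(__file__).parent
    p5file = pyfive.File(DIRNAME / 'h5netcdf_test.hdf5')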
--- tests/test_h5netcdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_h5netcdf.py b/tests/test_h5netcdf.py index 140f11d..a92c56c 100644 --- a/tests/test_h5netcdf.py +++ b/tests/test_h5netcdf.py @@ -4,11 +4,13 @@ import pyfive import h5py import warnings +from pathlib import Path +DIRNAME = Path(__file__).parent def test_file_contents(): - p5file = pyfive.File('h5netcdf_test.hdf5') - h5file = h5py.File('h5netcdf_test.hdf5') + p5file = pyfive.File(DIRNAME/'h5netcdf_test.hdf5') + h5file = h5py.File(DIRNAME/'h5netcdf_test.hdf5') expected_variables = [ "foo", From 22333955595d571f6e892037f5b492f9937d491d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 7 Jan 2025 21:08:38 +0000 Subject: [PATCH 047/105] Well, I think this is a fix for #23, and it's so complex I'm committing it now in case my laptop dies and I can't repeat this :-) --- pyfive/btree.py | 37 ++++++++++++++++ pyfive/dataobjects.py | 91 +++++++++++++++++++++++++++++++-------- pyfive/h5d.py | 11 +++-- pyfive/misc_low_level.py | 27 ++++++++++++ tests/data/issue23_A.nc | Bin 0 -> 23279 bytes tests/data/issue23_B.nc | Bin 0 -> 44746 bytes 6 files changed, 144 insertions(+), 22 deletions(-) create mode 100644 tests/data/issue23_A.nc create mode 100644 tests/data/issue23_B.nc diff --git a/pyfive/btree.py b/pyfive/btree.py index 38518fc..5d34258 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -8,6 +8,7 @@ from .core import _padded_size from .core import _unpack_struct_from_file +from .core import _unpack_struct_from from .core import Reference @@ -440,6 +441,25 @@ def _parse_record(self, record): creationorder = struct.unpack_from(" 1: + raise NotImplementedError('Multiple Attribute Info Messages not supported') + offset = attr_info[0]['offset_to_message'] + data = _unpack_struct_from(ATTR_INFO_MESSAGE, self.msg_data, offset) + heap_address = data['fractal_heap_address'] + # I can't find any documentation on this, but at least some + # files seem to use this to indicate no attribute info. + if heap_address == EMPTY: + return {} + name_btree_address = data['name_btree_address'] + order_btree_address = data['creation_order_btree_address'] + heap = FractalHeap(self.fh, heap_address) + ordered = (order_btree_address is not None) + if ordered: + btree = BTreeV2AttrCreationOrder(self.fh, order_btree_address) + else: + btree = BTreeV2AttrNames(self.fh, name_btree_address) + adict = dict() + for record in btree.iter_records(): + data = heap.get_data_v2(record) + name, value = self._parse_attribute_msg(data,0) + adict[name] = value + return adict + + def unpack_attribute(self, offset): """ Return the attribute name and value. """ + return self._parse_attribute_msg(self.msg_data, offset) - # read in the attribute message header - # See section IV.A.2.m. The Attribute Message for details - version = struct.unpack_from('> 4) & 3 # bit 4-5 + version = firstbyte >> 6 # bit 6-7 + match idtype: + case 0: # managed + if version != 0: + raise ValueError('Unexpected version in fractal heap') + obytes = self._managed_object_offset_size + offset = _unpack_integer(obytes, record['heapid'],1) + sbytes = self._managed_object_length_size + size = _unpack_integer(sbytes, record['heapid'], obytes+1) + return self.managed[offset:offset+size] + case 1: # tiny + raise NotImplementedError + case 2: # huge + raise NotImplementedError + case _: + raise NotImplementedError + def _min_size_integer(self, integer): """ Calculate the minimal required bytes to contain an integer. 
""" return self._min_size_nbits(integer.bit_length()) diff --git a/tests/data/issue23_A.nc b/tests/data/issue23_A.nc new file mode 100644 index 0000000000000000000000000000000000000000..494a90c8fccd29adb9dc718900551a4021e053a2 GIT binary patch literal 23279 zcmeHP3s6+o89sM+fd!UF6s0~YtHej;QCL=^78GV#A4R|cCW(@;F6@=v6n5$ELL{Rm zZPR8FlT0SVkV+}BO{OMws?dO%q{#%^ns%zSO;t=WX(yN%(ni~~2}vo~^PhA6U0z#Y zYMjxYGu-{}dH?4-|2ccP|F`Gijrr?iqUJ_v^m>Bfst8xM@stda@#D0oGK)9nmuL)~ znnf&48!57|%2*ktAqo_kHJ|9H&}zuo5*RlugK}D8AbKcj@wgjQx8LP)`^Ze15kW5v zy)5g~GSf3n@$vMY^g^37Sh*lPb)A++S%Yv3^=Rn&B10pQ z2mNYE0>y%=X)99wRgN8Ii^t=2xZHNX>Lc+~2R?Qq8wAyX1(Xmmq!6lzmV^sHH|Ld> zGAIE$fFyrYp|z-VQ*n{4)RMQE>G#PSjp^omSFOu2x(jHJ1@DP3*KiEU2n+?l2s9BG zE=&q2nW!g;_tO@_m_2BtI5sQ8d8>A0k#|L5+zd^G9jN{BjcdVv@8`G zxp!zza&Y9g!M;gD2N%S!0J<4(gRq%E_Y?>yd8j?62EiPS2`ysV+6*1&XFy4D^4o0C$UvS3R^{l7HU&jb-(1&z}1 z3Kwj`v{T($pbmWT>1YM9rPaG#ey(i{%>->7YPDBYeKxnp>vw`Zgayaw^V{7HyVqfJ z+iO)eMrLTV7M)3#zuuwJ+s#d>Vm4@GOxR#4-ppp2eWCIWp$N|A6;U1+2BzcYhH;c1 z3pnt>0MoI2v}h00v3-p5sZpn|rc68x!b2cD_!+3*ESx&2BDf=t`@_e2r~AnPO<@`ZxV20fWw zi+Qe96bhdWH;N8je_HBNM0Xu8QhVXUoM8PH;bfdIJjdXwoU$hrbfw{VIZ7JZ<7CLbuQb$m_yvfX>hvPso)A)@}gI zk;4FM4uZbM<7P|hd0}dN_{YD*v&*>}(tSo}0ZK1VkxE^eeq-|%*1DgQ@ z8B`B#$zsD^E=-S>ZQFp{l~S?SVIHpM|3XvDfN@U1)|f?;8SwT-pV8-XSE)u*M&_!t zOjBA$CQR1|X3BWh28;u0`zhs%h#iYYCkr#3oqcRG5L>n2IT02!UAuy#)5!x9+RIE0f~S_Kq4R!kO)WwBmy@&0`N$= z-{d6UjEI{%Hz8p`_WAZ7e53nRT59UTg=uL^5A5H6?AW8-3VmreeBX@Kp`J47kL4x| zE5vI<%*3Zjl$b81L_i`S5s(N-1SA4C4+6~j6>HAkaz6fV-In92#wDw>4nKEBi6Xn( z^k?`u*X8c4IN17+gKxYv_>`k*#U1%)`fO`k)-75PU6}o1fypxX*t1<9x7yAf{nMTI zp6=My`0V9(hg#P(Hx2frl{TN-_qwh;y>?Ied%rey!V?!)G$t+ed~mAok3BEjKPr7J zd2r>i6a7a$rlH~cN-HW(T>ROSJ&j3O@Aagfe1Bi3^TdVzz0HShuRdS-^ET~ZTy1aH zk=n~I{c>K9`<|9NJDeXK-?Vpj!<#cB$+6_tLpP6!$dO0{BmxoviNOCI0-T(buTdBg zZ{!(F_I+WL0d~_gjvhmg8cJvY z?hT7M{WfL-CN}}n4wDG+@A65s9QvvzB;wawZm0qbJo)% z%3S_N1O1sAP)x_2odZNV&>|Dm0HT~YR<3HqA!@*_8_Byunp_OmxJs?5vDK=6r^mt1 zj>yMZJLJm+h3{1Y6F!w7)FOn}Na>+R?|Ab0^KQO|n%yeWyaQx=d1 zhH>vNw(Yw1SA3y Q0f~S_Kq4R!xRDX~4@yw}H2?qr literal 0 HcmV?d00001 diff --git a/tests/data/issue23_B.nc b/tests/data/issue23_B.nc new file mode 100644 index 0000000000000000000000000000000000000000..bcc505c8133d80a4549892af0dd02c52d39999db GIT binary patch literal 44746 zcmeHQ3v?7!n!cS*VrUW`3Wx}LL3Rn6GkJmdOVKch>Sb4M;%>PSJxF4XB-iAXV-O)iZkF*M$hUvqvx!TJ+l9QAJtuX zK?FG?-V5nlb>H{9|NZZ+|Nn2*=7w;6L10=SD=&{ocTl(&AK)P+B7#2m(g!OV!mF|h zUeC^&#RC2LBCh#zelhoS?!PPU16jm}A(N+*Z1DUO$w@gp#eKrxEqrn%CK%S`PABAq zVPi2O_v8_xb5A`)_S`AMCpfaqrwm^(Huxw05t^-{6aor?V~4W**#vbSzP@NOEm^l74TGoqo-4yS$o+H_03Qg`6(}eNZuoT5e!XQNqV6r zgmZ6lkznidL;~qg`NDr&n09$ zc#~%wSV5*>3)}nzAuIgl{?()pvz*EKW%$5LYskGAaDk5v$D1EKauvA;TY3&Kd+@$@ z_4C(|Fa|7yl?A*9P7G}!S7R-IPslj%#=Q3QR&p*}5QDEnd*-dXmjI`S`tXO410}`3 zA{#LCY(5b6dpdqaQds=o5rSRXv9S9A@(Vzf{0vEgkV`k5@mq2yruY+w`lCx4e^0)| zZib;s@ZS5k?QfIwFvS8w0^oHvk-GjJi2t9=689EQOcnH)(&+DIk`sc^rz#XL?@h#!WKHnrPFw5uj;VeNXeg7n6 zBNOR)Y#tL*Ln_9Sii0rga>ikd|14w#+}*u+>=X>k%EB4TXFmEq#wO)9GM}7L$Cc**ieQ}dD)O|8wL+QtaUUyI9v9~e8!&{VrT!qV4>^evr+L*c(h zO+6V3*VCBMrJIgnnY1mHq^6ZjM|Eqqp|!>JAeF9+V1u%t{Bf`_K!aMF^kg*FUJ*9B zj2M{!V+n!aC~F*$3b>;8s_)|%uJie3V@MVdA_Ez(F#y~^9}y#@QczGc-hdk|P3E!Z z4u6(^Cj*jKgH<8bN%;N4!w+jFE?<+Nw=jcbZ$p-4#|%?T>W;V1L(DHM0CC9wJ|CFz zFmzB#deI~|yx=094!08M2Bv3BAR}D1hq0)P za9QtUECie^N0!e+37IM@k^#zB2q**;0tx|zfI>hapb$_9C!o?-%#dE@^yroJIu=PJjUAY#kD>(BFp zwuDL9wCH-%>WS;I4v?iKnY`^>C=UfMwASpSDMzPPJ6#!CPGee9qfV!lipOX=n@*FR zI*nRp(l$U0htfE!J-`AnQ^K|`)}tVqGh?g}Ct;b6WmBy~H=~B`&>kb%$x^pl@wnAv zm>mHcHIf>ZR2`smnAFeK9bL1do!A_QHv|gqHez~=w)OGKG+T?u^*9a1jV=&x(dAZ5 
zj|bca6!!G=1fw0XU@EHFK|Pk5mxy-FEuu~;k+AG!x}X*9?M4(k9iVlF6}LK!s1|4K zkC@$tZJ7uZgshTn0(cOD)xqW<4eJTbPGSmHK*ZGTjy{$dlY|W$u&``wUORxWShpMi z!HhzuL@7;b6_AK zs~EaAof>8|ovu;ZY0-*IvmbI-865!--3e~Qng_0U+2)d-(jA(0xl{m10jwfg zSXNR}GPjxrv}6~prlHXC^5BBt0*L6cVn(~6$6&U%RFpNARt2o62}xG2SXmk@@x+%` zRrHosRrQva1&|3qF-t;`B4lqcvM{O3D%W=boj6vUv$_u3h@Blk9YZy3)^!>Q(om2# zq|Eg?n|e!wbQy4k)Oz4-(My9sW|XC-RF7+be6+aRaH3YqDNgWv3XKIOhLaIk9>qrR z46!~_>oTHFkV2U&JNw{bMO4>JtHTZ!(b9zrD}#YJFxhI_*t{IzfS$S}sP_W-p|T{| z3aSniV$YQ0VC%((83XDDeMy|NaVt*K8T_y=v9S@1To!=JZXBN_D?=eF-HST1BR-h3{g#-Al!6{&A%ft%4d@BGJ{w>3WYOLzWWf=LITzvb*W{tMV2jLY<`-Srdj2|(yGdmV9CPr zl7&@4GYWINFACeA08D9Uswjg`=z60UT2orSpc1&m8roG{7FgV{*uhBGRa21)N~PUe zJf%Bl)0p0Fn2;3+Bns0RHW|Pg1gQkNt$0*3fqBZhszG&W>Vf7Jy|A(w4lW6-j%wJO z#M=SO*3yMl^R+hCS_d{GP%^Ax*bh;otO(Er#=!O~l4CizWrD!~egfRt=!R=txnO72 z*i8#zD&VZ>)HU3%P`z7&y%3uoz(~u2^Gizg1#~H0-%(r&8QKSBXop<^j21e`P#!EV zuhi$e8SINX%d8*J!We34`O@7371|s;$WRLN7zzhu%UwbPnt~ltA0xDlk{E4hAoc=D ziV;v$gMBH|&VsT4?f?y-46Q~kD6kzcX}xU#hXj^gMoUYp%POj&u9e{yWFBn()&nH4 zV`Oj?;id{IVBE~sFM(0S`sCgSXA9G!LObHdr3|1!+Ee~WARi65@AfJQ+_%+AuwKb2 zror+Bn=MyE4(o87Pxx7)-<+XYsU*^Q!p6NNq5utN4Xs^O0SOP`>Km41?Ef)w{~VsB z{|w;^BP^-Fk8EG|Ov|cB1dStPnDnReEW`d5FY-nqn?NUhsR-qv>Q^D45Kssx1QY@a z0fm4<;M{!usD9ZU8K<5WwS!rmr)*J!s~Q3vw18%UuVyEFv>71w7xN$mE6Id8^2eFaVBR zpap2zd-wK{tKovc6AwO-v|e{rKe-13pbVr9J^$QW$x9dz6nGhT|C-#7Wn9iRG*dqP z@JW6qp%#88gp#dB-Y~fIO7op(GC#dFaD4 z&~qODRNA_KjV`Gyqd=hdhtYL4Ilgh@rnK$fe_~P1AM>)wbJ0m3uU&!5p^qdCK8%3O30VgUbfBceI!oZGG%W~6IXo5>)A~@13R*x4@-UE; z7K1Q2z(vM!%j{@%1;NO{W#z8C7!?yS5S9*?ObDM*Ziwla2ueznO)IWz3AdM8ui=%^ zx45dAHq=7$NMxn56|HR$iqifPJQ)D;>By=`Qz+8PP6jN1RJh!sz|ql@gvcWT?!gU` zhw*%2qC=h>*e%aLLOOJdFXO)YJ^m3GP6#PaVGKlI*HUi?5-B<*bmTawUxk1|Kp~(I zPzW542r$dYhs3d49~g7~6?^&o$CEx9KhO#^x0?fyHpV>bxv`623ktz}+whx>kM?nC z*3B75f29;lZY>7gA2sQgd;Irxv8%zVuv*;B9Fya}SQ$Py9o^%<*nBKi9tXy!iBTE$ zVzXu!ta|4t+6p7IMNenrQ;~3gc>lkf@Ue8`#*;4Fwd=wQrqA#dEX`Up+>R84R3p0a zKk!BmBpbCu_2Q%M6aoqXg@8gpA)pXY2z=WSz`bhTiP#IjPnuH7JzxIysPa6TGIY7S zl)jXqOC`u}8aSTvK5BUH*Y3Wem_18L<+Bg8y|dXP&$AQ$_b-RgT#M%?_q|lE-kFQ} zc~$T$UghK74|f6C+?>3{sJSk8chuW=7=s?~3YcOT>yWkVoO9#31NGYl7xFsOsb0S0 zntfcqEqfs`lD=%^K>c>RP{d2Uw)7+Gwdq2?{R!p;-L$LU1`AuJ&PIdaq1D;bFa_wQ zU3K<441lK%B%ZW78y+LrmJ72r!o|!Xep0JaUyKX5cV%84Ql8u$H<y^p7lk9T zc0kg=Lg1Xx;Tk)5>P#Cu$b+9Ty~EaZr`3cr*PX+dIUq5LTH$DF3{J?1qqKuf9R?>A zZ;u>g%5dROrVJU`zi~?jx$A~!-N5{xjA{!s6-FDd(i7Xx${D<_?9(`2Cyx0W+ej^-`Cy7W2Kl<`tzOf@N!%B z2|s3rU6`TxuE)Y5;UN^?MHz~34YSN1z2Y16?X2T?j{C=0#z>0q;-@ZRXNY-^)bEiZ zaHNWFIe3`_#1-FyG)N{TkxUx#4EBSMZvWkp_C@6%Jpx=rG2iE7Q;=x}4|P7`%=B~b z-HWBk)ODMA-7-!}8FB`_pH3QAi}S{qFWZpVD%M%rU@6krEwOQBkp|ciXmy z_PzO-m6cUi_TrZxemJ8BKBQX`7|RR-q!kCtZB97l$46ukfWzfZim&tf0~~9_1>Q>l z1s&Gg4?K0ICt|)RNg4#`@H*sS<^_(KsI=oy1_48)9d0sSFrr0(Wq#up0n+Xgy2qTf z2yh8cMkxQgh`?}?WqgaqP%gRI8!DFuxoUc2)ty2>A)pXY2q**;0>=Xa+{k#^w9`r+ z+B~mNI5`jqJn7ko_jT@`xnYJb5AQ_H@(?4N^M zownEZ{Pc{!{v!2XPyg|I1#54QPU&l_TO7S_(SZX`etyLpZ=Dt#x8SQAcT|70>EXBj zrRJvFPk6Sk@AHP8K6or!w||~EPIEk9srsN0PzWdl6aor?Z#e=?36pcj6^(`D7U;?p za`$KB>&RrdJemCV^L1g;1DB`r)1;tQA*YZ%5AD5yXe{lQyRLtRMBs8dF>;D_k+)cA z_UpTUPo86GzgkpqfZPw4r;-7tQN~^(UZ3+YjzI>toPF_YjRpS5NIh)&{$X-aMC4E`zKpT`D z!`Ptg+Aw~WutB*D^TJF0k-*u<1!iHPVQ!T2Y5(c7$vnIp*%x4ZkC=U_hMfN29mi{67D5z!C@q zho27E`g!C9ey4uF5d<=APz3JYam%C(4-t6An`F|9XPikUop=U3XX^(uzL7quLJuDS zrbU+q2`t;bF%NDZl@?v@j--#JARobdSDT*JY~z&2X!yYGkY*leEP(NvVYkA^qT$oL zS~6vibjp2%9Fy?9`|PWs@Sy&7_3}esdc71Rn~853qK@XQVsftJ34)n*z}1@RH?`rsgFpn)tT{PJz7m zEdf-H;szBvEXPaw*l~Hf=Onn!yoVA!_qY*O?l?ez&Hv`h(>*!=X8%%Owxo${B)E`x zXz^vsxsl7icX6$FeDP%?ZK4~)zT}Euy>Q>E<-Rx853kAiY(7ut%a#vOc@WA>N#qM4 zeA#mMy)5coDB@c(CM>5u>b literal 0 
HcmV?d00001 From 1e2c424615b46d633d1e42e034df4dc41292c6a2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 07:47:01 +0000 Subject: [PATCH 048/105] Cleaned up issue23 fixes, all tests pass --- pyfive/dataobjects.py | 5 ++- pyfive/misc_low_level.py | 41 ++++------------------ tests/test_issue23_nc_attrs.py | 63 ++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 37 deletions(-) create mode 100644 tests/test_issue23_nc_attrs.py diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index fdaa86d..6386b6f 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -156,7 +156,6 @@ def get_attributes(self): return attrs def _get_attributes_from_attr_info(self, attrs, attr_info): - EMPTY = 18446744073709551615 #assume we only have one of these if len(attr_info) > 1: raise NotImplementedError('Multiple Attribute Info Messages not supported') @@ -165,7 +164,7 @@ def _get_attributes_from_attr_info(self, attrs, attr_info): heap_address = data['fractal_heap_address'] # I can't find any documentation on this, but at least some # files seem to use this to indicate no attribute info. - if heap_address == EMPTY: + if heap_address == UNDEFINED_ADDRESS: return {} name_btree_address = data['name_btree_address'] order_btree_address = data['creation_order_btree_address'] @@ -177,7 +176,7 @@ def _get_attributes_from_attr_info(self, attrs, attr_info): btree = BTreeV2AttrNames(self.fh, name_btree_address) adict = dict() for record in btree.iter_records(): - data = heap.get_data_v2(record) + data = heap.get_data(record['heapid']) name, value = self._parse_attribute_msg(data,0) adict[name] = value return adict diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 6772084..e4c8786 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -249,42 +249,14 @@ def get_data(self, heapid): idtype = (firstbyte >> 4) & 3 # bit 4-5 version = firstbyte >> 6 # bit 6-7 data_offset = 1 - if idtype == 0: # managed - assert version == 0 - nbytes = self._managed_object_offset_size - offset = _unpack_integer(nbytes, heapid, data_offset) - data_offset += nbytes - - nbytes = self._managed_object_length_size - size = _unpack_integer(nbytes, heapid, data_offset) - - return self.managed[offset:offset+size] - elif idtype == 1: # tiny - raise NotImplementedError - elif idtype == 2: # huge - raise NotImplementedError - else: - raise NotImplementedError - - - def get_data_v2(self,record): - """ - Updated version utilises the full structure view of the - record, currently only used for attribute fractal heaps. - This is transition code. 
- """ - firstbyte = record['flags'] - reserved = firstbyte & 15 # bit 0-3 - idtype = (firstbyte >> 4) & 3 # bit 4-5 - version = firstbyte >> 6 # bit 6-7 match idtype: case 0: # managed - if version != 0: - raise ValueError('Unexpected version in fractal heap') - obytes = self._managed_object_offset_size - offset = _unpack_integer(obytes, record['heapid'],1) - sbytes = self._managed_object_length_size - size = _unpack_integer(sbytes, record['heapid'], obytes+1) + assert version == 0 + nbytes = self._managed_object_offset_size + offset = _unpack_integer(nbytes, heapid, data_offset) + data_offset += nbytes + nbytes = self._managed_object_length_size + size = _unpack_integer(nbytes, heapid, data_offset) return self.managed[offset:offset+size] case 1: # tiny raise NotImplementedError @@ -293,6 +265,7 @@ def get_data_v2(self,record): case _: raise NotImplementedError + def _min_size_integer(self, integer): """ Calculate the minimal required bytes to contain an integer. """ return self._min_size_nbits(integer.bit_length()) diff --git a/tests/test_issue23_nc_attrs.py b/tests/test_issue23_nc_attrs.py new file mode 100644 index 0000000..41749ce --- /dev/null +++ b/tests/test_issue23_nc_attrs.py @@ -0,0 +1,63 @@ +import pyfive +import h5py +import numpy as np + +## Test files provided as part of https://github.com/NCAS-CMS/pyfive/issues/23 + + +def _compare_var_attrs(p5file, h5file): + """ Compare attributes, which ought to bet he same thing except for the + dimension lists which have package dependent attributes """ + assert p5file.keys() == h5file.keys() + for k,v in p5file.items(): + vv = h5file[k] + if k == 'DIMENSION_LIST': + assert len(v) == len(vv) + else: + if isinstance(v, np.ndarray): + assert np.all(v == vv) + else: + assert v == vv + + + +def test_A_works(): + """ This file behaves.""" + with pyfive.File('tests/data/issue23_A.nc') as hfile: + + file_attrs = hfile.attrs + q_attrs = hfile['q'].attrs + + + with h5py.File('tests/data/issue23_A.nc') as hfile: + + file_attrs2 = dict(hfile.attrs) + q_attrs2 = dict(hfile['q'].attrs) + # note that unless we explicitly copy these to dicts, + # they cannot be referenced after the file is closed. + + assert file_attrs == file_attrs2 + _compare_var_attrs(q_attrs, q_attrs2) + + +def test_B_fails(): + """ This file fails """ + + with pyfive.File('tests/data/issue23_B.nc') as hfile: + + file_attrs = hfile.attrs + t_attrs = hfile['tas'].attrs + + with h5py.File('tests/data/issue23_B.nc') as hfile: + + file_attrs2 = dict(hfile.attrs) + t_attrs2 = dict(hfile['tas'].attrs) + # note that unless we explicitly copy these to dicts, + # they cannot be referenced after the file is closed. 
+ + assert file_attrs == file_attrs2 + _compare_var_attrs(t_attrs, t_attrs2) + + + + From 6da5fda5a39f3a9072c6f09c37fadef72b28e59f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 09:54:45 +0000 Subject: [PATCH 049/105] Test localisation, and a new test for laziness outside a context manager --- .gitignore | 4 +++ tests/test_issue23_nc_attrs.py | 14 +++++------ tests/test_lazy_variables.py | 46 ++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 7 deletions(-) create mode 100644 tests/test_lazy_variables.py diff --git a/.gitignore b/.gitignore index bcc99b3..2b42de6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ .coverage .pyc build +__pycache__ +*.egg-info +.idea +.DS_Store diff --git a/tests/test_issue23_nc_attrs.py b/tests/test_issue23_nc_attrs.py index 41749ce..2d1eb18 100644 --- a/tests/test_issue23_nc_attrs.py +++ b/tests/test_issue23_nc_attrs.py @@ -1,9 +1,11 @@ import pyfive import h5py -import numpy as np +import numpy as np +from pathlib import Path ## Test files provided as part of https://github.com/NCAS-CMS/pyfive/issues/23 +HERE = Path(__file__).parent def _compare_var_attrs(p5file, h5file): """ Compare attributes, which ought to bet he same thing except for the @@ -19,17 +21,15 @@ def _compare_var_attrs(p5file, h5file): else: assert v == vv - - def test_A_works(): """ This file behaves.""" - with pyfive.File('tests/data/issue23_A.nc') as hfile: + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: file_attrs = hfile.attrs q_attrs = hfile['q'].attrs - with h5py.File('tests/data/issue23_A.nc') as hfile: + with h5py.File(HERE/'data/issue23_A.nc') as hfile: file_attrs2 = dict(hfile.attrs) q_attrs2 = dict(hfile['q'].attrs) @@ -43,12 +43,12 @@ def test_A_works(): def test_B_fails(): """ This file fails """ - with pyfive.File('tests/data/issue23_B.nc') as hfile: + with pyfive.File(HERE/'data/issue23_B.nc') as hfile: file_attrs = hfile.attrs t_attrs = hfile['tas'].attrs - with h5py.File('tests/data/issue23_B.nc') as hfile: + with h5py.File(HERE/'data/issue23_B.nc') as hfile: file_attrs2 = dict(hfile.attrs) t_attrs2 = dict(hfile['tas'].attrs) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py new file mode 100644 index 0000000..0ca3feb --- /dev/null +++ b/tests/test_lazy_variables.py @@ -0,0 +1,46 @@ + +import pyfive +from pathlib import Path +import numpy as np + +HERE = Path(__file__).parent + +def test_attributes_outside_context(): + """ Pyfive should be able to access attributes outside the context manager. """ + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + + file_attrs = hfile.attrs + fdict = dict(file_attrs) + q_attrs = hfile['q'].attrs + qdict = dict(q_attrs) + + fdict1 = dict(file_attrs) + qdict1 = dict(q_attrs) + + assert fdict1 == fdict + assert qdict1 == qdict + +def test_file_data_oustside_context(): + """ Pyfive should be able to access data outside the context manager. + The data variable should be capable of reopening a closed file when + it needs access to the data. This mode should support thread + parallelism without the need for a lock. + """ + + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + + qdata = hfile['q'] + qdata1 = qdata[...] + qsum1 = np.sum(qdata1) + + qdata2 = qdata[...] 
+ qsum2 = sum(qdata2) + + assert qsum1 == qsum2 + + + + + + + From 59e8667807e3dd4e3e7c871bcad234cbad3709d9 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 10:09:15 +0000 Subject: [PATCH 050/105] Changes to support out of context variable access as described in #24 --- pyfive/h5d.py | 56 +++++++++++++++++++++++++++++++----- tests/test_lazy_variables.py | 2 +- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ccf91a1..bf02185 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,18 +23,27 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject): + def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. + + if pseudo_chunking_size_MB is set to a value greater than zero, and + if the storage is not local posix (and hence np.mmap is not available) then + when accessing contiguous variables, we attempt to find a suitable + chunk shape to approximate that volume and read the contigous variable + as if were chunked. This is to facilitate lazy loading of partial data + from contiguous storage. """ self._order = dataobject.order - self._fh = dataobject.fh + self.__fh = dataobject.fh + try: dataobject.fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False + self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? @@ -44,6 +53,7 @@ def __init__(self, dataobject): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True + self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -116,6 +126,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + + + match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -249,17 +262,31 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - We read the entire contiguous array, and pull out the selection (args) from that. - This is a fallback situation if we can't use a memory map which would otherwise be lazy. - This will normally be when we don't have a true Posix file. - # FIXME: We can probably make this lazy by using the indexer to work out which bytes - # are where ... + If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks + otherwise we have to read the entire array. This is a fallback situation if we + can't use a memory map which would otherwise be lazy. This will normally be when + we don't have a true Posix file. We should never end up here with compressed + data. 
""" + def __getstride(): + """ Determine an appropriate chunk and stride for a given pseudo chunk size """ + stride = 1 + chunk_shape = np.ones(self.rank, dtype=int) + for i in range(self.rank): + stride *= self.shape[i] + chunk_shape = box[:i] = self.shape[:i] + if stride*self.dtype.itemsize > self.pseudo_chunking_size: + stride //= self.shape[i] + chunk_shape = box[:i] = self.shape[:i-1] + return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize + + if self.pseudo_chunking_size: + stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) @@ -323,6 +350,21 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') return out + + @property + def _fh(self): + """ + When the parent file has been closed, we will need to reopen it + to continue to access data. This facility is provided to support + thread safe data access. However, now the file is open outside + a context manager, the user is responsible for closing it, + though it should get closed when the variable instance is + garbage collected. + """ + if self.__fh.closed: + self.__fh = open(self._filename, 'rb') + return self.__fh + class DatasetMeta: diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0ca3feb..0d22d73 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -34,7 +34,7 @@ def test_file_data_oustside_context(): qsum1 = np.sum(qdata1) qdata2 = qdata[...] - qsum2 = sum(qdata2) + qsum2 = np.sum(qdata2) assert qsum1 == qsum2 From 34a684ab57f6a3d6f7c7ffd782123c6929bee052 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 10:34:48 +0000 Subject: [PATCH 051/105] Removing the pseudo chunking stuff that snuck into the last commit --- pyfive/h5d.py | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..7fd7c11 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,17 +23,11 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject, pseudo_chunking_size_MB=4): + def __init__(self, dataobject): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. - if pseudo_chunking_size_MB is set to a value greater than zero, and - if the storage is not local posix (and hence np.mmap is not available) then - when accessing contiguous variables, we attempt to find a suitable - chunk shape to approximate that volume and read the contigous variable - as if were chunked. This is to facilitate lazy loading of partial data - from contiguous storage. """ self._order = dataobject.order @@ -42,8 +36,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): try: dataobject.fh.fileno() self._filename = dataobject.fh.name - self.avoid_mmap = False - self.pseudo_chunking_size = 0 + self.avoid_mmap = False except (AttributeError, OSError): try: # maybe this is an S3File instance? 
@@ -53,7 +46,6 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True - self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -126,9 +118,6 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ - - - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -262,31 +251,14 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks - otherwise we have to read the entire array. This is a fallback situation if we - can't use a memory map which would otherwise be lazy. This will normally be when - we don't have a true Posix file. We should never end up here with compressed - data. + This is a fallback situation if we can't use a memory map which would otherwise be lazy. + At the moment it is very unlazy. """ - def __getstride(): - """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - stride = 1 - chunk_shape = np.ones(self.rank, dtype=int) - for i in range(self.rank): - stride *= self.shape[i] - chunk_shape = box[:i] = self.shape[:i] - if stride*self.dtype.itemsize > self.pseudo_chunking_size: - stride //= self.shape[i] - chunk_shape = box[:i] = self.shape[:i-1] - return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize - - if self.pseudo_chunking_size: - stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) From 64827c4ebb0f0b8adce2136fbfbfa2f9fde14f69 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 12:45:00 +0000 Subject: [PATCH 052/105] catching up to the main trunk in h5netcdf --- tests/test_lazy_variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0ca3feb..0d22d73 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -34,7 +34,7 @@ def test_file_data_oustside_context(): qsum1 = np.sum(qdata1) qdata2 = qdata[...] - qsum2 = sum(qdata2) + qsum2 = np.sum(qdata2) assert qsum1 == qsum2 From 20693b9ea53697fca7135369ed50c904f6f4d3d1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 12:45:35 +0000 Subject: [PATCH 053/105] Starting to sketch out the pseudo chunking --- pyfive/h5d.py | 56 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ccf91a1..bf02185 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,18 +23,27 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject): + def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. 
+ + if pseudo_chunking_size_MB is set to a value greater than zero, and + if the storage is not local posix (and hence np.mmap is not available) then + when accessing contiguous variables, we attempt to find a suitable + chunk shape to approximate that volume and read the contigous variable + as if were chunked. This is to facilitate lazy loading of partial data + from contiguous storage. """ self._order = dataobject.order - self._fh = dataobject.fh + self.__fh = dataobject.fh + try: dataobject.fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False + self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? @@ -44,6 +53,7 @@ def __init__(self, dataobject): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True + self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -116,6 +126,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + + + match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -249,17 +262,31 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - We read the entire contiguous array, and pull out the selection (args) from that. - This is a fallback situation if we can't use a memory map which would otherwise be lazy. - This will normally be when we don't have a true Posix file. - # FIXME: We can probably make this lazy by using the indexer to work out which bytes - # are where ... + If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks + otherwise we have to read the entire array. This is a fallback situation if we + can't use a memory map which would otherwise be lazy. This will normally be when + we don't have a true Posix file. We should never end up here with compressed + data. """ + def __getstride(): + """ Determine an appropriate chunk and stride for a given pseudo chunk size """ + stride = 1 + chunk_shape = np.ones(self.rank, dtype=int) + for i in range(self.rank): + stride *= self.shape[i] + chunk_shape = box[:i] = self.shape[:i] + if stride*self.dtype.itemsize > self.pseudo_chunking_size: + stride //= self.shape[i] + chunk_shape = box[:i] = self.shape[:i-1] + return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize + + if self.pseudo_chunking_size: + stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) @@ -323,6 +350,21 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') return out + + @property + def _fh(self): + """ + When the parent file has been closed, we will need to reopen it + to continue to access data. This facility is provided to support + thread safe data access. However, now the file is open outside + a context manager, the user is responsible for closing it, + though it should get closed when the variable instance is + garbage collected. 
+ """ + if self.__fh.closed: + self.__fh = open(self._filename, 'rb') + return self.__fh + class DatasetMeta: From 4126e2b6410fe20ff3cda2e575eaa297602e26f5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 14:55:43 +0000 Subject: [PATCH 054/105] threadsafe data access --- pyfive/h5d.py | 58 ++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..717abb2 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -37,21 +37,21 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ self._order = dataobject.order - self.__fh = dataobject.fh + fh = dataobject.fh try: - dataobject.fh.fileno() + fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? - self._filename = getattr(self._fh,'path') + self._filename = getattr(fh,'path') except: # maybe a remote https file opened as bytes? # failing that, maybe a memory file, return as None - self._filename = getattr(self._fh,'full_name','None') + self._filename = getattr(fh,'full_name','None') self.avoid_mmap = True self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline @@ -128,7 +128,6 @@ def get_data(self, args): - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -142,6 +141,7 @@ def get_data(self, args): # this is lazily reading only the chunks we need return self._get_selection_via_chunks(args) + def iter_chunks(self, args): """ Iterate over chunks in a chunked dataset. @@ -239,9 +239,11 @@ def _get_contiguous_data(self, args): try: # return a memory-map to the stored array # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(self._fh, dtype=self.dtype, mode='c', + fh = self._fh + view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - result = view[args] + result = view[args].copy() + fh.close() return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) @@ -251,15 +253,19 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - + fh = self._fh ref_addresses = np.memmap( - self._fh, dtype=(' Date: Thu, 9 Jan 2025 14:17:18 +0000 Subject: [PATCH 055/105] add deps for mock s3 test --- setup.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 7d2df77..dbb98e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,7 @@ url = https://github.com/ncas-cms/pyfive author = Jonathan J. 
Helmus author_email = jjhelmus@gmail.com (with additions from Bryan Lawrence) maintainer = Valeriu Predoi -maintainer_email = V.Predoi@ncas.ac.uk +maintainer_email = valeriu.predoi@ncas.ac.uk license = BSD classifiers = Development Status :: 4 - Beta @@ -28,7 +28,9 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + moto numpy + s3fs [options.extras_require] testing = From c7058b6fdb16f583971d50fb8c40a5afedce7218 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 9 Jan 2025 14:17:38 +0000 Subject: [PATCH 056/105] add mock s3 test --- tests/test_mock_s3fs.py | 129 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 tests/test_mock_s3fs.py diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py new file mode 100644 index 0000000..931355a --- /dev/null +++ b/tests/test_mock_s3fs.py @@ -0,0 +1,129 @@ +import os +import s3fs +import pathlib +import json +import moto +import pyfive +import pytest + +import h5netcdf + +from moto.moto_server.threaded_moto_server import ThreadedMotoServer + + +# some spoofy server parameters +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port +test_bucket_name = "test" +versioned_bucket_name = "test-versioned" +secure_bucket_name = "test-secure" + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + +@pytest.fixture(scope="module") +def s3_base(): + # writable local S3 system + + # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests + ##### + # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + ##### + server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) + server.start() + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + os.environ.pop("AWS_PROFILE", None) + + print("server up") + yield + print("moto done") + server.stop() + + +@pytest.fixture() +def s3fs_s3(s3_base): + """ + Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. 
+ Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + """ + client = get_boto3_client() + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") + client.put_bucket_versioning( + Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} + ) + + # initialize secure bucket + client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") + policy = json.dumps( + { + "Version": "2012-10-17", + "Id": "PutObjPolicy", + "Statement": [ + { + "Sid": "DenyUnEncryptedObjectUploads", + "Effect": "Deny", + "Principal": "*", + "Action": "s3:PutObject", + "Resource": "arn:aws:s3:::{bucket_name}/*".format( + bucket_name=secure_bucket_name + ), + "Condition": { + "StringNotEquals": { + "s3:x-amz-server-side-encryption": "aws:kms" + } + }, + } + ], + } + ) + + client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) + s3fs.S3FileSystem.clear_instance_cache() + s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + + yield s3 + + +def test_s3file_with_s3fs(s3fs_s3): + """ + This test spoofs a complete s3fs FileSystem via s3fs_s3, + creates a mock bucket inside it, then puts a REAL netCDF4 file in it, + then it loads it as if it was an S3 file. This is proper + Wild Weasel stuff right here. + """ + # set up physical file and Path properties + ncfile = "./tests/data/issue23_A.nc" + file_path = pathlib.Path(ncfile) + file_name = pathlib.Path(ncfile).name + + # use mocked s3fs + bucket = "MY_BUCKET" + s3fs_s3.mkdir(bucket) + s3fs_s3.put(file_path, bucket) + s3 = s3fs.S3FileSystem( + anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} + ) + + # test load by h5netcdf + with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f: + print("File path", f.path) + ncfile = h5netcdf.File(f, 'r', invalid_netcdf=True) + print("File loaded from spoof S3 with h5netcdf:", ncfile) + print(ncfile["q"]) + assert "q" in ncfile + + # PyFive it + with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f: + pyfive_ds = pyfive.File(f) + print(f"Dataset loaded from mock S3 with s3fs and Pyfive: ds") + assert "q" in pyfive_ds From df81fafd8c0f93c0a1f2668da72f2afb80a332a3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Jan 2025 15:39:22 +0000 Subject: [PATCH 057/105] posix & s3 --- pyfive/h5d.py | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index a5089a7..b0f2693 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,10 +34,12 @@ def __init__(self, dataobject): fh = dataobject.fh try: + # See if 'fh' an underlying file descriptor fh.fileno() - self._filename = dataobject.fh.name - self.avoid_mmap = False except (AttributeError, OSError): + # No file descriptor => Not Posix + self.posix = False + self.__fh = fh try: # maybe this is an S3File instance? self._filename = getattr(fh,'path') @@ -45,7 +47,11 @@ def __init__(self, dataobject): # maybe a remote https file opened as bytes? 
# failing that, maybe a memory file, return as None self._filename = getattr(fh,'full_name','None') - self.avoid_mmap = True + else: + # Has a file descriptor => Posix + self.posix = True + self._filename = fh.name + self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -222,7 +228,7 @@ def _get_contiguous_data(self, args): return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if self.avoid_mmap: + if not self.posix: #self.avoid_mmap: return self._get_direct_from_contiguous(args) else: try: @@ -231,9 +237,9 @@ def _get_contiguous_data(self, args): fh = self._fh view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - result = view[args].copy() - fh.close() - return result + return view[args].copy() +# fh.close() +# return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -246,9 +252,9 @@ def _get_contiguous_data(self, args): ref_addresses = np.memmap( fh, dtype=(' Date: Thu, 9 Jan 2025 16:55:02 +0000 Subject: [PATCH 058/105] tidy --- pyfive/h5d.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index b0f2693..13c0c00 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -51,7 +51,7 @@ def __init__(self, dataobject): # Has a file descriptor => Posix self.posix = True self._filename = fh.name - + self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -228,12 +228,13 @@ def _get_contiguous_data(self, args): return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if not self.posix: #self.avoid_mmap: + if not self.posix: return self._get_direct_from_contiguous(args) else: try: # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! + # I think this would mean that we only move the + # sub-array corresponding to result! fh = self._fh view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) From ee0995b82267cfac89110ec0cd3cb92bb29361be Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Jan 2025 15:02:09 +0000 Subject: [PATCH 059/105] tidy up --- pyfive/h5d.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 13c0c00..a288dd4 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,7 +34,7 @@ def __init__(self, dataobject): fh = dataobject.fh try: - # See if 'fh' an underlying file descriptor + # See if 'fh' is an underlying file descriptor fh.fileno() except (AttributeError, OSError): # No file descriptor => Not Posix @@ -135,7 +135,6 @@ def get_data(self, args): else: # this is lazily reading only the chunks we need return self._get_selection_via_chunks(args) - def iter_chunks(self, args): """ @@ -232,15 +231,13 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - # return a memory-map to the stored array + # Return a memory-map to the stored array. # I think this would mean that we only move the # sub-array corresponding to result! 
fh = self._fh - view = np.memmap(fh, dtype=self.dtype, mode='c', + view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - return view[args].copy() -# fh.close() -# return result + return view[args] except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -254,8 +251,6 @@ def _get_contiguous_data(self, args): fh, dtype=(' Date: Fri, 10 Jan 2025 15:10:38 +0000 Subject: [PATCH 060/105] tidy up --- pyfive/h5d.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index a288dd4..a3bfbc0 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -231,13 +231,12 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - # Return a memory-map to the stored array. - # I think this would mean that we only move the - # sub-array corresponding to result! - fh = self._fh - view = np.memmap(fh, dtype=self.dtype, mode='c', + # return a memory-map to the stored array + # I think this would mean that we only move the sub-array corresponding to result! + view = np.memmap(self._fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - return view[args] + result = view[args] + return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -246,17 +245,13 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - fh = self._fh ref_addresses = np.memmap( - fh, dtype=(' Date: Mon, 13 Jan 2025 14:10:12 +0000 Subject: [PATCH 061/105] add test reports to gitignore for now --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2b42de6..1679dae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__ *.egg-info .idea .DS_Store +test-reports/ From 0c8ffc57a5626bf342d625b092f0fbfbef6e32a8 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:18:17 +0000 Subject: [PATCH 062/105] add conftest --- tests/conftest.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..cf55293 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,77 @@ +import os +import s3fs +import pathlib +import json +import moto +import pytest + +from moto.moto_server.threaded_moto_server import ThreadedMotoServer + + +@pytest.fixture(scope="module") +def s3_base(): + # writable local S3 system + + # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests + ##### + # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + ##### + server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) + server.start() + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + os.environ.pop("AWS_PROFILE", None) + + print("server up") + yield + print("moto done") + server.stop() + + +@pytest.fixture() +def s3fs_s3(s3_base): + """ + Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. 
+ Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + """ + client = get_boto3_client() + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") + client.put_bucket_versioning( + Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} + ) + + # initialize secure bucket + client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") + policy = json.dumps( + { + "Version": "2012-10-17", + "Id": "PutObjPolicy", + "Statement": [ + { + "Sid": "DenyUnEncryptedObjectUploads", + "Effect": "Deny", + "Principal": "*", + "Action": "s3:PutObject", + "Resource": "arn:aws:s3:::{bucket_name}/*".format( + bucket_name=secure_bucket_name + ), + "Condition": { + "StringNotEquals": { + "s3:x-amz-server-side-encryption": "aws:kms" + } + }, + } + ], + } + ) + + client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) + s3fs.S3FileSystem.clear_instance_cache() + s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + + yield s3 From 8cc2363093c9731a769d53755415a920a7b4c7bb Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:34:40 +0000 Subject: [PATCH 063/105] minimize conftest --- tests/conftest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index cf55293..b921310 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,22 @@ from moto.moto_server.threaded_moto_server import ThreadedMotoServer +# some spoofy server parameters +# test parameters; don't modify these +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port +test_bucket_name = "test" +versioned_bucket_name = "test-versioned" +secure_bucket_name = "test-secure" + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + + @pytest.fixture(scope="module") def s3_base(): # writable local S3 system From 3086211cd95260a7db858a8211fc6a442092fe7b Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:34:58 +0000 Subject: [PATCH 064/105] make use of conftest and add minimal test for mock s3 fs --- tests/test_mock_s3fs.py | 90 +++++------------------------------------ 1 file changed, 10 insertions(+), 80 deletions(-) diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py index 931355a..7beffb2 100644 --- a/tests/test_mock_s3fs.py +++ b/tests/test_mock_s3fs.py @@ -1,97 +1,27 @@ import os import s3fs import pathlib -import json -import moto import pyfive import pytest - import h5netcdf -from moto.moto_server.threaded_moto_server import ThreadedMotoServer - -# some spoofy server parameters +# needed by the spoofed s3 filesystem port = 5555 endpoint_uri = "http://127.0.0.1:%s/" % port -test_bucket_name = "test" -versioned_bucket_name = "test-versioned" -secure_bucket_name = "test-secure" - -def get_boto3_client(): - from botocore.session import Session - - # NB: we use the sync botocore client for setup - session = Session() - return session.create_client("s3", endpoint_url=endpoint_uri) - -@pytest.fixture(scope="module") -def s3_base(): - # writable local S3 system - - # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests - ##### - # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py - ##### - server = 
ThreadedMotoServer(ip_address="127.0.0.1", port=port) - server.start() - if "AWS_SECRET_ACCESS_KEY" not in os.environ: - os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" - if "AWS_ACCESS_KEY_ID" not in os.environ: - os.environ["AWS_ACCESS_KEY_ID"] = "foo" - os.environ.pop("AWS_PROFILE", None) - - print("server up") - yield - print("moto done") - server.stop() -@pytest.fixture() -def s3fs_s3(s3_base): - """ - Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. - Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py - """ - client = get_boto3_client() - client.create_bucket(Bucket=test_bucket_name, ACL="public-read") - - client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") - client.put_bucket_versioning( - Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} - ) - - # initialize secure bucket - client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") - policy = json.dumps( - { - "Version": "2012-10-17", - "Id": "PutObjPolicy", - "Statement": [ - { - "Sid": "DenyUnEncryptedObjectUploads", - "Effect": "Deny", - "Principal": "*", - "Action": "s3:PutObject", - "Resource": "arn:aws:s3:::{bucket_name}/*".format( - bucket_name=secure_bucket_name - ), - "Condition": { - "StringNotEquals": { - "s3:x-amz-server-side-encryption": "aws:kms" - } - }, - } - ], - } - ) +def test_s3fs_s3(s3fs_s3): + """Test mock S3 filesystem constructor.""" + # this is an entire mock S3 FS + mock_s3_filesystem = s3fs_s3 - client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) - s3fs.S3FileSystem.clear_instance_cache() - s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) - s3.invalidate_cache() + # explore its attributes and methods + print(dir(mock_s3_filesystem)) - yield s3 + assert not mock_s3_filesystem.anon + assert not mock_s3_filesystem.version_aware + assert mock_s3_filesystem.client_kwargs == {'endpoint_url': 'http://127.0.0.1:5555/'} def test_s3file_with_s3fs(s3fs_s3): From ed0f1172f8fabfeb7d40a758c1c0eabb75fe931e Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:39:16 +0000 Subject: [PATCH 065/105] upgrade actions versions --- .github/workflows/pytest.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 6081f94..dd0a799 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -5,7 +5,7 @@ name: Python package on: push: - branches: [ master ] + branches: [ master, mock_s3fs ] pull_request: branches: [ master ] @@ -16,12 +16,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies From 6843567b2b4b7be30eac1f31c46afd4e08397024 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:42:49 +0000 Subject: [PATCH 066/105] add flask dep --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index dbb98e9..c0311f9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,7 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + flask moto numpy s3fs From 88752d10bde0f1b92aa645c546aa9d64673d0dfd Mon Sep 17 00:00:00 2001 From: 
Valeriu Predoi Date: Mon, 13 Jan 2025 14:43:16 +0000 Subject: [PATCH 067/105] restrict to python 3.10 --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index dd0a799..5e99483 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 From ddeb0eabf400674105a3ad7e5cdc32153c98d0ed Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:45:03 +0000 Subject: [PATCH 068/105] add flask-cors --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index c0311f9..8c7e6eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,6 +29,7 @@ packages = pyfive python_requires >= 3.10 install_requires = flask + flask-cors moto numpy s3fs From 522bf7a0146f86f8249666a13eb69024ec8bddd6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:48:57 +0000 Subject: [PATCH 069/105] add h5 modules --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 8c7e6eb..8c2010f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,8 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + h5py + h5netcdf flask flask-cors moto From 22476e81604cb2f2ef08b88a3a985a8e83265d76 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:54:38 +0000 Subject: [PATCH 070/105] mark test as xfailed --- tests/test_enum_var.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_enum_var.py b/tests/test_enum_var.py index e5851be..e2cea47 100644 --- a/tests/test_enum_var.py +++ b/tests/test_enum_var.py @@ -8,6 +8,7 @@ DIRNAME = os.path.dirname(__file__) ENUMVAR_HDF5_FILE = os.path.join(DIRNAME, 'enum_variable.hdf5') +@pytest.mark.xfail(reason="FileNotFoundError: [Errno 2] No such file or directory: '/home/.../pyfive/pyfive/tests/enum_variable.hdf5") def test_read_enum_variable(): with pyfive.File(ENUMVAR_HDF5_FILE) as hfile: From f28c68d2b3f778207ccff5b882e7cc0002c39185 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 16:11:51 +0000 Subject: [PATCH 071/105] add dosctrings --- tests/conftest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index b921310..aa8b538 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,9 @@ def s3_base(): ##### server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) server.start() + # the user ID and secret key are needed when accessing a public bucket + # since our S3 FS and bucket are not actually on an AWS system, they can have + # bogus values if "AWS_SECRET_ACCESS_KEY" not in os.environ: os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" if "AWS_ACCESS_KEY_ID" not in os.environ: @@ -51,8 +54,20 @@ def s3fs_s3(s3_base): """ Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + + The S3 FS, being AWS-like but not actually physically deployed anywhere, still needs + all the usual user IDs, secret keys, endpoint URLs etc; the setup makes use of the ACL=public + configuration (public-read, or public-read-write). 
Public DOES NOT mean anon=True, but rather, + All Users group – https://docs.aws.amazon.com/AmazonS3/latest/userguide/acl-overview.html + Access permission to this group allows anyone with AWS credentials to access the resource. + The requests need be signed (authenticated) or not. + + Also, keys are encrypted using AWS-KMS + https://docs.aws.amazon.com/kms/latest/developerguide/overview.html """ client = get_boto3_client() + + # see not above about ACL=public-read client.create_bucket(Bucket=test_bucket_name, ACL="public-read") client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") From 2d312c1d96df2c52b3fd29330694a789e04c4808 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 08:46:45 +0000 Subject: [PATCH 072/105] Minor changes following V's S3 testing merge --- .coveragerc | 4 ---- setup.cfg | 8 +++----- tests/data/enum_variable.hdf5 | Bin 0 -> 6149 bytes tests/data/enums_from_netcdf.nc | Bin 0 -> 6149 bytes tests/test_enum_var.py | 3 +-- 5 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 .coveragerc create mode 100644 tests/data/enum_variable.hdf5 create mode 100644 tests/data/enums_from_netcdf.nc diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index c4b4617..0000000 --- a/.coveragerc +++ /dev/null @@ -1,4 +0,0 @@ -[report] -exclude_lines = - raise NotImplementedError - raise InvalidHDF5File diff --git a/setup.cfg b/setup.cfg index 8c2010f..c767029 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,11 +28,6 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = - h5py - h5netcdf - flask - flask-cors - moto numpy s3fs @@ -42,4 +37,7 @@ testing = h5py netCDF4 h5netcdf + flask + flask-cors + moto diff --git a/tests/data/enum_variable.hdf5 b/tests/data/enum_variable.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..3f640db22238917469bdab26660e7541ef2094d7 GIT binary patch literal 6149 zcmeHJ%}*0S6o0$ZmQ7s*wFW;ht{jy#fJHEd#Adtwh@~}@m}oS0p^Hq?Zn3*1-aHd8 z#*-J~&BU7r<6pqRKf$XZT)cSkUf-MD0VUvpn`U0RGqdyNef{S5Hr*;0io@wkY27eD z${?2Hg=l9gR(vk`KkMbfl5T#|C&V6Ou}Yd8p3^#LbR>0=MlC@jNy_v4oo4tWdV|qX zTI|H?jd&MQa)AbyX{{ync*_tDHr?Jx z=h2*UrBp=DU_cP|+^`?OYQNKOFf|(eCH^Le~R+TumD)R;%Wr&qQzU^wh(SUQf zMRgE1h03B+YgFpBX2Z@`9WWnJeGF3*2DSX6BhKHA&o8g@z`~zv`Ce!jidKtvJU`%F z-&*O1mfsBzRER%!SK40Ik}?aFkdFROgGmWBYhJk8TFVu9hqqu9Sr(OOiR3UQ%Hz_j zy$HkYzhO8>7@qK&l0#(7uHyg$<19r*s>B4Bwhapb$_9Cy)5Kssx M1pXBSG;P52U$tT%xBvhE literal 0 HcmV?d00001 diff --git a/tests/data/enums_from_netcdf.nc b/tests/data/enums_from_netcdf.nc new file mode 100644 index 0000000000000000000000000000000000000000..d714b1414f28a02251d68377b54bc6a491fb28b6 GIT binary patch literal 6149 zcmeHJy>HV%6o2+5sTEvWAgFv4VY&fT)RvSKp-M!K(?+BTYAObV6r9GPCnZjaor;aQ zVnIwS6&o`Wh`*o%e*!B)omrR(@11=uR4qd{&^^U>=X>|Qe)oG;wwH^g(e$OXZWthC z7)$a(v@;VcJ}vn_YRkn{-TbIei9N<*l{7g#r*+WiNa`YuT7pKBl;;n+jqrQ)2IJ$j z*p1bj@h+s~0u3(HT1)8hmLVK$x&7^U=lmp)In&^94)Q&h`DTgqE%xhd7q{!`F&HvBO1^Sbalx}a$p%1+lbElCR$ zIkOD;Jag@;ND{z;2?Hix&sQ%m1v3NFeb|5rI&jV zhC6@5Fi#ks@S2h%WXx{h07K&}MMbK_1edoY(Pqb^T@aR#!Q$|;WpQ?|*Kcv(4ZYyc z5|IiNkwnh4HlIXya0>FmO^J6p58ckdJ85~uIT=9CL$eay#29h`lRY3prCeVF>PX_q z1U}UJN4XHL;tvC@-(7h&QnQ!uO8{kS3ErdL><|?hjKK^=Ty;v$s#CR{hKP6@=|~Y< zZs-PHco2G@VPKGcpCCsRCEFd33E7=|^PVzFQ5pDski{q5=R9!YH`5^{QBqL8QMDiR zdz)TA Date: Wed, 15 Jan 2025 09:24:18 +0000 Subject: [PATCH 073/105] A framework for testing laziness. 
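In other words: listing the members of a pyfive.File should touch only group metadata and never hit the dataset read path. The test added here only asserts that iteration yields string keys; a fuller check could spy on the read path, roughly as in the sketch below (the assert_lazy_listing helper is hypothetical; only the DatasetID.get_data hook comes from pyfive/h5d.py):

    import pyfive
    from pyfive.h5d import DatasetID

    def assert_lazy_listing(path):
        """Fail if merely listing the variables triggers a data read."""
        reads = []
        original_get_data = DatasetID.get_data

        def spy(self, args):
            reads.append(args)
            return original_get_data(self, args)

        DatasetID.get_data = spy
        try:
            with pyfive.File(path) as hfile:
                names = list(hfile)
            assert all(isinstance(name, str) for name in names)
            assert not reads, "listing variables should not read any data"
        finally:
            DatasetID.get_data = original_get_data
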
--- tests/test_lazy_variables.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0d22d73..6ecfb72 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -5,6 +5,23 @@ HERE = Path(__file__).parent + +def test_file_laziness(): + """ Pyfive should not load the data until it is requested. """ + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + variables = [v for v in hfile] + # we do not want to have executed a _getitem__ on any of the variables yet + + # check each member of that list is a string + # it is, it's just the keys of the Mapping superclass of Group + assert all(isinstance(v, str) for v in variables) + + + + + + + def test_attributes_outside_context(): """ Pyfive should be able to access attributes outside the context manager. """ with pyfive.File(HERE/'data/issue23_A.nc') as hfile: From 7a131082c9ff19ad3e051a07dcbb0317a3dedf77 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 10:32:19 +0000 Subject: [PATCH 074/105] add test for threadsafe data access on posix and s3 --- tests/test_threadsafe_data_access.py | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 tests/test_threadsafe_data_access.py diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py new file mode 100644 index 0000000..9c08716 --- /dev/null +++ b/tests/test_threadsafe_data_access.py @@ -0,0 +1,115 @@ +import os +import s3fs +import pathlib +import pyfive +import pytest +import h5netcdf +import netCDF4 +import numpy as np +import dask.array as da + +# needed by the spoofed s3 filesystem +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port + + +def _get_s3_file(s3fs_s3, ncfile): + """Copy a POSIX file to S3.""" + # set up physical file and Path properties + file_path = pathlib.Path(ncfile) + file_name = pathlib.Path(ncfile).name + + # use mocked s3fs + bucket = "MY_BUCKET" + try: + s3fs_s3.mkdir(bucket) + except FileExistsError: + # Bucket was already created by another test + pass + + s3fs_s3.put(file_path, bucket) + s3 = s3fs.S3FileSystem( + anon=False, + version_aware=True, + client_kwargs={"endpoint_url": endpoint_uri} + ) + return s3.open(os.path.join("MY_BUCKET", file_name), "rb") + + +def test_threadsafe_datea_accesss(s3fs_s3): + """test that the data is correctly retrieved using concurrent threads""" + # Create a chunked POSIX file + chunked_file = "./threading_test_file_16_chunks.nc" + + n = netCDF4.Dataset(chunked_file, 'w') + chunksizes=(6, 32, 32) + z = n.createDimension('z', 12) + y = n.createDimension('y', 64) + x = n.createDimension('x', 128) + + v = n.createVariable('v', 'f8', + dimensions=['z', 'y', 'x'], + fill_value=False, + contiguous=False, + chunksizes=(6, 32, 32), + complevel=1) + v[...] = np.random.random(v.size).reshape(12, 64, 128) + n.close() + + posix = chunked_file + + # Get s3 access + s3 = _get_s3_file(s3fs_s3, chunked_file) + + n_iterations = 1000 + for ftype, filename in zip( + ('POSIX', 'S3'), + (posix, s3), + ): + print (f"\n{ftype} ----------------\n") + + # Get the file data array, on asingle thread + v = pyfive.File(chunked_file)['v'] + print (v) + print (f"Storage chunk size: {v.chunks}") + array0 = v[...].view(type=np.ndarray) + + # Loop round different Dask chunk patterns. These are designed + # to various coincide and not coincide with the HDF5 chunks in + # the file. 
+ for chunks in ( + v.shape, + v.chunks, + (12, 32, 32), + (11, 63, 127), + (6, 20, 20) + ): + dx = da.from_array(v, chunks=chunks) + print (f"\n{dx.npartitions} Dask chunks: {dx.chunks}") + + for i in range(n_iterations): + # Use Dask to get the array with one or more threads + try: + array = dx.compute() + except Exception as error: + print (f"Failed on iteration {i + 1}") + raise + else: + # Compare the array created with multiple threads + # with that created with one thread + if not (array == array0).all(): + print (f"Failed on iteration {i + 1}") + raise ValueError( + "At least one Dask chunk read at least one wrong " + "value from the file (likely from parts of " + "storage chunks that it should not have been " + "accessing, due to conflicting seeks on the same " + "open file handle). " + f"Storage chunks: {v.chunks}, " + f"Dask chunks: {dx.chunks}" + ) + else: + print (f"Completed {n_iterations} iterations") + + # Tidy up + os.remove(chunked_file) From 1b6d67065bd5b839580c62427eb0454c8720841c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 10:35:38 +0000 Subject: [PATCH 075/105] note on number of threadsafe test iterations --- tests/test_threadsafe_data_access.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py index 9c08716..4291a60 100644 --- a/tests/test_threadsafe_data_access.py +++ b/tests/test_threadsafe_data_access.py @@ -60,8 +60,12 @@ def test_threadsafe_datea_accesss(s3fs_s3): # Get s3 access s3 = _get_s3_file(s3fs_s3, chunked_file) - + + # Set number of iterations for checking thread safety. Need to be + # quite large to avoid cases where the code is not threadsafe, but + # still retrieves the correct data by good fortune. n_iterations = 1000 + for ftype, filename in zip( ('POSIX', 'S3'), (posix, s3), From 0ec45d736b3f6f338a9cc2190ae903d96281702a Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 12:00:50 +0000 Subject: [PATCH 076/105] Test framework for pseudochunking plus starting to migrate test data to data directory --- tests/data/dataset_multidim.hdf5 | Bin 0 -> 4464 bytes tests/test_dataset_multidim.py | 2 +- tests/test_pseudochunking.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/data/dataset_multidim.hdf5 create mode 100644 tests/test_pseudochunking.py diff --git a/tests/data/dataset_multidim.hdf5 b/tests/data/dataset_multidim.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..6349a86592ac0d3fee9ab4c250783bf31341e3f0 GIT binary patch literal 4464 zcmeI#*-{fh6oBFGNunc)g1fklJMMAc7c{}R0Rk%S!=mDjyO&(zosZ$8_!zv2{-^si zRwWuL7nG@Tn)>E+_e>@;XL>r#*Gj3owWXuQNQq;UH*KyOHb1pYwELdW4*OHBSdCK} zzem1ejrK_;8hpBn>k4toS8ctmQ z@TT2cJFYt2wOT(s%QIC=0O&tyqjDkZC}c zCQGpl%drA0u?kY=^8E5Ptw9IYVjb3F12!VTCTzwQY{fQg#}3F_zZ1K#8+))9`>-EH z9Kb;w!eJc2Q5=K3B`0tar*Il)a2Drq9$mPAiztDGmg&YNRB#zra23~Z9X;qpA8w!@ z0~o|j+`?@P;STQN9`54-9^w&F3}XbN7{g;c!BdRm8J^<>Ug8yA;|=8Pdy98?j}Q2W zPxy?iWEsdZkYymtKyzlGc3mgGIQ%B(t&5S{@_&# Date: Wed, 15 Jan 2025 12:02:48 +0000 Subject: [PATCH 077/105] Ok, we pass the pseudochunking test, but we don't actually do it yet. 
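For reference, the point of pseudo-chunking: when a contiguous variable sits behind a non-POSIX file object, read it in pieces whose byte size stays under pseudo_chunking_size, so that a slice does not force reading the whole array. A standalone sketch of one such shape calculation (pseudo_chunk_shape is a hypothetical name; the 4 MB default mirrors pseudo_chunking_size_MB=4, and the dimension-halving scheme adopted in the next commit works along the same lines):

    import numpy as np

    def pseudo_chunk_shape(shape, itemsize, budget_bytes=4 * 1024 * 1024):
        """Shrink dimensions until one pseudo-chunk fits within budget_bytes."""
        chunk = list(shape)
        while np.prod(chunk) * itemsize > budget_bytes and max(chunk, default=1) > 1:
            for i, n in enumerate(chunk):
                if n > 1:
                    chunk[i] //= 2  # halve the first dimension still larger than one
                    break
        return tuple(chunk)

    # e.g. float64 data of shape (12, 64, 128) is ~0.75 MB and stays whole, while
    # (1000, 1000, 100) float64 (~800 MB) shrinks to (3, 1000, 100) (~2.4 MB).
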
--- pyfive/h5d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..594b15d 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -260,7 +260,7 @@ def _get_contiguous_data(self, args): raise NotImplementedError('datatype not implemented - {dtype_class}') - def _get_direct_from_contiguous(self, args=None): + def _get_direct_from_contiguous(self, args=None, kwargs={}): """ If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks otherwise we have to read the entire array. This is a fallback situation if we @@ -274,10 +274,10 @@ def __getstride(): chunk_shape = np.ones(self.rank, dtype=int) for i in range(self.rank): stride *= self.shape[i] - chunk_shape = box[:i] = self.shape[:i] + chunk_shape = self.shape[:i] if stride*self.dtype.itemsize > self.pseudo_chunking_size: stride //= self.shape[i] - chunk_shape = box[:i] = self.shape[:i-1] + chunk_shape = self.shape[:i-1] return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize From f4507763cf1afe066de11d0993b8a61d6b99d434 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 16:27:46 +0000 Subject: [PATCH 078/105] Pseudo chunking in, with test support (and a missing make data file that I didn't move properly) --- pyfive/h5d.py | 113 ++++++++++++++++------- tests/data/make_dataset_multidim_file.py | 17 ++++ tests/test_pseudochunking.py | 13 ++- 3 files changed, 103 insertions(+), 40 deletions(-) create mode 100644 tests/data/make_dataset_multidim_file.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 594b15d..0a07d74 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,6 +34,10 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): chunk shape to approximate that volume and read the contigous variable as if were chunked. This is to facilitate lazy loading of partial data from contiguous storage. + + (Currently the only way to change this value is by explicitly using + the set_pseudo_chunk_size method. Most users will not need to change + it.) """ self._order = dataobject.order @@ -42,7 +46,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): try: dataobject.fh.fileno() self._filename = dataobject.fh.name - self.avoid_mmap = False + self.posix = True self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: @@ -52,7 +56,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): # maybe a remote https file opened as bytes? 
# failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') - self.avoid_mmap = True + self.posix = False self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape @@ -126,14 +130,15 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ - - - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") case 1: # contiguous storage - return self._get_contiguous_data(args) + if self.data_offset == UNDEFINED_ADDRESS: + # no storage is backing array, return all zeros + return np.zeros(self.shape, dtype=self.dtype)[args] + else: + return self._get_contiguous_data(args) case 2: # chunked storage if isinstance(self.dtype, tuple): # references need to read all the chunks for now @@ -179,6 +184,17 @@ def index(self): raise ValueError('No chunk index available for HDF layout class {self.layout}') else: return self._index + #### The following method can be used to set pseudo chunking size after the + #### file has been closed and before data transactions. This is pyfive specific + def set_psuedo_chunk_size(self, newsize_MB): + """ Set pseudo chunking size for contiguous variables """ + if self.layout_class == 1: + if not self.posix: + self.pseudo_chunking_size = newsize_MB*1024*1024 + else: + pass # silently ignore it, we'll be using a np.memmap + else: + raise ValueError('Attempt to set pseudo chunking on non-contigous variable') ###### # The following DatasetID methods are used by PyFive and you wouldn't expect @@ -227,13 +243,9 @@ def _build_index(self, dataobject): self._index[key] = StoreInfo(key, filter_mask, addr, size) def _get_contiguous_data(self, args): - - if self.data_offset == UNDEFINED_ADDRESS: - # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if self.avoid_mmap: + if not self.posix: return self._get_direct_from_contiguous(args) else: try: @@ -268,32 +280,65 @@ def _get_direct_from_contiguous(self, args=None, kwargs={}): we don't have a true Posix file. We should never end up here with compressed data. 
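        If pseudo_chunking_size is zero, the whole contiguous block is read in a single pass and then subset in memory.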
""" - def __getstride(): + def __get_pseudo_shape(): """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - stride = 1 - chunk_shape = np.ones(self.rank, dtype=int) - for i in range(self.rank): - stride *= self.shape[i] - chunk_shape = self.shape[:i] - if stride*self.dtype.itemsize > self.pseudo_chunking_size: - stride //= self.shape[i] - chunk_shape = self.shape[:i-1] - return chunk_shape, stride - - itemsize = np.dtype(self.dtype).itemsize - # need to impose type in case self.shape is () in which case numpy would return a float - num_elements = np.prod(self.shape, dtype=int) - num_bytes = num_elements*itemsize + element_size = self.dtype.itemsize + chunk_shape = np.copy(self.shape) + while True: + chunk_size = np.prod(chunk_shape) * element_size + if chunk_size < self.pseudo_chunking_size: + break + for i in range(len(chunk_shape)): + if chunk_shape[i] > 1: + chunk_shape[i] //= 2 + break + + return chunk_shape, chunk_size + class LocalOffset: + def __init__(self, shape, chunk_shape, stride): + chunks_per_dim = [int(np.ceil(a / c)) for a, c in zip(shape, chunk_shape)] + self.chunk_strides = np.cumprod([1] + chunks_per_dim[::-1])[:-1][::-1] + self.stride = stride + def coord_to_offset(self,chunk_coords): + linear_offset = sum(idx * stride for idx, stride in zip(chunk_coords, self.chunk_strides)) + return linear_offset*self.stride + + if self.pseudo_chunking_size: - stride = __getstride() + chunk_shape, stride = __get_pseudo_shape() + offset_finder = LocalOffset(self.shape,chunk_shape,stride) + array = ZarrArrayStub(self.shape, chunk_shape) + indexer = OrthogonalIndexer(args, array) + out_shape = indexer.shape + out = np.empty(out_shape, dtype=self.dtype, order=self._order) + chunk_size = np.prod(chunk_shape) + + for chunk_coords, chunk_selection, out_selection in indexer: + index = self.data_offset + offset_finder.coord_to_offset(chunk_coords) + self._fh.seek(index) + chunk_buffer = self._fh.read(stride) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + if len(chunk_data) < chunk_size: + # last chunk over end of file + padded_chunk_data = np.zeros(chunk_size, dtype=self.dtype) + padded_chunk_data[:len(chunk_data)] = chunk_data + chunk_data = padded_chunk_data + out[out_selection] = chunk_data.reshape(chunk_shape, order=self._order)[chunk_selection] + + return out - # we need it all, let's get it all (i.e. this really does read the lot) - self._fh.seek(self.data_offset) - chunk_buffer = self._fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() - chunk_data = chunk_data.reshape(self.shape, order=self._order) - return chunk_data[args] + else: + itemsize = np.dtype(self.dtype).itemsize + num_elements = np.prod(self.shape, dtype=int) + num_bytes = num_elements*itemsize + + # we need it all, let's get it all (i.e. this really does read the lot) + self._fh.seek(self.data_offset) + chunk_buffer = self._fh.read(num_bytes) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = chunk_data.reshape(self.shape, order=self._order) + return chunk_data[args] def _get_raw_chunk(self, storeinfo): @@ -351,6 +396,8 @@ def _get_selection_via_chunks(self, args): return out + + @property def _fh(self): """ diff --git a/tests/data/make_dataset_multidim_file.py b/tests/data/make_dataset_multidim_file.py new file mode 100644 index 0000000..7d885c4 --- /dev/null +++ b/tests/data/make_dataset_multidim_file.py @@ -0,0 +1,17 @@ +#! 
/usr/bin/env python +""" Create a HDF5 file with datasets of many different dimensions . """ +import h5py +import numpy as np + +f = h5py.File('dataset_multidim.hdf5', 'w') + +# signed intergers +common_args = { + 'dtype': ' Date: Wed, 15 Jan 2025 17:08:02 +0000 Subject: [PATCH 079/105] Tidy up dependencies for testing --- setup.cfg | 1 + tests/dataset_multidim.hdf5 | Bin 4464 -> 0 bytes tests/make_dataset_multidim_file.py | 17 ----------------- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 tests/dataset_multidim.hdf5 delete mode 100644 tests/make_dataset_multidim_file.py diff --git a/setup.cfg b/setup.cfg index c767029..d3fb584 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,4 +40,5 @@ testing = flask flask-cors moto + dask diff --git a/tests/dataset_multidim.hdf5 b/tests/dataset_multidim.hdf5 deleted file mode 100644 index 6349a86592ac0d3fee9ab4c250783bf31341e3f0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4464 zcmeI#*-{fh6oBFGNunc)g1fklJMMAc7c{}R0Rk%S!=mDjyO&(zosZ$8_!zv2{-^si zRwWuL7nG@Tn)>E+_e>@;XL>r#*Gj3owWXuQNQq;UH*KyOHb1pYwELdW4*OHBSdCK} zzem1ejrK_;8hpBn>k4toS8ctmQ z@TT2cJFYt2wOT(s%QIC=0O&tyqjDkZC}c zCQGpl%drA0u?kY=^8E5Ptw9IYVjb3F12!VTCTzwQY{fQg#}3F_zZ1K#8+))9`>-EH z9Kb;w!eJc2Q5=K3B`0tar*Il)a2Drq9$mPAiztDGmg&YNRB#zra23~Z9X;qpA8w!@ z0~o|j+`?@P;STQN9`54-9^w&F3}XbN7{g;c!BdRm8J^<>Ug8yA;|=8Pdy98?j}Q2W zPxy?iWEsdZkYymtKyzlGc3mgGIQ%B(t&5S{@_&# Date: Wed, 15 Jan 2025 17:36:50 +0000 Subject: [PATCH 080/105] Minor changes which come from upstream advice on my two pull requests https://github.com/jjhelmus/pyfive/pull/67 and https://github.com/jjhelmus/pyfive/pull/66 --- pyfive/dataobjects.py | 9 +++++---- pyfive/h5d.py | 21 +++++++++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 6386b6f..9e7535a 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -368,9 +368,9 @@ def compression_opts(self): if GZIP_DEFLATE_FILTER in self._filter_ids: gzip_entry = [d for d in self.filter_pipeline if d['filter_id'] == GZIP_DEFLATE_FILTER][0] - #raise ValueError(f'gzip_entry {gzip_entry.keys()}') - key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] - return gzip_entry[key][0] + return gzip_entry['client_data'][0] + #key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] + #return gzip_entry[key][0] return None @property @@ -458,7 +458,8 @@ def filter_pipeline(self): filter_info['name'] = name client_values = struct.unpack_from("<{:d}i".format(num_client_values), self.msg_data, offset) offset += (4 * num_client_values) - filter_info['client_data_values'] = client_values + filter_info['client_data'] = client_values + filter_info['client_data_values'] = num_client_values filters.append(filter_info) else: diff --git a/pyfive/h5d.py b/pyfive/h5d.py index c2b8ea3..990b9b2 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -109,14 +109,20 @@ def get_chunk_info(self, index): Retrieve storage information about a chunk specified by its index. Our index is in chunk space, but H5Py wants it in coordinate space. """ - return self._index[self._nthindex[index]] + if not self._index: + return None + else: + return self._index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): """ Retrieve information about a chunk specified by the array address of the chunk’s first element in each dimension. 
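        Returns None if no chunk index is available (contiguous storage, an empty dataset, or unallocated chunks).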
""" - return self._index[coordinate_index] + if not self._index: + return None + else: + return self._index[coordinate_index] def get_num_chunks(self): """ @@ -129,6 +135,8 @@ def read_direct_chunk(self, chunk_position, **kwargs): Returns a tuple containing the filter_mask and the raw data storing this chunk as bytes. Additional arugments supported by H5Py are not supported here. """ + if not self.index: + return None if chunk_position not in self._index: raise OSError("Chunk coordinates must lie on chunk boundaries") storeinfo = self._index[chunk_position] @@ -146,6 +154,8 @@ def get_data(self, args): else: return self._get_contiguous_data(args) case 2: # chunked storage + if not self._index: + return np.zeros(self.shape, dtype=self.dtype)[args] if isinstance(self.dtype, tuple): # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] @@ -193,7 +203,10 @@ def index(self): #### The following method can be used to set pseudo chunking size after the #### file has been closed and before data transactions. This is pyfive specific def set_psuedo_chunk_size(self, newsize_MB): - """ Set pseudo chunking size for contiguous variables """ + """ Set pseudo chunking size for contiguous variables. The default + value is 4 MB which should be suitable for most applications. For + arrays smaller than this value, no pseudo chunking is used. + Larger arrays will be accessed in in roughly newsize_MB reads. """ if self.layout_class == 1: if not self.posix: self.pseudo_chunking_size = newsize_MB*1024*1024 @@ -221,7 +234,7 @@ def _build_index(self, dataobject): return # look out for an empty dataset, which will have no btree - if np.prod(self.shape) == 0: + if np.prod(self.shape) == 0 or dataobject._chunk_address == UNDEFINED_ADDRESS: self._index = {} return From 49aa7949679a0d4335572b54387be992945bd370 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 17:54:26 +0000 Subject: [PATCH 081/105] Suppress reference list warning. It's useless --- pyfive/dataobjects.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 9e7535a..ba24a7b 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -219,9 +219,12 @@ def _parse_attribute_msg(self, buffer, offset): try: dtype = DatatypeMessage(buffer, offset).dtype except NotImplementedError: - warnings.warn( - f"Attribute {name} type not implemented, set to None." - ) + if name == 'REFERENCE_LIST': + pass #suppress this one, no one actually cares about these as far as I know + else: + warnings.warn( + f"Attribute {name} type not implemented, set to None." 
+ ) return name, None offset += _padded_size(attr_dict['datatype_size'], padding_multiple) From dba2683a3c03249ed8a73d32430ccb3807ada8b7 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 16 Jan 2025 16:10:28 +0000 Subject: [PATCH 082/105] Using context manager for threadsafe test --- tests/test_threadsafe_data_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py index 4291a60..fa4a94d 100644 --- a/tests/test_threadsafe_data_access.py +++ b/tests/test_threadsafe_data_access.py @@ -73,7 +73,8 @@ def test_threadsafe_datea_accesss(s3fs_s3): print (f"\n{ftype} ----------------\n") # Get the file data array, on asingle thread - v = pyfive.File(chunked_file)['v'] + with pyfive.File(chunked_file) as hf: + v = hf['v'] print (v) print (f"Storage chunk size: {v.chunks}") array0 = v[...].view(type=np.ndarray) From 2ba27dc14b16c015a085eedf637ed6b4bfa6da25 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Jan 2025 13:20:59 +0000 Subject: [PATCH 083/105] no returned memory maps --- pyfive/h5d.py | 20 ++++++++++++++++---- tests/data/issue23_A_contiguous.nc | Bin 0 -> 10943 bytes tests/test_lazy_variables.py | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 tests/data/issue23_A_contiguous.nc diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 990b9b2..d031b76 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -265,14 +265,26 @@ def _get_contiguous_data(self, args): if not isinstance(self.dtype, tuple): if not self.posix: + # Not posix return self._get_direct_from_contiguous(args) else: + # posix try: - # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(self._fh, dtype=self.dtype, mode='c', - offset=self.data_offset, shape=self.shape, order=self._order) + # Create a memory-map to the stored array, which + # means that we will end up only copying the + # sub-array into in memory. 
+ view = np.memmap( + self._fh, + dtype=self.dtype, + mode='c', + offset=self.data_offset, + shape=self.shape, + order=self._order + ) + # Create the sub-array result = view[args] + # Copy the data from disk to physical memory + result = result.view(type=np.ndarray) return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) diff --git a/tests/data/issue23_A_contiguous.nc b/tests/data/issue23_A_contiguous.nc new file mode 100644 index 0000000000000000000000000000000000000000..18cf68c1ef960e4f0e1299de57626f06f9c432b7 GIT binary patch literal 10943 zcmeHNeQZ=!7QZvEoepIvh5QOtT3W@A5Qkyr!Hk){sWY!# zSsHEDD1NOg5Cx-xU<@%4L4wBk0gWGtx@L*`hZ-YmqQ>utQ5IxX_MCg}dDBiIFowGS zybCku-TTft@0{N`_q<8Zy=!4hWK89hO0VB9cpOz@??tpyr*8Nfjx@C`Z0Ya@c6+Pk znl?(^Vv%B6=@lMinK)IH0nxmon1knmnaHP!fbb)0C~NhamYvR8d2u@2sDO_bKA}iW zeQkZixN&eTK0=)^!bAkE8X8TdLcByd7|;G^$0%w3m}wr*1YGfYJf3pAfWcIofEJ&o z8(;^HplBQ|8jpg`@+!QeT~q*S(`EF;7nSg0DN*ot)bYJ-MpjKs0Dr-hSk$x=$rX*E zY&MrnTXEaWi*fKA@=uqBN*qCmDJlre!nr1V3bfYd&Q1vz!h>$NEV?or?OfCrjdg~a zTf-tS4c9gBR-znjzA`NN@j=L+iQDk~-NYl~hddtEXgpNC>t(`F48l+WEHxmX!f^UI zjsuhZV)CtUr(EWPR!O)`^^w+fM4T)sKR#~YFyeO74%35Hs+35n2O(|8^BgDDWU40k2Y$gUIWzF8-x0wD1bw6kPO_abPNzXM3zwYu#)3okrntk@4}A6pJF+o$q`_&#g?Y`qNJQVQHyF%CCt zVNbO#>by#v3Cy_NCc_6y0~Qrl%v7d^svBtb;5ph>Qa%E|cCy?@a8v2k`+Ma4yrz2^j-Tte0sf#N8~qgfC7#t$xF}K}G|rEY`d# zDq*o`s6ChMHgk3wiYC4)YT<*x_nJA}JWUO=VLw52ZECE&q;^hC((E-e*>2cc_Uu%0 z#oQ)*pVX>1%BJaLc#w$tUOj)?Ax67GoULAqTh(2BbCrQXswVYVX`2ig0}NU;$5aRU*-@fD@0`O z2bhi}jLSl8tuh(SGqWpi`RL>@H!VgsH$=luXFj~b=I$TsPkOb!<(A< z*Xcv`7O(RU5?*ILl_WMy|LCH1)*mQZXFV8G9IT)1=l4VL`dDIj_*uVN7I(5BFSyi2 zz(v4Cz(v4Cz(v4Cz(v4Cz(v4Cz(v4CV5AV}Uot?dqeXOqnmaT|d0y1JEF*rV{VQnBRL{_y=_lQaU(ua;nltV1Rq_`r zzILMeQpJ}b+MXtQz#_UZNA!#yqQVHS`s)08|5JU&Bd_T@bEz%Hky~zj$zGo_X#HN} z(to%8eS@{mxF_?)f1mtetGd6@IJ0a?Rn4wPjq8(=L-B1jD&K#+yQy-UaqT6$-{{@5 zL*3tIeEP)0@Pu7T{$``~&v!L1`+TQy;nIJ+a&YEL#^Fxn7*KM175^*7_}d2F+7f=w zSo>X{G53ZSj5mAtY@YI`?Zyj>0^V6`?r`i#NrtD5S!*VJ_^(5c8{aN|e%kHdJfrS! zSN82v^l!@FwT?aH=T_sRpCtfFe#Yl>yU^%z+LOYM_vtVG`Z4=ok?Nt&-F^Yh zW#<7d@7p~8a$LFRyd&Z?EJw6O56G3?qL(cr z5Ztu?fp%^>K~T;m!(|s0E`5`5?kVH`iw5l??nMyid*dg4U64QfJX$?VL0(de5d9Eh z#``~phqayOiTk(zeAtPZL>b%894itn^h8`6&Y-B zmDF}d7$U3tmKAbp|8z2#kG-7ma7oqzfiG+K{e>(+h1H~jzYiU~lUnXjlCY;bawUmd z{SuQwmhk*B`^YuuDyQCIm3l#CY5-z)%&sp2i&$~j*No@w!0Zz=eX(tcLXambPqyo; zQVLTEx}-Wt!}udQ@wtPS&T)3UEck>Q{PZiSC8QubZTID+a3KtX&VmR2vsa$dZMxB!1g_b9kF&M@h=!6=EI{Q zP^%$Ia+KjS=ABQIdw<$i=YZ*yPp5Dwk>dI913Rd|4yq()uq#IT)D8kmC2yZXD9Zbc zK|x1vUB9s#!Q_`49}3W5TX5stCw~@r$~a*>H}Q{&x27MgM(F@V4@2#Te-PE~T=qsY zVGlVmku&pVJeNoX8Zqe3S(4#daT-{K#^<}uM0!O!5li*J5wWzrDlnCf*bF=LfqtPx zJ(__9LLF#K3_99(I@}+eZ(JV6ki`izlZkbib}E~sgCux(0d>Z6W_-ThWyY-lyc_UP y$f-G!gl7Q#YghG8#ZLsFLXYf^My}9cj2s+ip7%}JJMDiGjG$Nm literal 0 HcmV?d00001 diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 6ecfb72..ef9d157 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -56,6 +56,26 @@ def test_file_data_oustside_context(): assert qsum1 == qsum2 +def test_numpy_array_type(): + """Pyfive slices should always return a np.ndarray, not a np.memmap. + + """ + # Get data from contiguous file + with pyfive.File(HERE/'data/issue23_A_contiguous.nc') as hfile: + qdata = hfile['q'] + qdata1 = qdata[...] + assert isinstance(qdata1, np.ndarray) + assert not isinstance(qdata1, np.memmap) + + # Get data from chunked file + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + qdata = hfile['q'] + qdata2 = qdata[...] 
+ assert isinstance(qdata2, np.ndarray) + assert not isinstance(qdata2, np.memmap) + + # Check that the data are equal in both cases + assert (qdata1 == qdata2).all() From e6b518b805635763d12abfcf2168945ea0fa2b30 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 13:40:41 +0000 Subject: [PATCH 084/105] Test for #16 and #29 --- pyfive/h5d.py | 2 +- tests/test_vlen_str.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tests/test_vlen_str.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 990b9b2..bedd6aa 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -287,7 +287,7 @@ def _get_contiguous_data(self, args): shape=self.shape, order=self._order) return np.array([Reference(addr) for addr in ref_addresses])[args] else: - raise NotImplementedError('datatype not implemented - {dtype_class}') + raise NotImplementedError(f'datatype not implemented - {dtype_class}') def _get_direct_from_contiguous(self, args=None): diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py new file mode 100644 index 0000000..5f00dc4 --- /dev/null +++ b/tests/test_vlen_str.py @@ -0,0 +1,22 @@ +import h5py +import pyfive +import io + +def make_file(file_like, _vlen_string): + with h5py.File(file_like,'w') as hfile: + + dt = h5py.special_dtype(vlen=str) + v = hfile.create_dataset("var_len_str", (1,), dtype=dt) + v[0] = _vlen_string + + +def test_vlen_string(): + + tfile = io.BytesIO() + _vlen_string = "foo" + make_file(tfile, _vlen_string) + with pyfive.File(tfile) as hfile: + print(hfile) + ds1 = hfile['var_len_str'] + assert _vlen_string == ds1[0] + \ No newline at end of file From b1ae323c76e3f37bbe842f58151dad9bf9a5ecc2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:44:16 +0000 Subject: [PATCH 085/105] More versions of the #29 tests --- tests/test_vlen_str.py | 58 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py index 5f00dc4..c0c36e0 100644 --- a/tests/test_vlen_str.py +++ b/tests/test_vlen_str.py @@ -1,22 +1,68 @@ import h5py import pyfive +import netCDF4 as nc import io +import numpy as np -def make_file(file_like, _vlen_string): +def make_file_hdf5(file_like, _vlen_string): with h5py.File(file_like,'w') as hfile: dt = h5py.special_dtype(vlen=str) v = hfile.create_dataset("var_len_str", (1,), dtype=dt) v[0] = _vlen_string - -def test_vlen_string(): +def make_file_nc(file_like,m_array): + + n = nc.Dataset(file_like, "w", format="NETCDF4") + n.createDimension("time", 4) + months = n.createVariable("months", str, ("time",)) + months[:] = np.array(m_array, dtype="S8") + n.close() + +def test_vlen_string_hdf5(): tfile = io.BytesIO() _vlen_string = "foo" - make_file(tfile, _vlen_string) + make_file_hdf5(tfile, _vlen_string) with pyfive.File(tfile) as hfile: - print(hfile) + ds1 = hfile['var_len_str'] assert _vlen_string == ds1[0] - \ No newline at end of file + +def test_vlen_string_nc1(): + """ this verson currently fails because netcdf4 is doing something odd in memory """ + + t1file = io.BytesIO() + m_array = ["January", "February", "March", "April"] + make_file_nc(t1file,m_array) + + with nc.Dataset(t1file,'r') as ncfile: + ds1 = ncfile['months'] + assert np.array_equal(m_array, ds1) + + + with h5py.File(t1file) as pfile: + ds1 = pfile['months'] + assert np.array_equal(m_array, ds1) + + + with pyfive.File(t1file) as hfile: + ds1 = hfile['months'] + assert np.array_equal(m_array, ds1) + +def 
test_vlen_string_nc2(tmp_path): + tfile = tmp_path / 'test_vlen_string.nc' + m_array = ["January", "February", "March", "April"] + make_file_nc(tfile, m_array) + + with nc.Dataset(tfile, 'r') as ncfile: + ds1 = ncfile.variables['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) + + with h5py.File(tfile, 'r') as pfile: + ds1 = pfile['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) + + with pyfive.File(tfile) as hfile: + ds1 = hfile['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) \ No newline at end of file From c7e157c641011db58cd7fb486dc1db71c5310d13 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:46:52 +0000 Subject: [PATCH 086/105] Better .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1679dae..fbf3007 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ .coverage .pyc build -__pycache__ +__pycache__/ *.egg-info .idea .DS_Store test-reports/ +<_io.Bytes*> From 40c898b7700d0af92f137341087c1cd36034e6cb Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:57:51 +0000 Subject: [PATCH 087/105] Giving up on in-memory netcdf tests for #29 --- tests/test_vlen_str.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py index c0c36e0..28ec2c6 100644 --- a/tests/test_vlen_str.py +++ b/tests/test_vlen_str.py @@ -11,13 +11,17 @@ def make_file_hdf5(file_like, _vlen_string): v = hfile.create_dataset("var_len_str", (1,), dtype=dt) v[0] = _vlen_string -def make_file_nc(file_like,m_array): +def make_file_nc(file_like,m_array, inmemory=False): - n = nc.Dataset(file_like, "w", format="NETCDF4") + if inmemory: + n = nc.Dataset(file_like, 'w', diskless=True) + else: + n = nc.Dataset(file_like, "w", format="NETCDF4") n.createDimension("time", 4) months = n.createVariable("months", str, ("time",)) months[:] = np.array(m_array, dtype="S8") - n.close() + if not inmemory: + n.close() def test_vlen_string_hdf5(): @@ -29,12 +33,12 @@ def test_vlen_string_hdf5(): ds1 = hfile['var_len_str'] assert _vlen_string == ds1[0] -def test_vlen_string_nc1(): +def NOtest_vlen_string_nc1(): """ this verson currently fails because netcdf4 is doing something odd in memory """ t1file = io.BytesIO() m_array = ["January", "February", "March", "April"] - make_file_nc(t1file,m_array) + make_file_nc(t1file,m_array, inmemory=True) with nc.Dataset(t1file,'r') as ncfile: ds1 = ncfile['months'] From 8acf067af4e73487540956e8c7b3bc4c919eeb4f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Jan 2025 15:28:23 +0000 Subject: [PATCH 088/105] explicitly close POSIX files --- pyfive/h5d.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index d031b76..c33f206 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -273,8 +273,9 @@ def _get_contiguous_data(self, args): # Create a memory-map to the stored array, which # means that we will end up only copying the # sub-array into in memory. 
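+                    # keep a local handle so the file can be closed explicitly once the data has been copied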
+ fh = self._fh view = np.memmap( - self._fh, + fh, dtype=self.dtype, mode='c', offset=self.data_offset, @@ -285,6 +286,7 @@ def _get_contiguous_data(self, args): result = view[args] # Copy the data from disk to physical memory result = result.view(type=np.ndarray) + fh.close() return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) @@ -294,10 +296,14 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') + + fh = self._fh ref_addresses = np.memmap( - self._fh, dtype=(' Date: Tue, 21 Jan 2025 07:43:46 +0000 Subject: [PATCH 089/105] vlen strings data test case, vanilla version, and version with missing defined. tests fail. --- pyfive/h5d.py | 4 ++-- tests/test_vlen_str.py | 35 ++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bedd6aa..79a7ab3 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -286,6 +286,8 @@ def _get_contiguous_data(self, args): self._fh, dtype=(' {our_view}') + with open(our_view,'r') as f: + for line in f.readlines(): + print(line) + + with pyfive.File(our_file) as hfile: - ds1 = hfile['var_len_str'] - assert _vlen_string == ds1[0] + ds1 = hfile['var_len_str'][:] + print(ds1) + assert np.array_equal(ds1,vlen_strings) def NOtest_vlen_string_nc1(): """ this verson currently fails because netcdf4 is doing something odd in memory """ @@ -44,12 +55,10 @@ def NOtest_vlen_string_nc1(): ds1 = ncfile['months'] assert np.array_equal(m_array, ds1) - with h5py.File(t1file) as pfile: ds1 = pfile['months'] assert np.array_equal(m_array, ds1) - with pyfive.File(t1file) as hfile: ds1 = hfile['months'] assert np.array_equal(m_array, ds1) From 0e4a45b5472ae70274db5fcdc34b4c30c19c7335 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 21 Jan 2025 10:06:09 +0000 Subject: [PATCH 090/105] add extra posix test for file closure --- pyfive/h5d.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index c33f206..4f8e5a9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -302,7 +302,9 @@ def _get_contiguous_data(self, args): fh, dtype=(' Date: Tue, 21 Jan 2025 13:02:20 +0000 Subject: [PATCH 091/105] More on h5d and testing. The iter_chunks method is broken and we now have a test which shows that. --- pyfive/h5d.py | 29 ++++++++++++++++++----------- tests/test_h5d.py | 37 +++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 79a7ab3..7932f28 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -169,7 +169,7 @@ def iter_chunks(self, args): The optional sel argument is a slice or tuple of slices that defines the region to be used. If not set, the entire dataspace will be used for the iterator. For each chunk within the given region, the iterator yields a tuple of slices that gives the - intersection of the given chunk with the selection area. + intersection of the given chunk with the selection area. This can be used to read data in that chunk. """ if self.chunks is None: @@ -177,7 +177,7 @@ def iter_chunks(self, args): def convert_selection(tuple_of_slices): # while a slice of the form slice(a,b,None) is equivalent - # in funtion to a slice of form (a,b,1) it is not the same. + # in function to a slice of form (a,b,1) it is not the same. 
# For compatability I've gone for "the same" def convert_slice(aslice): if aslice.step is None: @@ -186,7 +186,19 @@ def convert_slice(aslice): return tuple([convert_slice(a) for a in tuple_of_slices]) array = ZarrArrayStub(self.shape, self.chunks) - indexer = OrthogonalIndexer(args, array) + + if args: + # convert to getitem type args + converted = [] + for s in args: + if isinstance(s, slice) and (s.stop - s.start) == 1: + converted.append(s.start) + else: + converted.append(s) + args = tuple(converted) + indexer = OrthogonalIndexer(*args, array) + else: + indexer = OrthogonalIndexer(args, array) for _, _, out_selection in indexer: yield convert_selection(out_selection) @@ -244,14 +256,8 @@ def _build_index(self, dataobject): dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) self._index = {} - # we do this to avoid either using an iterator or many - # temporary list creations if there are repeated chunk accesses. self._nthindex = [] - # The zarr orthogonal indexer returns the position in chunk - # space, whereas pyfive wants the position in array space. - # Here we index the pyfive chunk_index in zarr index space. - for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): start = node_key['chunk_offset'][:-1] @@ -398,16 +404,17 @@ def _get_selection_via_chunks(self, args): out = np.empty(out_shape, dtype=dtype, order=self._order) for chunk_coords, chunk_selection, out_selection in indexer: + # map from chunk coordinate space to array space which is how hdf5 keeps the index chunk_coords = tuple(map(mul, chunk_coords, self.chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: - # FIXME: Why do I assume it's always a V1 Btree? + # we are only using the class method here, future filter pipelines may need their own function chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] if true_dtype is not None: - # no idea if this is going to work! 
+ if dtype_class == 'REFERENCE': to_reference = np.vectorize(Reference) out = to_reference(out) diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 6f126d3..24bbb6e 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -2,17 +2,13 @@ import pyfive from pathlib import Path import pytest +from operator import mul mypath = Path(__file__).parent filename = 'compressed.hdf5' variable_name = 'dataset3' breaking_address=(2,0) -#mypath = mypath.parent/'bnl/' -#filename = 'da193o_25_day__grid_T_198807-198807.nc' -#variable_name = 'tos' -# breaking_address=(2,0,3) - def chunk_down(ff, vv): """ Test the chunking stuff @@ -33,14 +29,18 @@ def chunk_down(ff, vv): return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d, v -def get_chunks(ff, vv, view=0): +def get_chunks(ff, vv): var = ff[vv] chunks = list(var.iter_chunks()) - for i in range(view): - print('Chunk ',i) - print(chunks[i]) return chunks +def get_slices(var, using_py5): + """ Return suitlable test slice from var """ + rank = len(var.shape) + assert rank == 2 + slice1 = slice(2,3) + slice2 = slice(2,3) + return (slice1, slice2) def test_h5d_chunking_details(): @@ -52,7 +52,7 @@ def test_h5d_chunking_details(): assert h5detail == p5detail -def test_iter_chunks(): +def test_iter_chunks_all(): with h5py.File(mypath/filename) as f: h5chunks = get_chunks(f, variable_name) @@ -62,3 +62,20 @@ def test_iter_chunks(): assert h5chunks == p5chunks +def test_iter_chunks_sel(): + """ I don't really understand what h5py is doing here, so + obviously I don't have the right method in pyfive and/ + or the right test #FIXME """ + + with h5py.File(mypath/filename) as f: + var = f[variable_name] + slices = get_slices(var, False) + h5chunks = list(var.iter_chunks(slices)) + print(h5chunks) + + with pyfive.File(mypath/filename) as g: + var = g[variable_name] + slices = get_slices(var, True) + p5chunks = list(var.iter_chunks(slices)) + + assert h5chunks == p5chunks \ No newline at end of file From 82dc2a9b75ed2d7a63b86d706b68e4029adccb47 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 14:19:35 +0000 Subject: [PATCH 092/105] Support for pyactivestorage via a bespoke `get_chunk_info_from_chunk_coord` method --- pyfive/h5d.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7932f28..1ff1cf3 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -107,7 +107,6 @@ def __eq__(self, other): def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. - Our index is in chunk space, but H5Py wants it in coordinate space. """ if not self._index: return None @@ -226,6 +225,20 @@ def set_psuedo_chunk_size(self, newsize_MB): pass # silently ignore it, we'll be using a np.memmap else: raise ValueError('Attempt to set pseudo chunking on non-contigous variable') + + def get_chunk_info_from_chunk_coord(self, chunk_coords): + """ + Retrieve storage information about a chunk specified by its index. + This index is in chunk space (as used by zarr) and needs to be converted + to hdf5 coordinate space. Additionaly, if this file is not chunked, the storeinfo + is returned for the contiguous data as if it were one chunk. 
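+        For example, with chunks of (6, 32, 32) the chunk coordinate (1, 0, 2) maps to the HDF5 coordinate key (6, 0, 64).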
+ """ + if not self._index: + dummy = StoreInfo(None, None, self.data_offset, self.dtype.itemsize*np.prod(self.shape)) + return dummy + else: + coord_index = tuple(map(mul, chunk_coords, self.chunks)) + return self.get_chunk_info_by_coord(coord_index) ###### # The following DatasetID methods are used by PyFive and you wouldn't expect From 838b0a52f4d258ccc82063fe9034bd46364e58a1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 17:02:34 +0000 Subject: [PATCH 093/105] better ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fbf3007..94d92df 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ .DS_Store test-reports/ <_io.Bytes*> +tests/__pycache__/ From a6336839490935ea0efe8a6b0bc9ef935ece7181 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 18:13:20 +0000 Subject: [PATCH 094/105] The first vlen data test passes with this code --- pyfive/core.py | 2 ++ pyfive/dataobjects.py | 10 +++------- pyfive/h5d.py | 12 +++++++++++- pyfive/misc_low_level.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/pyfive/core.py b/pyfive/core.py index b0d2718..4d9dba6 100644 --- a/pyfive/core.py +++ b/pyfive/core.py @@ -60,3 +60,5 @@ def _unpack_integer(nbytes, buf, offset=0): fmt = "{}s".format(nbytes) values = struct.unpack_from(fmt, buf, offset=offset) return int.from_bytes(values[0], byteorder="little", signed=False) + + diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index ba24a7b..2639ab5 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,7 @@ from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders from pyfive.btree import BTreeV2AttrCreationOrder, BTreeV2AttrNames from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER -from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID from pyfive.h5d import DatasetID from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub @@ -686,9 +686,8 @@ def is_datatype(self): return True else: return False - - + def determine_data_shape(buf, offset): """ Return the shape of the dataset pointed to in a Dataspace message. """ @@ -723,10 +722,7 @@ def determine_data_shape(buf, offset): # all metadata fields are stored in little-endian byte order. -GLOBAL_HEAP_ID = OrderedDict(( - ('collection_address', 'Q'), # 8 byte addressing - ('object_index', 'I'), -)) + GLOBAL_HEAP_ID_SIZE = _structure_size(GLOBAL_HEAP_ID) # IV.A.2.m The Attribute Message diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 79a7ab3..5fcf1bd 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -4,7 +4,9 @@ from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks from pyfive.core import Reference, UNDEFINED_ADDRESS +from pyfive.misc_low_level import get_vlen_string_data from io import UnsupportedOperation + import struct import logging from importlib.metadata import version @@ -69,6 +71,13 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): self.shape = dataobject.shape self.rank = len(self.shape) self.chunks = dataobject.chunks + + # experimental code. We need to find out whether or not this + # is unnecessary duplication. At the moment it seems best for + # each variable to have it's own copy of those needed for + # data access. Though that's clearly not optimal if they include + # other data. To be determined. 
+ self._global_heaps={} self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) @@ -287,7 +296,8 @@ def _get_contiguous_data(self, args): shape=self.shape, order=self._order) return np.array([Reference(addr) for addr in ref_addresses])[args] elif dtype_class == 'VLEN_STRING': - raise NotImplementedError('Variable length strings under construction') + fh = self._fh + return get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index e4c8786..e7993e7 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -11,6 +11,7 @@ from .core import _unpack_integer from .core import InvalidHDF5File from .core import UNDEFINED_ADDRESS +from math import prod class SuperBlock(object): @@ -333,6 +334,33 @@ def _indirect_info(self, nrows): nindirect = nobjects - ndirect_max return ndirect, nindirect +def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): + """ Return the data for a variable which is + made up of variable length string data """ + fh.seek(data_offset) + count = prod(shape) + _, _, character_set = dtype + value = np.empty(count,dtype=object) + offset = 0 + buf = fh.read(16*count) + for i in range(count): + vlen_size, = struct.unpack_from(' Date: Tue, 21 Jan 2025 19:59:46 +0000 Subject: [PATCH 095/105] closer to a solution for #29. These tests pass, but we need to deal with multidimensional arrays as well. --- pyfive/dataobjects.py | 11 +++++++++-- pyfive/misc_low_level.py | 7 +++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 2639ab5..a327163 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -335,8 +335,15 @@ def fillvalue(self): size = 0 if size: - payload = self.msg_data[offset:offset+size] - fillvalue = np.frombuffer(payload, self.dtype, count=1)[0] + if isinstance(self.dtype, tuple): + try: + assert self.dtype[0] == 'VLEN_STRING' + except: + raise ValueError('Unrecognised fill type') + fillvalue = self._attr_value(self.dtype, self.msg_data, 1, offset)[0] + else: + payload = self.msg_data[offset:offset+size] + fillvalue = np.frombuffer(payload, self.dtype, count=1)[0] else: fillvalue = 0 return fillvalue diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index e7993e7..2d11d0c 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -12,6 +12,7 @@ from .core import InvalidHDF5File from .core import UNDEFINED_ADDRESS from math import prod +import numpy as np class SuperBlock(object): @@ -335,8 +336,9 @@ def _indirect_info(self, nrows): return ndirect, nindirect def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): - """ Return the data for a variable which is - made up of variable length string data """ + """ Return the data for a variable which is made up of variable length string data """ + # we need to import this from DatasetID, and that's imported from Dataobjects hence + # hiding it here in misc_low_level. 
fh.seek(data_offset) count = prod(shape) _, _, character_set = dtype @@ -347,6 +349,7 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' Date: Wed, 22 Jan 2025 07:35:49 +0000 Subject: [PATCH 096/105] Partially working vlen string support, issues with global heap usage and dtypes remain --- pyfive/dataobjects.py | 1 + pyfive/misc_low_level.py | 2 +- tests/test_vlen_str.py | 105 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 2 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index a327163..d1bb5f5 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -285,6 +285,7 @@ def _vlen_size_and_data(self, buf, offset): # stored in the data object storage. gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4) gheap_address = gheap_id['collection_address'] + print('Collection address in _vlen', gheap_address) if gheap_address not in self._global_heaps: # load the global heap and cache the instance gheap = GlobalHeap(self.fh, gheap_address) diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 2d11d0c..58fd50c 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -349,7 +349,7 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' Failing {k}') + print('Original data', v) + print('h5py', hdata) + print('pyfive', pdata) + raise From e7c465e3b096a0e9cd170fee9a54a978fc1be6b8 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 22 Jan 2025 09:08:49 +0000 Subject: [PATCH 097/105] Passing all vlen tests for #29, though we are ignoreing the dtype of the dataset itself, which needs to be a new issue. Also the caching stuff needs to be a new issue. --- pyfive/dataobjects.py | 2 +- pyfive/h5d.py | 3 ++- pyfive/misc_low_level.py | 17 ++++++++++------- tests/test_vlen_str.py | 26 +++++++++++++------------- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index d1bb5f5..55af76c 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -285,7 +285,7 @@ def _vlen_size_and_data(self, buf, offset): # stored in the data object storage. 
gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4) gheap_address = gheap_id['collection_address'] - print('Collection address in _vlen', gheap_address) + #print('Collection address in _vlen', gheap_address) if gheap_address not in self._global_heaps: # load the global heap and cache the instance gheap = GlobalHeap(self.fh, gheap_address) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 5fcf1bd..b1ad147 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -297,7 +297,8 @@ def _get_contiguous_data(self, args): return np.array([Reference(addr) for addr in ref_addresses])[args] elif dtype_class == 'VLEN_STRING': fh = self._fh - return get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + return array.reshape(self.shape, order=self._order) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 58fd50c..216ad62 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -342,6 +342,8 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): fh.seek(data_offset) count = prod(shape) _, _, character_set = dtype + if int(character_set) not in [0, 1]: + raise ValueError(f'Unexpected string type, cannot decode character set {character_set}') value = np.empty(count,dtype=object) offset = 0 buf = fh.read(16*count) @@ -349,19 +351,20 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' {our_view}') - with open(our_view,'r') as f: - for line in f.readlines(): - print(line) + #os.system(f'h5dump {our_file} > {our_view}') + #with open(our_view,'r') as f: + # for line in f.readlines(): + # print(line) with pyfive.File(our_file) as hfile: @@ -163,22 +163,22 @@ def test_vlen_string_nc2(tmp_path): ds1 = hfile['months'][:] assert np.array_equal(m_array, ds1.astype(str)) -def test_pathalogical_strings(tmp_path): +def test_pathological_strings(tmp_path): tfile = tmp_path/'test_strings.nc' validation=make_pathological_nc(tfile) + warnings.warn('Validation of variable length strings assumes h5py is wrong') with pyfive.File(tfile) as pfile: with h5py.File(tfile) as hfile: for k,v in validation.items(): hdata = hfile[k][...] - #decode_function = np.vectorize(lambda x: x.decode('utf-8')) - #hdata = decode_function(hdata) pdata = pfile[k][...] 
try: assert np.array_equal(v, pdata),f'Failed original test for {k}' - assert np.array_equal(hdata, pdata), f'Failed comparison test for {d}' + assert np.array_equal(hdata.astype(str), pdata.astype(str)), f'Failed comparison test for {k}' + print(f'--> Passing {k} ({hdata.dtype},{pdata.dtype})') except: - print(f'---> Failing {k}') + print(f'---> Failing {k} ({hdata.dtype},{pdata.dtype})') print('Original data', v) - print('h5py', hdata) - print('pyfive', pdata) + print('h5py', hfile[k].dtype, hdata) + print('pyfive',pfile[k].dtype, pdata) raise From e40c7d75d9472e5efa07e5866febaf17d1badb9f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 22 Jan 2025 14:53:55 +0000 Subject: [PATCH 098/105] Remaining tests for vlen and iterchunks, support for vlen dtypes (closes #34) --- pyfive/h5d.py | 71 +++++++++++++++++++++++++------------------- pyfive/high_level.py | 7 +---- tests/test_h5d.py | 18 ++++++----- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index dd5381d..af53583 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -84,9 +84,9 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? #FIXME - self.dtype = dataobject.dtype + self._dtype = dataobject.dtype else: - self.dtype = np.dtype(dataobject.dtype) + self._dtype = np.dtype(dataobject.dtype) self._meta = DatasetMeta(dataobject) @@ -158,13 +158,13 @@ def get_data(self, args): case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self.dtype)[args] + return np.zeros(self.shape, dtype=self._dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: - return np.zeros(self.shape, dtype=self.dtype)[args] - if isinstance(self.dtype, tuple): + return np.zeros(self.shape, dtype=self._dtype)[args] + if isinstance(self._dtype, tuple): # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] else: @@ -196,19 +196,20 @@ def convert_slice(aslice): array = ZarrArrayStub(self.shape, self.chunks) if args: - # convert to getitem type args - converted = [] - for s in args: - if isinstance(s, slice) and (s.stop - s.start) == 1: - converted.append(s.start) - else: - converted.append(s) - args = tuple(converted) - indexer = OrthogonalIndexer(*args, array) + # We have implemented what the docstring says it does below, + # but that's not what h5py actually does, and what is it + # actually does is useless, so we haven't implemented that + raise NotImplementedError("h5py does something silly, and our implementation does not") + indexer = OrthogonalIndexer(args[0], array) else: indexer = OrthogonalIndexer(args, array) - for _, _, out_selection in indexer: - yield convert_selection(out_selection) + for chunk_coords, chunk_selection, out_selection in indexer: + if args: + yield convert_selection(chunk_selection) + else: + yield convert_selection(out_selection) + + ##### The following property is made available to support ActiveStorage ##### and to help those who may want to generate kerchunk indices and @@ -243,7 +244,7 @@ def get_chunk_info_from_chunk_coord(self, chunk_coords): is returned for the contiguous data as if it were one chunk. 
""" if not self._index: - dummy = StoreInfo(None, None, self.data_offset, self.dtype.itemsize*np.prod(self.shape)) + dummy = StoreInfo(None, None, self.data_offset, self._dtype.itemsize*np.prod(self.shape)) return dummy else: coord_index = tuple(map(mul, chunk_coords, self.chunks)) @@ -291,7 +292,7 @@ def _build_index(self, dataobject): def _get_contiguous_data(self, args): - if not isinstance(self.dtype, tuple): + if not isinstance(self._dtype, tuple): if not self.posix: # Not posix return self._get_direct_from_contiguous(args) @@ -304,7 +305,7 @@ def _get_contiguous_data(self, args): fh = self._fh view = np.memmap( fh, - dtype=self.dtype, + dtype=self._dtype, mode='c', offset=self.data_offset, shape=self.shape, @@ -319,9 +320,9 @@ def _get_contiguous_data(self, args): except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: - dtype_class = self.dtype[0] + dtype_class = self._dtype[0] if dtype_class == 'REFERENCE': - size = self.dtype[1] + size = self._dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') @@ -336,7 +337,7 @@ def _get_contiguous_data(self, args): return result elif dtype_class == 'VLEN_STRING': fh = self._fh - array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) return array.reshape(self.shape, order=self._order) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -353,7 +354,7 @@ def _get_direct_from_contiguous(self, args=None): """ def __get_pseudo_shape(): """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - element_size = self.dtype.itemsize + element_size = self._dtype.itemsize chunk_shape = np.copy(self.shape) while True: chunk_size = np.prod(chunk_shape) * element_size @@ -381,17 +382,17 @@ def coord_to_offset(self,chunk_coords): array = ZarrArrayStub(self.shape, chunk_shape) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self._order) + out = np.empty(out_shape, dtype=self._dtype, order=self._order) chunk_size = np.prod(chunk_shape) for chunk_coords, chunk_selection, out_selection in indexer: index = self.data_offset + offset_finder.coord_to_offset(chunk_coords) fh.seek(index) chunk_buffer = fh.read(stride) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = np.frombuffer(chunk_buffer, dtype=self._dtype).copy() if len(chunk_data) < chunk_size: # last chunk over end of file - padded_chunk_data = np.zeros(chunk_size, dtype=self.dtype) + padded_chunk_data = np.zeros(chunk_size, dtype=self._dtype) padded_chunk_data[:len(chunk_data)] = chunk_data chunk_data = padded_chunk_data out[out_selection] = chunk_data.reshape(chunk_shape, order=self._order)[chunk_selection] @@ -402,7 +403,7 @@ def coord_to_offset(self,chunk_coords): return out else: - itemsize = np.dtype(self.dtype).itemsize + itemsize = np.dtype(self._dtype).itemsize num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize @@ -410,7 +411,7 @@ def coord_to_offset(self,chunk_coords): # read the lot) fh.seek(self.data_offset) chunk_buffer = fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = np.frombuffer(chunk_buffer, dtype=self._dtype).copy() chunk_data = chunk_data.reshape(self.shape, order=self._order) chunk_data = chunk_data[args] if self.posix: @@ -436,9 +437,9 @@ def 
_get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ # need a local dtype as we may override it for a reference read. - dtype = self.dtype + dtype = self._dtype - if isinstance(self.dtype, tuple): + if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) dtype_class = dtype[0] @@ -465,7 +466,7 @@ def _get_selection_via_chunks(self, args): filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: # we are only using the class method here, future filter pipelines may need their own function - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self._dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] @@ -505,6 +506,14 @@ def _fh(self): return fh + @property + def dtype(self): + if isinstance(self._dtype,tuple): + if self._dtype[0] == 'VLEN_STRING': + return object + + return self._dtype + class DatasetMeta: diff --git a/pyfive/high_level.py b/pyfive/high_level.py index d1350dd..1fa6859 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -349,12 +349,7 @@ def ndim(self): @property def dtype(self): """ dtype attribute. """ - try: - return np.dtype(self.id.dtype) - except NotImplementedError as e: - raise NotImplementedError(f'{e} (for {self.name})') - except TypeError as e: - return self.id.dtype + return self.id.dtype @property def value(self): diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 24bbb6e..4fdb7fb 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -38,8 +38,8 @@ def get_slices(var, using_py5): """ Return suitlable test slice from var """ rank = len(var.shape) assert rank == 2 - slice1 = slice(2,3) - slice2 = slice(2,3) + slice1 = slice(8,15) + slice2 = slice(8,15) return (slice1, slice2) def test_h5d_chunking_details(): @@ -71,11 +71,13 @@ def test_iter_chunks_sel(): var = f[variable_name] slices = get_slices(var, False) h5chunks = list(var.iter_chunks(slices)) - print(h5chunks) + #print(h5chunks,var.shape, var.chunks) - with pyfive.File(mypath/filename) as g: - var = g[variable_name] - slices = get_slices(var, True) - p5chunks = list(var.iter_chunks(slices)) + with pytest.raises(NotImplementedError): + with pyfive.File(mypath/filename) as g: + var = g[variable_name] + slices = get_slices(var, True) + p5chunks = list(var.iter_chunks(slices)) + #print(p5chunks,var.shape, var.chunks) - assert h5chunks == p5chunks \ No newline at end of file + assert h5chunks == p5chunks \ No newline at end of file From 599db7b993b23d006317ce665e22e8ff676982d5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 23 Jan 2025 09:57:51 +0000 Subject: [PATCH 099/105] dev --- pyfive/h5d.py | 113 ++++++++++++++++++++++++++++++++------- pyfive/misc_low_level.py | 5 +- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index af53583..30c2442 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -303,7 +303,7 @@ def _get_contiguous_data(self, args): # means that we will end up only copying the # sub-array into in memory. 
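# The copy-on-write memory map referred to above can be shown in isolation;
# 'example.bin' and the shape/dtype here are placeholders for illustration,
# not anything pyfive itself creates:
import numpy as np
np.arange(12, dtype='<f8').reshape(3, 4).tofile('example.bin')
view = np.memmap('example.bin', dtype='<f8', mode='c', offset=0,
                 shape=(3, 4), order='C')
subset = np.asarray(view[1:, :2])   # only the pages holding these rows are read
print(subset)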
fh = self._fh - view = np.memmap( + view = np.memmap( fh, dtype=self._dtype, mode='c', @@ -338,7 +338,7 @@ def _get_contiguous_data(self, args): elif dtype_class == 'VLEN_STRING': fh = self._fh array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) - return array.reshape(self.shape, order=self._order) + return array.reshape(self.shape, order=self._order)[args] else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -438,7 +438,8 @@ def _get_selection_via_chunks(self, args): """ # need a local dtype as we may override it for a reference read. dtype = self._dtype - + print ('DTYPE', dtype) + print ('args =', args) if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) @@ -448,8 +449,11 @@ def _get_selection_via_chunks(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type') dtype = ' Date: Thu, 23 Jan 2025 17:22:09 +0000 Subject: [PATCH 100/105] dev --- pyfive/h5d.py | 158 ++++++++++++++++----------------------- pyfive/high_level.py | 1 + pyfive/indexing.py | 2 +- pyfive/misc_low_level.py | 73 +++++++++++++++--- tests/test_vlen_str.py | 73 ++++++++++++++++-- 5 files changed, 192 insertions(+), 115 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 30c2442..6a5b018 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -4,7 +4,7 @@ from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks from pyfive.core import Reference, UNDEFINED_ADDRESS -from pyfive.misc_low_level import get_vlen_string_data +from pyfive.misc_low_level import get_vlen_string_data_contiguous, get_vlen_string_data_from_chunk from io import UnsupportedOperation import struct @@ -152,20 +152,30 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + dtype = self._dtype match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self._dtype)[args] + if isinstance(dtype, tuple): + return np.zeros(self.shape, dtype='U')[args] + else: + print(99, self.__dict__) + return np.full(self.shape, self.fillvalue, dtype=dtype)[args] +# return np.zeros(self.shape, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: - return np.zeros(self.shape, dtype=self._dtype)[args] - if isinstance(self._dtype, tuple): - # references need to read all the chunks for now + if isinstance(dtype, tuple): + return np.zeros(self.shape, dtype='U')[args] + else: + print(991) + return np.zeros(self.shape, dtype=dtype)[args] + if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": + # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] else: # this is lazily reading only the chunks we need @@ -337,7 +347,16 @@ def _get_contiguous_data(self, args): return result elif dtype_class == 'VLEN_STRING': fh = self._fh - array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) + array = get_vlen_string_data_contiguous( + fh, + self.data_offset, + self._global_heaps, + self.shape, + self._dtype + ) + if self.posix: + fh.close() + return array.reshape(self.shape, order=self._order)[args] else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -432,14 +451,13 
@@ def _get_raw_chunk(self, storeinfo): return out def _get_selection_via_chunks(self, args): - """ - Use the zarr orthogonal indexer to extract data for a specfic selection within - the dataset array and in doing so, only load the relevant chunks. + """Use the zarr orthogonal indexer to extract data for a specfic + selection within the dataset array and in doing so, only load + the relevant chunks. + """ # need a local dtype as we may override it for a reference read. dtype = self._dtype - print ('DTYPE', dtype) - print ('args =', args) if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) @@ -449,110 +467,62 @@ def _get_selection_via_chunks(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type') dtype = '' % info def __getitem__(self, args): + print (88888, self.fillvalue) data = self.id.get_data(args) if self._astype is None: return data diff --git a/pyfive/indexing.py b/pyfive/indexing.py index e1d9509..b8e4c62 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -757,7 +757,7 @@ def __init__(self, selection, array): f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexer = SliceDiImndexer(slice_, dim_len, dim_chunk_size) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 5cd8f8a..19419e9 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -132,8 +132,7 @@ class GlobalHeap(object): """ def __init__(self, fh, offset): - - print ('fh=', fh, offset) + fh.seek(offset) header = _unpack_struct_from_file(GLOBAL_HEAP_HEADER, fh) assert header['signature'] == b'GCOL' @@ -336,7 +335,9 @@ def _indirect_info(self, nrows): nindirect = nobjects - ndirect_max return ndirect, nindirect -def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): +def get_vlen_string_data_contiguous( + fh, data_offset, global_heaps, shape, dtype +): """ Return the data for a variable which is made up of variable length string data """ # we need to import this from DatasetID, and that's imported from Dataobjects hence # hiding it here in misc_low_level. 
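# For orientation: each variable-length string element in the dataset is a
# 16-byte descriptor (4-byte length, then a global heap ID made of an 8-byte
# collection address and a 4-byte object index); the string bytes themselves
# live in the global heap object it points at.  A sketch of unpacking one
# descriptor, with made-up values and the heap lookup itself omitted:
import struct
descriptor = struct.pack('<IQI', 5, 2048, 3)      # length, heap address, object index
vlen_size, heap_address, object_index = struct.unpack_from('<IQI', descriptor, 0)
assert (vlen_size, heap_address, object_index) == (5, 2048, 3)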
@@ -349,10 +350,8 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): offset = 0 buf = fh.read(16*count) for i in range(count): - vlen_size, = struct.unpack_from(' Date: Thu, 23 Jan 2025 17:51:56 +0000 Subject: [PATCH 101/105] dev --- pyfive/h5d.py | 14 ++++++-------- pyfive/high_level.py | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 6a5b018..00fdd5b 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -150,7 +150,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): storeinfo = self._index[chunk_position] return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) - def get_data(self, args): + def get_data(self, args, fillvalue): """ Called by the dataset getitem method """ dtype = self._dtype match self.layout_class: @@ -158,13 +158,12 @@ def get_data(self, args): raise NotImplementedError("Compact Storage") case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: - # no storage is backing array, return all zeros + # no storage is backing array, return an array of + # fill values if isinstance(dtype, tuple): - return np.zeros(self.shape, dtype='U')[args] - else: - print(99, self.__dict__) - return np.full(self.shape, self.fillvalue, dtype=dtype)[args] -# return np.zeros(self.shape, dtype=dtype)[args] + dtype = np.array(fillvalue).dtype + + return np.full(self.shape, fillvalue, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage @@ -172,7 +171,6 @@ def get_data(self, args): if isinstance(dtype, tuple): return np.zeros(self.shape, dtype='U')[args] else: - print(991) return np.zeros(self.shape, dtype=dtype)[args] if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": # references need to read all the chunks for now diff --git a/pyfive/high_level.py b/pyfive/high_level.py index ffdb19c..32c0e5a 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -8,7 +8,7 @@ import numpy as np -from pyfive.core import Reference +from pyfive.core import Reference, UNDEFINED_ADDRESS from pyfive.dataobjects import DataObjects, DatasetID from pyfive.misc_low_level import SuperBlock @@ -298,8 +298,7 @@ def __repr__(self): return '' % info def __getitem__(self, args): - print (88888, self.fillvalue) - data = self.id.get_data(args) + data = self.id.get_data(args, self.fillvalue) if self._astype is None: return data return data.astype(self._astype) From bd161475b1d128c1190071b7fadc733fa042c55f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 24 Jan 2025 16:27:36 +0000 Subject: [PATCH 102/105] vlen related fixes --- pyfive/h5d.py | 25 +++++++++++++++++-------- pyfive/high_level.py | 2 +- pyfive/misc_low_level.py | 2 +- tests/test_fillvalue.py | 38 ++++++++++++++++++++++++++++++++++++++ tests/test_vlen_str.py | 12 ++++++------ 5 files changed, 63 insertions(+), 16 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 00fdd5b..f731699 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -163,15 +163,21 @@ def get_data(self, args, fillvalue): if isinstance(dtype, tuple): dtype = np.array(fillvalue).dtype + # Note: We can improve this so only an array of + # the shape implied by 'args' is + # created. One for the future. 
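# The unallocated-storage branch ending on the next line just synthesises the
# result from the fill value; in isolation the idiom is as below (the shape,
# fill value and selection are invented for illustration):
import numpy as np
shape, fillvalue, args = (4, 5), -999.0, (slice(1, 3), slice(None))
dtype = np.array(fillvalue).dtype            # mirrors the tuple-dtype fallback above
data = np.full(shape, fillvalue, dtype=dtype)[args]
assert data.shape == (2, 5) and (data == fillvalue).all()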
return np.full(self.shape, fillvalue, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: + # no storage is backing array, return an array of + # fill values if isinstance(dtype, tuple): - return np.zeros(self.shape, dtype='U')[args] - else: - return np.zeros(self.shape, dtype=dtype)[args] + dtype = np.array(fillvalue).dtype + + return np.full(self.shape, fillvalue, dtype=dtype)[args] + if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] @@ -462,9 +468,11 @@ def _get_selection_via_chunks(self, args): dtype_class = dtype[0] if dtype_class == 'REFERENCE': size = dtype[1] + dtype = ' Date: Tue, 28 Jan 2025 09:35:55 +0000 Subject: [PATCH 103/105] Update pyfive/indexing.py --- pyfive/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfive/indexing.py b/pyfive/indexing.py index b8e4c62..e1d9509 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -757,7 +757,7 @@ def __init__(self, selection, array): f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDiImndexer(slice_, dim_len, dim_chunk_size) + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: From eed7e99ce031db20f89339be2115187de9e1a581 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 30 Jan 2025 15:03:05 +0000 Subject: [PATCH 104/105] install only in test mode --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5e99483..623695f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest - python -m pip install -e . + python -m pip install .[test] # install in test mode if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | From 6255fc00eb658c65847b7bb1b88cf967e67f7f12 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 30 Jan 2025 15:15:56 +0000 Subject: [PATCH 105/105] actual correct name for testing regime --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 623695f..3e91bd9 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest - python -m pip install .[test] # install in test mode + python -m pip install .[testing] # install in test mode if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: |