From 02fca542f531901b565116ec893a1b0d1d59cb20 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 12:20:54 +0000 Subject: [PATCH 001/105] Using s3 to get at some real data for testing --- bnl/playing.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index ebd6b01..8666d22 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -1,18 +1,35 @@ import pyfive from pathlib import Path from pyfive.as_dataobjects import ADataObjects +import s3fs +import h5py MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -MYFILE = '../tests/chunked.hdf5' +#MYFILE = '../tests/chunked.hdf5' +MYFILE = 'da193a_25_day__198807-198807.nc' + MYPATH = Path(__file__).parent #f = h5py.File(MYPATH/MYFILE,'r') -f2 = pyfive.File(MYPATH/MYFILE) -path = 'dataset1' -link_target = f2._links[path] -dsref = ADataObjects(f2.file._fh, link_target) -chunk_index = dsref.get_offset_addresses() -print(chunk_index) + +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' +fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) +uri = 'bnl/'+MYFILE + +with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + #f2 = pyfive.File(MYPATH/MYFILE) + path = 'dataset1' + path = 'tos' + path = 'm01s06i247_4' + link_target = f2._links[path] + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + for e in chunk_index: + print(e) +with fs.open(uri,'rb') as s3file2: + f3 = h5py.File(s3file2,'r') + print(f3[path]) #v='tos' From df3669a11438be9b12dcb63e92cf6c4d74598a9f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 12:37:31 +0000 Subject: [PATCH 002/105] Getting the address as well as size into the index --- pyfive/as_dataobjects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index b811fb3..ccf9c38 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -57,7 +57,7 @@ def _as_get_chunk_addresses(self, args): size = node_key['chunk_size'] start = node_key['chunk_offset'][:-1] region = [slice(i, i+j) for i, j in zip(start, self.shape)] - self._as_chunk_index.append([region, start, size]) + self._as_chunk_index.append([region, start, addr, size]) if args is not None: return NotImplementedError From 16c0e813e0f769e5a45c55c1cb4272f09d5edf64 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 15:22:42 +0000 Subject: [PATCH 003/105] With timer --- bnl/playing.py | 55 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 8666d22..56ae5a8 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -3,33 +3,48 @@ from pyfive.as_dataobjects import ADataObjects import s3fs import h5py +import time MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' #MYFILE = '../tests/chunked.hdf5' MYFILE = 'da193a_25_day__198807-198807.nc' +INPUT_OPTIONS = [ + ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), + ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), + ('../tests/chunked.hdf5','dataset1','local'), + ('CMIP6-test.nc','tas', 's3') +] + MYPATH = Path(__file__).parent -#f = h5py.File(MYPATH/MYFILE,'r') - -S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' -fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) -uri = 'bnl/'+MYFILE - -with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - #f2 = pyfive.File(MYPATH/MYFILE) - path = 'dataset1' - path = 'tos' - path = 
'm01s06i247_4' - link_target = f2._links[path] - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() - for e in chunk_index: - print(e) -with fs.open(uri,'rb') as s3file2: - f3 = h5py.File(s3file2,'r') - print(f3[path]) +option = 1 +location = INPUT_OPTIONS[option][2] +MYFILE = INPUT_OPTIONS[option][0] + +if location == 's3': + + S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' + fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + uri = 'bnl/'+MYFILE + + t1 = time.time() + with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + #f2 = pyfive.File(MYPATH/MYFILE) + path = INPUT_OPTIONS[option][1] + link_target = f2._links[path] + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + t2 = time.time() + print(f'Chunk index timer {t2-t1:.2}s') + for e in chunk_index: + print(e) + + + with fs.open(uri,'rb') as s3file2: + f3 = h5py.File(s3file2,'r') + print(f3[path]) #v='tos' From c464be8e881d85770e752e3d7e90b4cda0cf38d2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:12:15 +0000 Subject: [PATCH 004/105] Not working yet. Don't reckon I have the arguments to OrthogonalIndexer right yet. --- bnl/playing.py | 53 ++++++++++++++++++++++++++-------------- pyfive/as_dataobjects.py | 45 ++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 56ae5a8..e379e1b 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -18,9 +18,24 @@ MYPATH = Path(__file__).parent -option = 1 +option = 2 location = INPUT_OPTIONS[option][2] MYFILE = INPUT_OPTIONS[option][0] +path = INPUT_OPTIONS[option][1] + + +def working(f2, path, printindex=True): + link_target = f2._links[path] + t1 = time.time() + dsref = ADataObjects(f2.file._fh, link_target) + chunk_index = dsref.get_offset_addresses() + t2 = time.time() + print(f'Chunk index timer {t2-t1:.2}s') + if printindex: + for e in chunk_index: + print(e) + + return t2 if location == 's3': @@ -31,29 +46,31 @@ t1 = time.time() with fs.open(uri,'rb') as s3file2: f2 = pyfive.File(s3file2) - #f2 = pyfive.File(MYPATH/MYFILE) - path = INPUT_OPTIONS[option][1] - link_target = f2._links[path] - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() - t2 = time.time() - print(f'Chunk index timer {t2-t1:.2}s') - for e in chunk_index: - print(e) - + t2 = working(f2, path) + print(f'Complete chunking timer {t2-t1:.2}s') with fs.open(uri,'rb') as s3file2: f3 = h5py.File(s3file2,'r') print(f3[path]) +elif location == 'local': -#v='tos' -#tos =f2[v] -#v='dataset1' -#print(tos) -#x = tos[2,:] -#print(x) -#print(tos.shape) + + f2 = pyfive.File(MYPATH/MYFILE) + x = f2[path] + y = x[2,:] + print(x.shape) + print(y) + t1 = time.time() + t2 = working(f2, path, printindex=False) + d = ADataObjects(f2.file._fh, f2._links[path]) + r = d[2:] + if len(r) != len(y): + print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") + print(r) + raise ValueError('Busted') +else: + raise ValueError('You stuffed up') diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index ccf9c38..ab5e3da 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -2,6 +2,19 @@ from .datatype_msg import DatatypeMessage import numpy as np from .btree import BTreeV1RawDataChunks +from .indexing import OrthogonalIndexer + + +class ZarrSubstitute: + """ + This mimics the funcationality of the zarr array produced by kerchunk, + but with 
only what is needed for indexing + """ + def __init__(self, chunk_index, shape, chunks): + self.chunk_index = chunk_index + self._chunks = chunks + self._shape = shape + class ADataObjects(DataObjects): """ @@ -16,7 +29,7 @@ def __init__(self,*args,**kwargs): # not yet sure we need our own copy self._as_chunk_index=[] - def get_offset_addresses(self, args=None): + def get_offset_addresses(self): """ Get the offset addresses for the data requested """ @@ -32,17 +45,18 @@ def get_offset_addresses(self, args=None): elif layout_class == 1: # contiguous storage return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage - return self._as_get_chunk_addresses(args) + return self._as_get_chunk_addresses() - def _as_get_chunk_addresses(self, args): + def _as_get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks known to the b-tree of this object """ - self._get_chunk_params() - if self._as_chunk_index == []: + + self._get_chunk_params() + chunk_btree = BTreeV1RawDataChunks( self.fh, self._chunk_address, self._chunk_dims) @@ -59,9 +73,24 @@ def _as_get_chunk_addresses(self, args): region = [slice(i, i+j) for i, j in zip(start, self.shape)] self._as_chunk_index.append([region, start, addr, size]) - if args is not None: - return NotImplementedError - return self._as_chunk_index + def __getitem__(self, args): + + if self._as_chunk_index == []: + self._as_get_chunk_addresses + + array = ZarrSubstitute(self._as_chunk_index, self.shape, self.chunks) + + indexer = OrthogonalIndexer(args, array) + stripped_indexer = [(a, b, c) for a,b,c in indexer] + print(stripped_indexer) + mycoords = [] + for chunk_coords, chunk_selection, out_selection in stripped_indexer: + coord = '.'.join([str(c) for c in chunk_coords]) + mycoords.append((chunk_coords,coord)) + print("This isn't yet doing what you think it is, it's only returning chunk indices for your selection") + return mycoords + + From afaa4f57ae72fe404b184846aa0a3ba4dd6e13a2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:16:19 +0000 Subject: [PATCH 005/105] A few more notes in the code so I can come back to it anon. --- bnl/playing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bnl/playing.py b/bnl/playing.py index e379e1b..fd1f321 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -65,10 +65,14 @@ def working(f2, path, printindex=True): t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) r = d[2:] - if len(r) != len(y): + if len(r) >= len(y): print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") + # as it's stands, r should be a set of indices for chunks containing y, which should have + # length less than or equal to length (y). At the moment it's too long, so that's clearly + # broken print(r) raise ValueError('Busted') + raise ValueError('Busted, but in a better way') else: From 18bc37c464e5d9d3b9335fc3135e20f97adc00d0 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 23 Feb 2024 11:17:26 +0000 Subject: [PATCH 006/105] Woops. Need this. 
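
The file added below is the orthogonal indexer lifted from zarr. Given a selection over an
array it yields (chunk_coords, chunk_selection, out_selection) triples, which later patches
in this series use to read only the chunks a selection actually touches. What follows is a
minimal illustrative sketch of that consumption pattern, not part of this commit: it assumes
an in-memory dict of numpy chunks in place of chunks read from the HDF5 b-tree, and the
_ArrayStub helper is hypothetical (the series later adds the equivalent ZarrArrayStub).

    # Illustrative sketch only: in-memory chunks standing in for b-tree chunk reads.
    import numpy as np
    from pyfive.indexing import OrthogonalIndexer

    class _ArrayStub:
        # the indexer only reads _shape and _chunks from the array it is given
        def __init__(self, shape, chunks):
            self._shape, self._chunks = list(shape), list(chunks)

    shape, chunks = (4, 6), (2, 3)
    data = np.arange(24).reshape(shape)
    # stand-in chunk store keyed by chunk coordinates, as if read from the b-tree
    store = {(i, j): data[2*i:2*i+2, 3*j:3*j+3] for i in range(2) for j in range(2)}

    indexer = OrthogonalIndexer(np.s_[2, :], _ArrayStub(shape, chunks))
    out = np.empty(indexer.shape, dtype=data.dtype)
    for chunk_coords, chunk_selection, out_selection in indexer:
        # fetch only the chunks the selection touches, copy the selected part across
        out[out_selection] = store[chunk_coords][chunk_selection]
    assert np.array_equal(out, data[2, :])

A later patch wires this same loop into ADataObjects.__getitem__, swapping the dict for
chunk addresses held in the dataset's b-tree.
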
--- pyfive/indexing.py | 1101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1101 insertions(+) create mode 100644 pyfive/indexing.py diff --git a/pyfive/indexing.py b/pyfive/indexing.py new file mode 100644 index 0000000..f1e5e5e --- /dev/null +++ b/pyfive/indexing.py @@ -0,0 +1,1101 @@ +# This file was lifted from zarr 23/02/24 +# lift and shift rather than import to keep installation dependencies down +# and because we are very likely to change the array format for the chunk index +# The code is MIT licensesd. + +import collections +import itertools +import math +import numbers + +import numpy as np + +# in the original zarr source the following error definitions are an import from zarr.errors + +class _BaseZarrIndexError(IndexError): + _msg = "" + + def __init__(self, *args): + super().__init__(self._msg.format(*args)) + +class ArrayIndexError(IndexError): + pass + +def err_too_many_indices(selection, shape): + raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") + +class VindexInvalidSelectionError(_BaseZarrIndexError): + _msg = ( + "unsupported selection type for vectorized indexing; only " + "coordinate selection (tuple of integer arrays) and mask selection " + "(single Boolean array) are supported; got {0!r}" + ) +class BoundsCheckError(_BaseZarrIndexError): + _msg = "index out of bounds for dimension with length {0}" + +class NegativeStepError(IndexError): + def __init__(self): + super().__init__("only slices with step >= 1 are supported") + +# And the rest of the code is the original file. + +def is_integer(x): + """True if x is an integer (both pure Python or NumPy). + + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) + + +def is_integer_list(x): + """True if x is a list of integers. + + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. + """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x, ndim=None): + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_bool_array(x, ndim=None): + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_scalar(value, dtype): + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + +def is_pure_fancy_indexing(selection, ndim): + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). + """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. 
+ no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) + + +def is_pure_orthogonal_indexing(selection, ndim): + if not ndim: + return False + + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, (int, slice)) + for elem in selection + ) + ) + + +def normalize_integer_selection(dim_sel, dim_len): + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +ChunkDimProjection = collections.namedtuple( + "ChunkDimProjection", ("dim_chunk_ix", "dim_chunk_sel", "dim_out_sel") +) +"""A mapping from chunk to output array for a single dimension. + +Parameters +---------- +dim_chunk_ix + Index of chunk. +dim_chunk_sel + Selection of items from chunk array. +dim_out_sel + Selection of items in target (output) array. + +""" + + +class IntDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + dim_sel = normalize_integer_selection(dim_sel, dim_len) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = 1 + + def __iter__(self): + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def ceildiv(a, b): + return math.ceil(a / b) + + +class SliceDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + self.start, self.stop, self.step = dim_sel.indices(dim_len) + if self.step < 1: + raise NegativeStepError() + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = max(0, ceildiv((self.stop - self.start), self.step)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + def __iter__(self): + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) + + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) + + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array + 
dim_out_offset = ceildiv((dim_offset - self.start), self.step) + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection, shape): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection, shape): + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return selection + + +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) + + +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v + + +ChunkProjection = collections.namedtuple( + "ChunkProjection", ("chunk_coords", "chunk_selection", "out_selection") +) +"""A mapping of items from chunk to output array. Can be used to extract items from the +chunk array for loading into an output array. Can also be used to extract items from a +value array for setting/updating in a chunk array. + +Parameters +---------- +chunk_coords + Indices of chunk. +chunk_selection + Selection of items from chunk array. +out_selection + Selection of items in target (output) array. 
+ +""" + + +def is_slice(s): + return isinstance(s, slice) + + +def is_contiguous_slice(s): + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s): + return is_slice(s) and (s.step is None or s.step >= 1) + + +def is_contiguous_selection(selection): + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection): + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +# noinspection PyProtectedMember +class BasicIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BoolArrayDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError( + "Boolean arrays in an orthogonal selection must " "be 1-dimensional only" + ) + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + f"Boolean array has the wrong length for dimension; " + f"expected {dim_len}, got { dim_sel.shape[0]}" + ) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype="i8") + for dim_chunk_ix in range(self.nchunks): + dim_offset = dim_chunk_ix * self.dim_chunk_len + self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( + self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + def __iter__(self): + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + 
yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order: + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a): + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + def __init__( + self, + dim_sel, + dim_len, + dim_chunk_len, + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError( + "integer arrays in an orthogonal selection must be " "1-dimensional only" + ) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + self.nitems = len(dim_sel) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = order + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + + # find chunks that we need to visit + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def __iter__(self): + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, l: int): # noqa: E741 + return range(*s.indices(l)) + + +def ix_(selection, shape): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" + + # normalisation + selection = replace_ellipsis(selection, shape) + + # replace slice and int as these are not supported by numpy.ix_ + selection = 
[ + ( + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] if is_integer(dim_sel) else dim_sel + ) + for dim_sel, dim_len in zip(selection, shape) + ] + + # now get numpy to convert to a coordinate selection + selection = np.ix_(*selection) + + return selection + + +def oindex(a, selection): + """Implementation of orthogonal indexing with slices and ints.""" + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result + + +def oindex_set(a, selection, value): + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) + value_selection = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value_selection = tuple(value_selection) + value = value[value_selection] + a[selection] = value + + +# noinspection PyProtectedMember +class OrthogonalIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_integer_array(dim_sel): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_bool_array(dim_sel): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for orthogonal indexing; " + f"expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.array = array + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.is_advanced = not is_basic_selection(selection) + if self.is_advanced: + self.drop_axes = tuple( + i + for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntDimIndexer) + ) + else: + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # N.B., numpy doesn't support orthogonal indexing directly as yet, + # so need to work around via np.ix_. Also np.ix_ does not support a + # mixture of arrays and slices or integers, so need to convert slices + # and integers into ranges. 
+ chunk_selection = ix_(chunk_selection, self.array._chunks) + + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class OIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_orthogonal_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_orthogonal_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +class BlockIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_size in zip(selection, array._shape, array._chunks): + dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + + if is_integer(dim_sel): + if dim_sel < 0: + dim_sel = dim_numchunks + dim_sel + + start = dim_sel * dim_chunk_size + stop = start + dim_chunk_size + slice_ = slice(start, stop) + + elif is_slice(dim_sel): + start = dim_sel.start if dim_sel.start is not None else 0 + stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks + + if dim_sel.step not in {1, None}: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" + ) + + # Can't reuse wraparound_indices because it expects a numpy array + # We have integers here. 
+ if start < 0: + start = dim_numchunks + start + if stop < 0: + stop = dim_numchunks + stop + + start = start * dim_chunk_size + stop = stop * dim_chunk_size + slice_ = slice(start, stop) + + else: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexers.append(dim_indexer) + + if start >= dim_len or start < 0: + raise BoundsCheckError(dim_len) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BlockIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_block_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_block_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +def is_coordinate_selection(selection, array): + return (len(selection) == len(array._shape)) and all( + is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection + ) + + +# noinspection PyProtectedMember +def is_mask_selection(selection, array): + return ( + len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == array._shape + ) + + +# noinspection PyProtectedMember +class CoordinateIndexer: + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = tuple([i] if is_integer(i) else i for i in selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, array): + raise IndexError( + f"invalid coordinate selection; expected one integer " + f"(coordinate) array per dimension of the target array, " + f"got {selection!r}" + ) + + # handle wraparound, boundscheck + for dim_sel, dim_len in zip(selection, array.shape): + # handle wraparound + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + boundscheck_indices(dim_sel, dim_len) + + # compute chunk index for each point in the selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + ) + + # broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + self.sel_shape = selection[0].shape if selection[0].shape else (1,) + + # flatten selection + selection = [dim_sel.reshape(-1) for dim_sel in selection] + chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + + # ravel chunk indices + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) + + # group points by chunk + if 
np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + else: + sel_sort = None + + # store attributes + self.selection = selection + self.sel_sort = sel_sort + self.shape = selection[0].shape if selection[0].shape else (1,) + self.drop_axes = None + self.array = array + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + # locate the chunks we need to process + self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] + + # unravel chunk indices + self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) + + def __iter__(self): + # iterate over chunks + for i, chunk_rix in enumerate(self.chunk_rixs): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +# noinspection PyProtectedMember +class MaskIndexer(CoordinateIndexer): + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_mask_selection(selection, array): + raise IndexError( + f"invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection!r}" + ) + + # convert to indices + selection = np.nonzero(selection[0]) + + # delegate the rest to superclass + super().__init__(selection, array) + + +class VIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection, fields=fields) + elif is_mask_selection(selection, self.array): + return self.array.get_mask_selection(selection, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + self.array.set_coordinate_selection(selection, value, fields=fields) + elif is_mask_selection(selection, self.array): + self.array.set_mask_selection(selection, value, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + +def check_fields(fields, dtype): + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, (str, list, tuple)): + raise IndexError( + f"'fields' argument must be a string or list of strings; found " f"{type(fields)!r}" + ) + if fields: + if dtype.names is None: + raise IndexError("invalid 'fields' argument, array does not have any fields") + try: + if isinstance(fields, str): + # single 
field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = np.dtype([(f, dtype[f]) for f in fields]) + except KeyError as e: + raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") + else: + return out_dtype + else: + return dtype + + +def check_no_multi_fields(fields): + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError("multiple fields are not supported for this operation") + return fields + + +def pop_fields(selection): + if isinstance(selection, str): + # single field selection + fields = selection + selection = () + elif not isinstance(selection, tuple): + # single selection item, no fields + fields = None + # leave selection as-is + else: + # multiple items, split fields from selection items + fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection = tuple(s for s in selection if not isinstance(s, str)) + selection = selection[0] if len(selection) == 1 else selection + return fields, selection + + +def make_slice_selection(selection): + ls = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) + else: + raise ArrayIndexError() + else: + ls.append(dim_selection) + return ls + + +class PartialChunkIterator: + """Iterator to retrieve the specific coordinates of requested data + from within a compressed chunk. + + Parameters + ---------- + selection : tuple + tuple of slice objects to take from the chunk + arr_shape : shape of chunk to select data from + + Attributes + ----------- + arr_shape + selection + + Returns + ------- + Tuple with 3 elements: + + start: int + elements offset in the chunk to read from + nitems: int + number of elements to read in the chunk from start + partial_out_selection: list of slices + indices of a temporary empty array of size `Array._chunks` to assign + the decompressed data to after the partial read. + + Notes + ----- + An array is flattened when compressed with blosc, so this iterator takes + the wanted selection of an array and determines the wanted coordinates + of the flattened, compressed data to be read and then decompressed. The + decompressed data is then placed in a temporary empty array of size + `Array._chunks` at the indices yielded as partial_out_selection. + Once all the slices yielded by this iterator have been read, decompressed + and written to the temporary array, the wanted slice of the chunk can be + indexed from the temporary array and written to the out_selection slice + of the out array. 
+ + """ + + def __init__(self, selection, arr_shape): + selection = make_slice_selection(selection) + self.arr_shape = arr_shape + + # number of selection dimensions can't be greater than the number of chunk dimensions + if len(selection) > len(self.arr_shape): + raise ValueError( + "Selection has more dimensions then the array:\n" + f"selection dimensions = {len(selection)}\n" + f"array dimensions = {len(self.arr_shape)}" + ) + + # any selection can not be out of the range of the chunk + selection_shape = np.empty(self.arr_shape)[tuple(selection)].shape + if any( + selection_dim < 0 or selection_dim > arr_dim + for selection_dim, arr_dim in zip(selection_shape, self.arr_shape) + ): + raise IndexError( + "a selection index is out of range for the dimension" + ) # pragma: no cover + + for i, dim_size in enumerate(self.arr_shape[::-1]): + index = len(self.arr_shape) - (i + 1) + if index <= len(selection) - 1: + slice_size = selection_shape[index] + if slice_size == dim_size and index > 0: + selection.pop() + else: + break + + chunk_loc_slices = [] + last_dim_slice = None if selection[-1].step > 1 else selection.pop() + for arr_shape_i, sl in zip(arr_shape, selection): + dim_chunk_loc_slices = [] + assert isinstance(sl, slice) + for x in slice_to_range(sl, arr_shape_i): + dim_chunk_loc_slices.append(slice(x, x + 1, 1)) + chunk_loc_slices.append(dim_chunk_loc_slices) + if last_dim_slice: + chunk_loc_slices.append([last_dim_slice]) + self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) + + def __iter__(self): + chunk1 = self.chunk_loc_slices[0] + nitems = (chunk1[-1].stop - chunk1[-1].start) * np.prod( + self.arr_shape[len(chunk1) :], dtype=int + ) + for partial_out_selection in self.chunk_loc_slices: + start = 0 + for i, sl in enumerate(partial_out_selection): + start += sl.start * np.prod(self.arr_shape[i + 1 :], dtype=int) + yield start, nitems, partial_out_selection From 4b0ac08545cad640a8a3204ee2ed6af50352048d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 17:33:53 +0000 Subject: [PATCH 007/105] First working lazy read (only reads chunks needed for selection) --- bnl/playing.py | 17 ++++++----------- bnl/playings3.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 11 deletions(-) create mode 100644 bnl/playings3.py diff --git a/bnl/playing.py b/bnl/playing.py index fd1f321..091e18f 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -4,6 +4,7 @@ import s3fs import h5py import time +import numpy as np MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' #MYFILE = '../tests/chunked.hdf5' @@ -13,7 +14,8 @@ ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), ('../tests/chunked.hdf5','dataset1','local'), - ('CMIP6-test.nc','tas', 's3') + ('CMIP6-test.nc','tas', 's3'), + ('CMIP6-test.nc','tas', 'local') ] MYPATH = Path(__file__).parent @@ -55,7 +57,6 @@ def working(f2, path, printindex=True): elif location == 'local': - f2 = pyfive.File(MYPATH/MYFILE) x = f2[path] y = x[2,:] @@ -64,15 +65,9 @@ def working(f2, path, printindex=True): t1 = time.time() t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) - r = d[2:] - if len(r) >= len(y): - print(f"yeah, well, it's not working (returning {len(r)} items instead of {len(y)})") - # as it's stands, r should be a set of indices for chunks containing y, which should have - # length less than or equal to length (y). 
At the moment it's too long, so that's clearly - # broken - print(r) - raise ValueError('Busted') - raise ValueError('Busted, but in a better way') + r = d[2,:] + assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' + print(r) else: diff --git a/bnl/playings3.py b/bnl/playings3.py new file mode 100644 index 0000000..67b2f65 --- /dev/null +++ b/bnl/playings3.py @@ -0,0 +1,21 @@ +import h5py +import pyfive +import s3fs + +MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' + + +uri = 'bnl/'+MYFILE +fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + + +with fs.open(uri,'rb') as s3file: + f = h5py.File(s3file,'r') + tos =f['tos'] + print(tos) +with fs.open(uri,'rb') as s3file2: + f2 = pyfive.File(s3file2) + tos2 = f2['tos'] + print(tos2) + From 5356aa0163d53c0140fd95807e8e3ab36c0995cb Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 17:37:14 +0000 Subject: [PATCH 008/105] Woops didnt' commit the real oil --- bnl/playings3.py | 21 ------------- pyfive/as_dataobjects.py | 65 +++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 42 deletions(-) delete mode 100644 bnl/playings3.py diff --git a/bnl/playings3.py b/bnl/playings3.py deleted file mode 100644 index 67b2f65..0000000 --- a/bnl/playings3.py +++ /dev/null @@ -1,21 +0,0 @@ -import h5py -import pyfive -import s3fs - -MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' - - -uri = 'bnl/'+MYFILE -fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) - - -with fs.open(uri,'rb') as s3file: - f = h5py.File(s3file,'r') - tos =f['tos'] - print(tos) -with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - tos2 = f2['tos'] - print(tos2) - diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index ab5e3da..278a912 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -5,15 +5,14 @@ from .indexing import OrthogonalIndexer -class ZarrSubstitute: +class ZarrArrayStub: """ This mimics the funcationality of the zarr array produced by kerchunk, but with only what is needed for indexing """ - def __init__(self, chunk_index, shape, chunks): - self.chunk_index = chunk_index - self._chunks = chunks - self._shape = shape + def __init__(self, shape, chunks): + self._chunks = list(chunks) + self._shape = list(shape) class ADataObjects(DataObjects): @@ -26,8 +25,11 @@ def __init__(self,*args,**kwargs): """ super().__init__(*args,**kwargs) - # not yet sure we need our own copy - self._as_chunk_index=[] + # Need our own copy for now to utilise the zarr indexer. + # An optimisation could be to modify what is returned from OrthogonalIndexer + self._zchunk_index={} + + self.order='C' def get_offset_addresses(self): """ @@ -53,7 +55,7 @@ def _as_get_chunk_addresses(self): Get the offset addresses associated with all the chunks known to the b-tree of this object """ - if self._as_chunk_index == []: + if self._zchunk_index == {}: self._get_chunk_params() @@ -63,6 +65,12 @@ def _as_get_chunk_addresses(self): count = np.prod(self.shape) itemsize = np.dtype(self.dtype).itemsize chunk_buffer_size = count * itemsize + + # The zarr orthogonal indexer returns the position in chunk + # space, whereas pyfive wants the position in array space. + # Here we index the pyfive chunk_index in zarr index space. 
+ + ichunks = [1/c for c in self.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): @@ -70,27 +78,42 @@ def _as_get_chunk_addresses(self): if self.filter_pipeline: size = node_key['chunk_size'] start = node_key['chunk_offset'][:-1] - region = [slice(i, i+j) for i, j in zip(start, self.shape)] - self._as_chunk_index.append([region, start, addr, size]) + key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) + self._zchunk_index[key] = (addr,size) def __getitem__(self, args): - if self._as_chunk_index == []: - self._as_get_chunk_addresses + if self._zchunk_index == {}: + self._as_get_chunk_addresses() + print("Loaded addresses for ", len(self._zchunk_index),' chunks') - array = ZarrSubstitute(self._as_chunk_index, self.shape, self.chunks) + array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) stripped_indexer = [(a, b, c) for a,b,c in indexer] - print(stripped_indexer) - mycoords = [] - for chunk_coords, chunk_selection, out_selection in stripped_indexer: - coord = '.'.join([str(c) for c in chunk_coords]) - mycoords.append((chunk_coords,coord)) - print("This isn't yet doing what you think it is, it's only returning chunk indices for your selection") - return mycoords - + filter_pipeline=None #FIXME, needs to be an argument or grabbed from somewhere + count = np.prod(self.chunks) + itemsize = np.dtype(self.dtype).itemsize + default_chunk_buffer_size = itemsize*count + + out_shape = indexer.shape + out = np.empty(out_shape, dtype=self.dtype, order=self.order) + + for chunk_coords, chunk_selection, out_selection in stripped_indexer: + addr, chunk_buffer_size = self._zchunk_index[chunk_coords] + self.fh.seek(addr) + if filter_pipeline is None: + chunk_buffer = self.fh.read(default_chunk_buffer_size) + else: + raise NotImplementedError + # The plan here would be to take the _filter_chunk method from BTree1RawDataChunks + # pop it out on it's own and make it a class method here as well as wherever else it needs to be + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) + out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] + + return out + From 9fe23943c1c77260df7ebf55ce9843e49c73b042 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 24 Feb 2024 18:54:06 +0000 Subject: [PATCH 009/105] Should now support filtering chunks in the partical chunk loading. Haven't got any tests around this yet. 
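
Filtering a chunk here means undoing the HDF5 filter pipeline on the raw bytes read for that
single chunk before reshaping them. Below is a rough sketch of the common deflate (gzip)
case, using only zlib and numpy and assuming no shuffle or checksum filters are in play
(pyfive's _filter_chunk handles those as well); the decode_deflate_chunk helper is
hypothetical, not pyfive API.

    # Illustrative sketch only: decode one deflate-compressed chunk buffer.
    import zlib
    import numpy as np

    def decode_deflate_chunk(raw_bytes, dtype, chunk_shape, order='C'):
        # reverse the deflate (gzip) filter, then view the bytes as one chunk of the array
        buf = zlib.decompress(raw_bytes)
        return np.frombuffer(buf, dtype=dtype).reshape(chunk_shape, order=order)

The get_one_chunk_buffer method added below does the general version of this: seek to the
chunk's address, read size bytes, and pass them through whatever filter pipeline the dataset
declares.
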
--- pyfive/as_dataobjects.py | 32 ++++++++++++-------------------- pyfive/btree.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 278a912..1f1515b 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -59,7 +59,7 @@ def _as_get_chunk_addresses(self): self._get_chunk_params() - chunk_btree = BTreeV1RawDataChunks( + self.chunk_btree = BTreeV1RawDataChunks( self.fh, self._chunk_address, self._chunk_dims) count = np.prod(self.shape) @@ -72,43 +72,35 @@ def _as_get_chunk_addresses(self): ichunks = [1/c for c in self.chunks] - for node in chunk_btree.all_nodes[0]: + for node in self.chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): - size = chunk_buffer_size - if self.filter_pipeline: - size = node_key['chunk_size'] + size = node_key['chunk_size'] + if self._filter_pipeline: + filter_mask = node_key['filter_mask'] + else: + filter_mask=None start = node_key['chunk_offset'][:-1] key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) - self._zchunk_index[key] = (addr,size) + self._zchunk_index[key] = (addr,size,filter_mask) def __getitem__(self, args): if self._zchunk_index == {}: self._as_get_chunk_addresses() - print("Loaded addresses for ", len(self._zchunk_index),' chunks') array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) stripped_indexer = [(a, b, c) for a,b,c in indexer] - filter_pipeline=None #FIXME, needs to be an argument or grabbed from somewhere - count = np.prod(self.chunks) - itemsize = np.dtype(self.dtype).itemsize - default_chunk_buffer_size = itemsize*count - + itemsize = np.dtype(self.dtype).itemsize out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self.order) for chunk_coords, chunk_selection, out_selection in stripped_indexer: - addr, chunk_buffer_size = self._zchunk_index[chunk_coords] - self.fh.seek(addr) - if filter_pipeline is None: - chunk_buffer = self.fh.read(default_chunk_buffer_size) - else: - raise NotImplementedError - # The plan here would be to take the _filter_chunk method from BTree1RawDataChunks - # pop it out on it's own and make it a class method here as well as wherever else it needs to be + addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] + chunk_buffer = self.chunk_btree.get_one_chunk_buffer( + addr, chunk_buffer_size, itemsize,self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] diff --git a/pyfive/btree.py b/pyfive/btree.py index 38518fc..d4ee3cd 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -199,6 +199,20 @@ def construct_data_from_chunks( non_padded_region = tuple([slice(i) for i in data_shape]) return data[non_padded_region] + + def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mask): + """ + Used when getting data chunk by chunk for reading partial data arrays + All the shaping and positioning is done in the calling function. 
+ """ + self.fh.seek(addr) + chunk_buffer = self.fh.read(size) + if filter_pipeline is not None: + chunk_buffer = self.fh.read(size) + filter_mask = filter_mask + chunk_buffer = self._filter_chunk( + chunk_buffer, filter_mask, filter_pipeline, itemsize) + return chunk_buffer @classmethod def _filter_chunk(cls, chunk_buffer, filter_mask, filter_pipeline, itemsize): From dafb3c9bf25956c9f34abce919c289cddd610198 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 25 Feb 2024 09:05:32 +0000 Subject: [PATCH 010/105] Some additional documentation --- pyfive/as_dataobjects.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 1f1515b..66d8caf 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -8,7 +8,7 @@ class ZarrArrayStub: """ This mimics the funcationality of the zarr array produced by kerchunk, - but with only what is needed for indexing + but with only what is needed for indexing. """ def __init__(self, shape, chunks): self._chunks = list(chunks) @@ -17,7 +17,7 @@ def __init__(self, shape, chunks): class ADataObjects(DataObjects): """ - Subclass of DataObjets which access the chunk addresses for a given slice of data + Subclass of DataObjets which accesses the chunk addresses for a given slice of data """ def __init__(self,*args,**kwargs): """ @@ -26,9 +26,7 @@ def __init__(self,*args,**kwargs): super().__init__(*args,**kwargs) # Need our own copy for now to utilise the zarr indexer. - # An optimisation could be to modify what is returned from OrthogonalIndexer self._zchunk_index={} - self.order='C' def get_offset_addresses(self): @@ -47,13 +45,15 @@ def get_offset_addresses(self): elif layout_class == 1: # contiguous storage return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage - return self._as_get_chunk_addresses() - + self._as_get_chunk_addresses() + return self._zchunk_index + def _as_get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks - known to the b-tree of this object + known to the b-tree of this object, and load them into + an index suitable for use with the zarr indexer. """ if self._zchunk_index == {}: @@ -64,7 +64,6 @@ def _as_get_chunk_addresses(self): count = np.prod(self.shape) itemsize = np.dtype(self.dtype).itemsize - chunk_buffer_size = count * itemsize # The zarr orthogonal indexer returns the position in chunk # space, whereas pyfive wants the position in array space. @@ -76,6 +75,7 @@ def _as_get_chunk_addresses(self): for node_key, addr in zip(node['keys'], node['addresses']): size = node_key['chunk_size'] if self._filter_pipeline: + # I am not sure this varies per chunk, but in case it does filter_mask = node_key['filter_mask'] else: filter_mask=None @@ -84,6 +84,10 @@ def _as_get_chunk_addresses(self): self._zchunk_index[key] = (addr,size,filter_mask) def __getitem__(self, args): + """ + Use the zarr orthongal indexer to extract data for a specfic selection within + the dataset array and in doing so, only load the relevant chunks. + """ if self._zchunk_index == {}: self._as_get_chunk_addresses() @@ -91,16 +95,17 @@ def __getitem__(self, args): array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) - stripped_indexer = [(a, b, c) for a,b,c in indexer] + # FIXME: Need to understand what drop_axes was up to and whether or not + # it is relevant to this or not (I didn't understand it in the zarr implementation). 
itemsize = np.dtype(self.dtype).itemsize out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self.order) - for chunk_coords, chunk_selection, out_selection in stripped_indexer: + for chunk_coords, chunk_selection, out_selection in indexer: addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize,self._filter_pipeline, filter_mask) + addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] From 53e4ebe1715d6ec6f34b0a62ad32752c43f5a6f6 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 29 Feb 2024 10:00:01 +0000 Subject: [PATCH 011/105] Seems to work, prior to re-integration --- bnl/playing.py | 5 ++--- pyfive/as_dataobjects.py | 34 ++++++++++++++-------------------- pyfive/btree.py | 4 ++-- pyfive/indexing.py | 11 +++++++++++ 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/bnl/playing.py b/bnl/playing.py index 091e18f..aeff19d 100644 --- a/bnl/playing.py +++ b/bnl/playing.py @@ -30,7 +30,7 @@ def working(f2, path, printindex=True): link_target = f2._links[path] t1 = time.time() dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref.get_offset_addresses() + chunk_index = dsref._get_chunk_addresses() t2 = time.time() print(f'Chunk index timer {t2-t1:.2}s') if printindex: @@ -66,10 +66,9 @@ def working(f2, path, printindex=True): t2 = working(f2, path, printindex=False) d = ADataObjects(f2.file._fh, f2._links[path]) r = d[2,:] + print(r) assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' print(r) - - else: raise ValueError('You stuffed up') diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py index 66d8caf..7e32398 100644 --- a/pyfive/as_dataobjects.py +++ b/pyfive/as_dataobjects.py @@ -2,18 +2,7 @@ from .datatype_msg import DatatypeMessage import numpy as np from .btree import BTreeV1RawDataChunks -from .indexing import OrthogonalIndexer - - -class ZarrArrayStub: - """ - This mimics the funcationality of the zarr array produced by kerchunk, - but with only what is needed for indexing. 
- """ - def __init__(self, shape, chunks): - self._chunks = list(chunks) - self._shape = list(shape) - +from .indexing import OrthogonalIndexer, ZarrArrayStub class ADataObjects(DataObjects): """ @@ -29,7 +18,7 @@ def __init__(self,*args,**kwargs): self._zchunk_index={} self.order='C' - def get_offset_addresses(self): + def _get_offset_addresses(self): """ Get the offset addresses for the data requested """ @@ -46,10 +35,18 @@ def get_offset_addresses(self): return NotImplementedError("Contiguous storage") if layout_class == 2: # chunked storage self._as_get_chunk_addresses() - return self._zchunk_index + + def get_chunk_details(self, chunk_coords): + """ + Returns the chunk details associated with chunk coords + returned by the Zarr orthogonal indexer + """ + if self._zchunk_index == {}: + self._get_chunk_addresses() + return self._zchunk_index[chunk_coords] - def _as_get_chunk_addresses(self): + def _get_chunk_addresses(self): """ Get the offset addresses associated with all the chunks known to the b-tree of this object, and load them into @@ -85,13 +82,10 @@ def _as_get_chunk_addresses(self): def __getitem__(self, args): """ - Use the zarr orthongal indexer to extract data for a specfic selection within + Use the zarr orthogonal indexer to extract data for a specfic selection within the dataset array and in doing so, only load the relevant chunks. """ - if self._zchunk_index == {}: - self._as_get_chunk_addresses() - array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) @@ -103,7 +97,7 @@ def __getitem__(self, args): out = np.empty(out_shape, dtype=self.dtype, order=self.order) for chunk_coords, chunk_selection, out_selection in indexer: - addr, chunk_buffer_size, filter_mask = self._zchunk_index[chunk_coords] + addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) chunk_buffer = self.chunk_btree.get_one_chunk_buffer( addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) diff --git a/pyfive/btree.py b/pyfive/btree.py index d4ee3cd..fedbc64 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -208,8 +208,8 @@ def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mas self.fh.seek(addr) chunk_buffer = self.fh.read(size) if filter_pipeline is not None: - chunk_buffer = self.fh.read(size) - filter_mask = filter_mask + #chunk_buffer = self.fh.read(size) + #filter_mask = filter_mask chunk_buffer = self._filter_chunk( chunk_buffer, filter_mask, filter_pipeline, itemsize) return chunk_buffer diff --git a/pyfive/indexing.py b/pyfive/indexing.py index f1e5e5e..e43ab9f 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -10,6 +10,17 @@ import numpy as np + +class ZarrArrayStub: + """ + This mimics the functionality of the zarr array produced by kerchunk, + but with only what is needed for indexing. This is the only "new" code + on top of the original Zarr source. 
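+    The orthogonal indexer only touches the _shape and _chunks attributes of the array object it is handed, so that is all the stub needs to carry.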
+ """ + def __init__(self, shape, chunks): + self._chunks = list(chunks) + self._shape = list(shape) + # in the original zarr source the following error definitions are an import from zarr.errors class _BaseZarrIndexError(IndexError): From 9ac0bbdc4cf8c783b17e304ba19bac846bd82e93 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 1 Mar 2024 20:31:19 +0000 Subject: [PATCH 012/105] Moved chunk support into standard API --- pyfive/as_dataobjects.py | 110 ----------------- pyfive/dataobjects.py | 252 ++++++++++++++++++++++++++++----------- pyfive/high_level.py | 4 +- 3 files changed, 185 insertions(+), 181 deletions(-) delete mode 100644 pyfive/as_dataobjects.py diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py deleted file mode 100644 index 7e32398..0000000 --- a/pyfive/as_dataobjects.py +++ /dev/null @@ -1,110 +0,0 @@ -from .dataobjects import DataObjects, DATA_STORAGE_MSG_TYPE -from .datatype_msg import DatatypeMessage -import numpy as np -from .btree import BTreeV1RawDataChunks -from .indexing import OrthogonalIndexer, ZarrArrayStub - -class ADataObjects(DataObjects): - """ - Subclass of DataObjets which accesses the chunk addresses for a given slice of data - """ - def __init__(self,*args,**kwargs): - """ - Initialise via super class - """ - super().__init__(*args,**kwargs) - - # Need our own copy for now to utilise the zarr indexer. - self._zchunk_index={} - self.order='C' - - def _get_offset_addresses(self): - """ - Get the offset addresses for the data requested - """ - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - msg_offset = msg['offset_to_message'] - version, dims, layout_class, property_offset = ( - self._get_data_message_properties(msg_offset)) - - if layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif layout_class == 1: # contiguous storage - return NotImplementedError("Contiguous storage") - if layout_class == 2: # chunked storage - self._as_get_chunk_addresses() - - def get_chunk_details(self, chunk_coords): - """ - Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer - """ - if self._zchunk_index == {}: - self._get_chunk_addresses() - - return self._zchunk_index[chunk_coords] - - def _get_chunk_addresses(self): - """ - Get the offset addresses associated with all the chunks - known to the b-tree of this object, and load them into - an index suitable for use with the zarr indexer. - """ - if self._zchunk_index == {}: - - self._get_chunk_params() - - self.chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - - count = np.prod(self.shape) - itemsize = np.dtype(self.dtype).itemsize - - # The zarr orthogonal indexer returns the position in chunk - # space, whereas pyfive wants the position in array space. - # Here we index the pyfive chunk_index in zarr index space. 
- - ichunks = [1/c for c in self.chunks] - - for node in self.chunk_btree.all_nodes[0]: - for node_key, addr in zip(node['keys'], node['addresses']): - size = node_key['chunk_size'] - if self._filter_pipeline: - # I am not sure this varies per chunk, but in case it does - filter_mask = node_key['filter_mask'] - else: - filter_mask=None - start = node_key['chunk_offset'][:-1] - key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) - self._zchunk_index[key] = (addr,size,filter_mask) - - def __getitem__(self, args): - """ - Use the zarr orthogonal indexer to extract data for a specfic selection within - the dataset array and in doing so, only load the relevant chunks. - """ - - array = ZarrArrayStub(self.shape, self.chunks) - - indexer = OrthogonalIndexer(args, array) - # FIXME: Need to understand what drop_axes was up to and whether or not - # it is relevant to this or not (I didn't understand it in the zarr implementation). - - itemsize = np.dtype(self.dtype).itemsize - out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.order) - - for chunk_coords, chunk_selection, out_selection in indexer: - addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) - chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] - - return out - - - - diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 886b48f..a2fe60b 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -18,6 +18,7 @@ from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from .indexing import OrthogonalIndexer, ZarrArrayStub class DataObjects(object): @@ -25,7 +26,7 @@ class DataObjects(object): HDF5 DataObjects. """ - def __init__(self, fh, offset): + def __init__(self, fh, offset, order='C'): """ initalize. """ fh.seek(offset) version_hint = struct.unpack_from('= 1) and (version <= 4) - return version, dims, layout_class, property_offset - - def _get_contiguous_data(self, property_offset): - data_offset, = struct.unpack_from(' 0 + + +class DatasetDataObject(DataObjects): + """ + Subclass of DataObjects associated with one Dataset, and + which handles actual data access. + """ + def __init__(self,*args,**kwargs): + """ + Initialise via super class + """ + super().__init__(*args,**kwargs) + + # Need our own copy for now to utilise the zarr indexer. + self._zchunk_index={} + self.order='C' + # offset and size from data storage message + msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] + self.msg_offset = msg['offset_to_message'] + version, dims, self.layout_class, self.property_offset = ( + self._get_data_message_properties(self.msg_offset)) + + def get_data(self, args=None): + """ + Return the data pointed to in the DataObject. 
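+        For chunked storage, a selection passed in args means only the chunks which intersect that selection are read.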
+ """ + + if self.layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif self.layout_class == 1: # contiguous storage + if args is None: + return self._get_contiguous_data(self.property_offset) + else: + return self._get_contiguous_data(self.property_offset)[args] + if self.layout_class == 2: # chunked storage + if args is None: + return self._get_chunked_data(self.msg_offset) + else: + return self._get_selection_via_chunks(args) + + def _get_data_message_properties(self, msg_offset): + """ Return the message properties of the DataObject. """ + dims, layout_class, property_offset = None, None, None + version, arg1, arg2 = struct.unpack_from( + '= 1) and (version <= 4) + return version, dims, layout_class, property_offset + + def _get_contiguous_data(self, property_offset): + data_offset, = struct.unpack_from(' Date: Fri, 1 Mar 2024 21:30:57 +0000 Subject: [PATCH 013/105] removing playing code --- bnl/playing.py | 74 -------------------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 bnl/playing.py diff --git a/bnl/playing.py b/bnl/playing.py deleted file mode 100644 index aeff19d..0000000 --- a/bnl/playing.py +++ /dev/null @@ -1,74 +0,0 @@ -import pyfive -from pathlib import Path -from pyfive.as_dataobjects import ADataObjects -import s3fs -import h5py -import time -import numpy as np - -MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -#MYFILE = '../tests/chunked.hdf5' -MYFILE = 'da193a_25_day__198807-198807.nc' - -INPUT_OPTIONS = [ - ('da193o_25_day__grid_T_198807-198807.nc','tos','s3'), - ('da193a_25_day__198807-198807.nc','m01s06i247_4','s3'), - ('../tests/chunked.hdf5','dataset1','local'), - ('CMIP6-test.nc','tas', 's3'), - ('CMIP6-test.nc','tas', 'local') -] - -MYPATH = Path(__file__).parent - -option = 2 -location = INPUT_OPTIONS[option][2] -MYFILE = INPUT_OPTIONS[option][0] -path = INPUT_OPTIONS[option][1] - - -def working(f2, path, printindex=True): - link_target = f2._links[path] - t1 = time.time() - dsref = ADataObjects(f2.file._fh, link_target) - chunk_index = dsref._get_chunk_addresses() - t2 = time.time() - print(f'Chunk index timer {t2-t1:.2}s') - if printindex: - for e in chunk_index: - print(e) - - return t2 - -if location == 's3': - - S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' - fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) - uri = 'bnl/'+MYFILE - - t1 = time.time() - with fs.open(uri,'rb') as s3file2: - f2 = pyfive.File(s3file2) - t2 = working(f2, path) - print(f'Complete chunking timer {t2-t1:.2}s') - - with fs.open(uri,'rb') as s3file2: - f3 = h5py.File(s3file2,'r') - print(f3[path]) - -elif location == 'local': - - f2 = pyfive.File(MYPATH/MYFILE) - x = f2[path] - y = x[2,:] - print(x.shape) - print(y) - t1 = time.time() - t2 = working(f2, path, printindex=False) - d = ADataObjects(f2.file._fh, f2._links[path]) - r = d[2,:] - print(r) - assert np.array_equal(r,y),f'original {y} does not equal {r} (minimal chunk read)' - print(r) -else: - raise ValueError('You stuffed up') - From 96dc178e989c8eedc20b8268e022bdb9140556ca Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 3 Mar 2024 09:29:15 +0000 Subject: [PATCH 014/105] Fixes bug which stops the selection read from actually occurring and changes to actually using the filter pipeline. At this point is failling test_reference. 
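For reference, the sort of read this is meant to service is just a normal sliced access, along these lines (a sketch only, using the chunked, compressed test file and variable name from the test suite):

    import pyfive

    with pyfive.File('tests/compressed.hdf5') as f:
        dset = f['dataset3']
        # the slice is now passed straight through to get_data(args), so only
        # the chunks which intersect the selection are read, and whatever
        # filter pipeline the variable carries is applied chunk by chunk
        subset = dset[2:4, :]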
--- pyfive/__init__.py | 1 + pyfive/dataobjects.py | 4 ++-- pyfive/high_level.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 9df950e..9fa6180 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -3,5 +3,6 @@ """ from .high_level import File +from .indexing import ZarrArrayStub, OrthogonalIndexer __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 0730e12..90be3c4 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -741,7 +741,7 @@ def _get_chunk_addresses(self): for node in self.chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): size = node_key['chunk_size'] - if self._filter_pipeline: + if self.filter_pipeline: # I am not sure this varies per chunk, but in case it does filter_mask = node_key['filter_mask'] else: @@ -768,7 +768,7 @@ def _get_selection_via_chunks(self, args): for chunk_coords, chunk_selection, out_selection in indexer: addr, chunk_buffer_size, filter_mask = self.get_chunk_details(chunk_coords) chunk_buffer = self.chunk_btree.get_one_chunk_buffer( - addr, chunk_buffer_size, itemsize, self._filter_pipeline, filter_mask) + addr, chunk_buffer_size, itemsize, self.filter_pipeline, filter_mask) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 6614f3a..c033217 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -276,7 +276,7 @@ def __repr__(self): return '' % info def __getitem__(self, args): - data = self._dataobjects.get_data()[args] + data = self._dataobjects.get_data(args) if self._astype is None: return data return data.astype(self._astype) From eb44c15f3bd8fc0c7defc3ea91d80037d7136153 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 3 Mar 2024 09:45:00 +0000 Subject: [PATCH 015/105] Hack to avoid reference datatypes in chunk by chunk selections. --- pyfive/dataobjects.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 90be3c4..e427d08 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -629,8 +629,12 @@ def get_data(self, args=None): else: return self._get_contiguous_data(self.property_offset)[args] if self.layout_class == 2: # chunked storage + # If reading all chunks, use the (hopefully faster) "do it one go" method. 
+ # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if args is None: return self._get_chunked_data(self.msg_offset) + elif isinstance(self.dtype, tuple): + return self._get_chunked_data(self.msg_offset)[args] else: return self._get_selection_via_chunks(args) From 51f7cca261e03aa3dabae93350c6a3589a6c8847 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 07:31:31 +0000 Subject: [PATCH 016/105] Remove obsolete function --- pyfive/dataobjects.py | 19 ------------------- pyfive/high_level.py | 1 + 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index e427d08..7b0c1a8 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -690,25 +690,6 @@ def _get_chunked_data(self, offset): self.fh, self._chunk_address, self._chunk_dims) return chunk_btree.construct_data_from_chunks( self.chunks, self.shape, self.dtype, self.filter_pipeline) - - - def _get_offset_addresses(self): - """ - Get the offset addresses for the data requested - """ - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - msg_offset = msg['offset_to_message'] - version, dims, layout_class, property_offset = ( - self._get_data_message_properties(msg_offset)) - - if layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif layout_class == 1: # contiguous storage - return NotImplementedError("Contiguous storage") - if layout_class == 2: # chunked storage - self._as_get_chunk_addresses() def get_chunk_details(self, chunk_coords): """ diff --git a/pyfive/high_level.py b/pyfive/high_level.py index c033217..8bf0544 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -323,6 +323,7 @@ def ndim(self): @property def dtype(self): """ dtype attribute. """ + # In the HDF5 implementation this is a numpy dtype return self._dataobjects.dtype @property From 1f61d6c1e03ec21ba9bc2991d17473f311d141a1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 5 Mar 2024 08:05:38 +0000 Subject: [PATCH 017/105] Support for third party access to contiguous data address and size. Also remove list definition which breaks references. --- pyfive/dataobjects.py | 21 +++++++++++++++++---- pyfive/indexing.py | 7 +++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 7b0c1a8..100210c 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -694,12 +694,25 @@ def _get_chunked_data(self, offset): def get_chunk_details(self, chunk_coords): """ Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer + returned by the Zarr orthogonal indexer. The special case + is that if the data is contiguous, we still want to return + the offset and size, as the point of this entry point is + to provide third party applications an address to the data. """ - if self._zchunk_index == {}: - self._get_chunk_addresses() + if self.layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif self.layout_class == 1: # contiguous storage + # This option never used by pyfive itself as we use the memory map for + # access to contiguous data, but third parties may need it. + # Ignore coordinates, just give the location and size of entire array + data_offset, = struct.unpack_from(' Date: Thu, 7 Mar 2024 17:35:41 +0000 Subject: [PATCH 018/105] First cut, fails references and classic, even with new stuff turned off? 
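The new controls are plain attributes on DatasetDataObject, so they can be flipped from outside for testing, roughly like this (a sketch; the file and variable names are purely illustrative, and access goes via the private _dataobjects attribute):

    import pyfive

    f = pyfive.File('somefile_with_contiguous_data.nc')   # illustrative name
    dset = f['a_contiguous_variable']                      # illustrative name
    dset._dataobjects.avoid_mmap = True            # never attempt np.memmap
    dset._dataobjects.pseudo_chunking = True       # read contiguous data in blocks
    dset._dataobjects.pseudo_block_size_kib = 64   # block size for those reads
    data = dset[:]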
--- pyfive/dataobjects.py | 105 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 9 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 100210c..d221698 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -5,6 +5,7 @@ from collections import OrderedDict import struct import warnings +from io import UnsupportedOperation import numpy as np @@ -610,6 +611,18 @@ def __init__(self,*args,**kwargs): self._zchunk_index={} self.order='C' + ########################################################################## + # pseudo chunking control. + #these can be changed from outside for testing purposes + # pseudo chunk blocksize: this is a size below which we don't bother + # pseudo chunking for contiguous data and just load the lot at data + # access time: units are kibibytes + self.pseudo_chunking = False + self.pseudo_block_size_kib = 0 + # We can't use mmaps on S3 + self.avoid_mmap = False + ########################################################################## + # offset and size from data storage message msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] self.msg_offset = msg['offset_to_message'] @@ -624,10 +637,9 @@ def get_data(self, args=None): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - if args is None: - return self._get_contiguous_data(self.property_offset) - else: - return self._get_contiguous_data(self.property_offset)[args] + if self.avoid_mmap: + return self._get_selection_from_contiguous(args) + return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage # If reading all chunks, use the (hopefully faster) "do it one go" method. # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version @@ -659,17 +671,20 @@ def _get_data_message_properties(self, msg_offset): assert (version >= 1) and (version <= 4) return version, dims, layout_class, property_offset - def _get_contiguous_data(self, property_offset): + def _get_contiguous_data(self, property_offset, args): data_offset, = struct.unpack_from(' Date: Thu, 7 Mar 2024 18:07:52 +0000 Subject: [PATCH 019/105] This version appears to now support failing over from a memory map to a pseudo chunked read. Lots of things to do around optimising that read, but let's test this more widely first. 
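The case this is really aimed at is S3, where the file object cannot be memory mapped at all. Essentially the recipe from the (now removed) bnl/playing.py, with the same test endpoint and object names:

    import pyfive
    import s3fs

    S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/'
    fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL})
    with fs.open('bnl/da193a_25_day__198807-198807.nc', 'rb') as s3file:
        f = pyfive.File(s3file)
        v = f['m01s06i247_4']
        print(v.shape)
        # no memmap is possible on an s3fs file object, so contiguously stored
        # variables now fail over to the seek/read path instead of raising
        data = v[:]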
--- pyfive/dataobjects.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index d221698..dcdef0a 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -617,10 +617,10 @@ def __init__(self,*args,**kwargs): # pseudo chunk blocksize: this is a size below which we don't bother # pseudo chunking for contiguous data and just load the lot at data # access time: units are kibibytes - self.pseudo_chunking = False + self.pseudo_chunking = True self.pseudo_block_size_kib = 0 # We can't use mmaps on S3 - self.avoid_mmap = False + self.avoid_mmap = True ########################################################################## # offset and size from data storage message @@ -637,8 +637,6 @@ def get_data(self, args=None): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - if self.avoid_mmap: - return self._get_selection_from_contiguous(args) return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage # If reading all chunks, use the (hopefully faster) "do it one go" method. @@ -676,15 +674,18 @@ def _get_contiguous_data(self, property_offset, args): if data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - result = np.zeros(self.shape, dtype=self.dtype) + return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - try: - # return a memory-map to the stored array with copy-on-write - result = np.memmap(self.fh, dtype=self.dtype, mode='c', - offset=data_offset, shape=self.shape, order=self.order) - except UnsupportedOperation: + if self.avoid_mmap: return self._get_selection_from_contiguous(args) + else: + try: + # return a memory-map to the stored array with copy-on-write + return np.memmap(self.fh, dtype=self.dtype, mode='c', + offset=data_offset, shape=self.shape, order=self.order)[args] + except UnsupportedOperation: + return self._get_selection_from_contiguous(args) else: dtype_class = self.dtype[0] if dtype_class == 'REFERENCE': @@ -694,13 +695,9 @@ def _get_contiguous_data(self, property_offset, args): ref_addresses = np.memmap( self.fh, dtype=(' Date: Sun, 21 Apr 2024 17:00:00 +0100 Subject: [PATCH 020/105] First cut, no tests yet --- pyfive/dataobjects.py | 125 +++++++++-------------------------- pyfive/h5d.py | 148 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 94 deletions(-) create mode 100644 pyfive/h5d.py diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index dcdef0a..7031d40 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,8 @@ from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from .indexing import OrthogonalIndexer, ZarrArrayStub +from .h5d import H5Dataset + # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -598,27 +599,27 @@ def is_dataset(self): class DatasetDataObject(DataObjects): """ - Subclass of DataObjects associated with one Dataset, and - which handles actual data access. + Subclass of DataObjects associated with one Dataset, + handles actual data access. 
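+    Chunk level access is delegated to an H5Dataset instance, exposed via the id property.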
""" def __init__(self,*args,**kwargs): """ Initialise via super class """ super().__init__(*args,**kwargs) + self._id = None - # Need our own copy for now to utilise the zarr indexer. - self._zchunk_index={} + # make this explicit, but controllable self.order='C' ########################################################################## # pseudo chunking control. - #these can be changed from outside for testing purposes + # these can be changed from outside for optimisation # pseudo chunk blocksize: this is a size below which we don't bother # pseudo chunking for contiguous data and just load the lot at data # access time: units are kibibytes self.pseudo_chunking = True - self.pseudo_block_size_kib = 0 + self.pseudo_block_size_kib = 1024 # We can't use mmaps on S3 self.avoid_mmap = True ########################################################################## @@ -629,6 +630,16 @@ def __init__(self,*args,**kwargs): version, dims, self.layout_class, self.property_offset = ( self._get_data_message_properties(self.msg_offset)) + @property + def id(self): + """ + Represents a PyFive approximation of an HDF5 dataset identifier. + Objects of this class provides methods for working directly with chunked data. + """ + if self._id is None: + self._id = H5Dataset(self) + return self._id + def get_data(self, args=None): """ Return the data pointed to in the DataObject. @@ -646,7 +657,7 @@ def get_data(self, args=None): elif isinstance(self.dtype, tuple): return self._get_chunked_data(self.msg_offset)[args] else: - return self._get_selection_via_chunks(args) + return self.id._get_selection_via_chunks(args) def _get_data_message_properties(self, msg_offset): """ Return the message properties of the DataObject. """ @@ -699,95 +710,21 @@ def _get_contiguous_data(self, property_offset, args): else: raise NotImplementedError('datatype not implemented') - def _get_chunked_data(self, offset): - """ Return data which is chunked. """ - self._get_chunk_params() - chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - return chunk_btree.construct_data_from_chunks( - self.chunks, self.shape, self.dtype, self.filter_pipeline) - - def get_chunk_details(self, chunk_coords): - """ - Returns the chunk details associated with chunk coords - returned by the Zarr orthogonal indexer. The special case - is that if the data is contiguous, we still want to return - the offset and size, as the point of this entry point is - to provide third party applications an address to the data. - """ - if self.layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif self.layout_class == 1: # contiguous storage - # This option never used by pyfive itself as we use the memory map for - # access to contiguous data, but third parties may need it. 
- # Ignore coordinates, just give the location and size of entire array - data_offset, = struct.unpack_from(' Date: Mon, 22 Apr 2024 11:50:41 +0100 Subject: [PATCH 021/105] Improvements --- pyfive/dataobjects.py | 3 ++- pyfive/h5d.py | 62 +++++++++++++++++++++++-------------------- pyfive/high_level.py | 11 +++++++- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 7031d40..67edb07 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -20,7 +20,7 @@ from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap from .h5d import H5Dataset - +from .indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -637,6 +637,7 @@ def id(self): Objects of this class provides methods for working directly with chunked data. """ if self._id is None: + self._get_chunk_params() self._id = H5Dataset(self) return self._id diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 2d294af..3673131 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -20,14 +20,14 @@ def __init__(self, dataobject): """ self.parent_object = dataobject + self.index = None # Should we read this at instantiation? # I figure yes, given folks will likely only - # go this low if they want to manipulate chunks - # Otherwise we'd have to instantiate it as None and - # call the build routine on every chunk manipulation. - # Even if that's just a return, it's a lot of empty function calls - # on an iteration over chunks. - self.index = self.__build_index() + # go this low if they want to manipulate chunks. + # Otherwise we'd call the (cached) build routine on + # each chunk manipulation. That could be a lot of + # empty function calls, even if they are cheap cf I/O. + self.__build_index() def __hash__(self): """ @@ -59,7 +59,10 @@ def get_chunk_info(self, index): return self.index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): - return self.index(coordinate_index) + return self.index[coordinate_index] + + def get_num_chunks(self): + return len(self.index) def read_direct_chunk(self, chunk_position, **kwargs): """ @@ -78,15 +81,14 @@ def __build_index(self): """ Build the chunk index if it doesn't exist """ + if self.index is not None: return + chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - count = np.prod(self.shape) - itemsize = np.dtype(self.dtype).itemsize + self.parent_object.fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) self.index = {} - # we do this to avoid either using an iterator or many # temporary list creations if there are repeated chunk accesses. self._nthindex = [] @@ -95,28 +97,31 @@ def __build_index(self): # space, whereas pyfive wants the position in array space. # Here we index the pyfive chunk_index in zarr index space. 
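+        # e.g. with chunks=(20, 100), a chunk whose first element sits at array offset (40, 300) gets the key (2, 3)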
- ichunks = [1/c for c in self.chunks] + # Can't help myself optimising to remove excessive divides + ichunks = [1/c for c in self.parent_object.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): + start = node_key['chunk_offset'][:-1] key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) size = node_key['chunk_size'] filter_mask = node_key['filter_mask'] - start = node_key['chunk_offset'][:-1] self._nthindex.append(key) - self.index[key] = StoreInfo(key, filter_mask, start, size) + self.index[key] = StoreInfo(key, filter_mask, addr, size) + - def _iter_chunks(self, sel=None): + def _iter_chunks(self, args): """ Provides internal support for iter_chunks method on parent. Errors should be trapped there. """ - - if sel is None: - yield from self.index.values() - else: - raise NotImplementedError - + raise NotImplementedError + # FIXME: This isn't it! + array = ZarrArrayStub(self.shape, self.parent_object.chunks) + indexer = OrthogonalIndexer(args, array) + for chunk_coords, chunk_selection, out_selection in indexer: + yield out_selection + def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. @@ -131,18 +136,17 @@ def _get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ - array = ZarrArrayStub(self.shape, self.chunks) + array = ZarrArrayStub(self.shape, self.parent_object.chunks) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.order) + out = np.empty(out_shape, dtype=self.dtype, order=self.parent_object.order) + filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: - chunk_info = self.get_chunk_info_by_coord(chunk_coords) - filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords.chunk_offset) - if self.filter_pipeline is not None: - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.itemsize) - chunk_buffer = self._unpack_chunk(chunk_buffer, chunk_info) + filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) + if filter_pipeline is not None: + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.chunks, order=self.order)[chunk_selection] + out[out_selection] = chunk_data.reshape(self.parent_object.chunks, order=self.parent_object.order)[chunk_selection] return out diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 8bf0544..e02a944 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -304,12 +304,21 @@ def astype(self, dtype): def len(self): """ Return the size of the first axis. """ return self.shape[0] + + def iter_chunks(self, sel=None): + if sel == None: + sel = self.shape + return self._dataobjects.id._iter_chunks(sel) + + @property + def id(self): + return self._dataobjects.id @property def shape(self): """ shape attribute. """ return self._dataobjects.shape - + @property def maxshape(self): """ maxshape attribute. 
(None for unlimited dimensions) """ From 9ffb5b23daa6327b53229e865c3bb9dc2bdb611f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 13:28:19 +0100 Subject: [PATCH 022/105] With some failing tests --- pyfive/h5d.py | 1 + tests/test_h5d.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/test_h5d.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 3673131..d77f492 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -70,6 +70,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): Additional arugments supported by H5Py are not supported here. """ storeinfo = self.index[chunk_position] + print(storeinfo) return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) ###### diff --git a/tests/test_h5d.py b/tests/test_h5d.py new file mode 100644 index 0000000..0ef4936 --- /dev/null +++ b/tests/test_h5d.py @@ -0,0 +1,62 @@ +import h5py +import pyfive +from pathlib import Path +import pytest + +mypath = Path(__file__).parent + +filename = 'compressed.hdf5' +variable_name = 'dataset3' + +def chunk_down(ff, vv): + """ + Test the chunking stuff + """ + var = ff[vv] + varid = var.id + n = varid.get_num_chunks() + c = varid.get_chunk_info(4) + with pytest.raises(OSError): + # This isn't on the chunk boundary, so should fail + address = (2,0) + d = varid.read_direct_chunk(address) + address = c.chunk_offset + d = varid.read_direct_chunk(address) + return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d + + +def get_chunks(ff, vv, view=3): + var = ff[vv] + x = var[:,2] + y = var[:,:] + chunks = list(var.iter_chunks()) + for i in range(view): + print('Chunk ',i) + print(chunks[i]) + return list(var.iter_chunks()) + + +def test_h5d_chunking_details(): + + with h5py.File(mypath/filename) as f: + h5detail = chunk_down(f, variable_name) + + with pyfive.File(mypath/filename) as g: + p5detail = chunk_down(g, variable_name) + + assert h5detail == p5detail + +def test_iter_chunks(): + + with h5py.File(mypath/filename) as f: + h5chunks = get_chunks(f, variable_name) + + with pyfive.File(mypath/filename) as g: + p5chunks = get_chunks(f, variable_name) + + assert (h5chunks == p5chunks).all() + + +if __name__ == "__main__": + test_h5d_chunking_details() + From 223a931a5c3aaa9aaf19b3cb6a3f9d81d99525da Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 14:32:21 +0100 Subject: [PATCH 023/105] Fixed one test --- pyfive/h5d.py | 21 +++++++++++++-------- tests/test_h5d.py | 18 +++++++++++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index d77f492..2e9d5ef 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -19,6 +19,9 @@ def __init__(self, dataobject): Instantiated with the pyfive datasetdataobject """ self.parent_object = dataobject + self._chunks = self.parent_object.chunks + self._ichunks = [1/c for c in self._chunks] + self._order = self.parent_object.order self.index = None # Should we read this at instantiation? @@ -29,6 +32,8 @@ def __init__(self, dataobject): # empty function calls, even if they are cheap cf I/O. self.__build_index() + + def __hash__(self): """ H5py says this is hasable, we haven't implemented that. @@ -55,6 +60,7 @@ def dtype(self): def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. + Our index is in chunk space, but H5Py wants it in coordinate space. 
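+        Here index is just the integer position of the chunk in b-tree order; the returned StoreInfo carries the chunk's array-space offset.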
""" return self.index[self._nthindex[index]] @@ -69,8 +75,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): Returns a tuple containing the filter_mask and the raw data storing this chunk as bytes. Additional arugments supported by H5Py are not supported here. """ + if chunk_position not in self.index: + raise OSError("Chunk coordinates must lie on chunk boundaries") storeinfo = self.index[chunk_position] - print(storeinfo) return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) ###### @@ -97,14 +104,11 @@ def __build_index(self): # The zarr orthogonal indexer returns the position in chunk # space, whereas pyfive wants the position in array space. # Here we index the pyfive chunk_index in zarr index space. - - # Can't help myself optimising to remove excessive divides - ichunks = [1/c for c in self.parent_object.chunks] for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): start = node_key['chunk_offset'][:-1] - key = tuple([int(i*d) for i,d in zip(list(start),ichunks)]) + key = start size = node_key['chunk_size'] filter_mask = node_key['filter_mask'] self._nthindex.append(key) @@ -137,17 +141,18 @@ def _get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ - array = ZarrArrayStub(self.shape, self.parent_object.chunks) + array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self.parent_object.order) + out = np.empty(out_shape, dtype=self.dtype, order=self._order) filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: + chunk_coords = tuple([int(i*d) for i,d in zip(list(chunk_coords),self._chunks)]) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if filter_pipeline is not None: chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) - out[out_selection] = chunk_data.reshape(self.parent_object.chunks, order=self.parent_object.order)[chunk_selection] + out[out_selection] = chunk_data.reshape(self._chunks, order=self._order)[chunk_selection] return out diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 0ef4936..a7075aa 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -4,31 +4,39 @@ import pytest mypath = Path(__file__).parent - filename = 'compressed.hdf5' variable_name = 'dataset3' +breaking_address=(2,0) + +#mypath = mypath.parent/'bnl/' +#filename = 'da193o_25_day__grid_T_198807-198807.nc' +#variable_name = 'tos' +# breaking_address=(2,0,3) def chunk_down(ff, vv): """ Test the chunking stuff """ var = ff[vv] + v = var[2,2] + print(v) varid = var.id n = varid.get_num_chunks() c = varid.get_chunk_info(4) + for i in range(varid.get_num_chunks()): + print(varid.get_chunk_info(i)) with pytest.raises(OSError): # This isn't on the chunk boundary, so should fail - address = (2,0) + address = breaking_address d = varid.read_direct_chunk(address) address = c.chunk_offset d = varid.read_direct_chunk(address) - return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d + + return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d, v def get_chunks(ff, vv, view=3): var = ff[vv] - x = var[:,2] - y = var[:,:] chunks = list(var.iter_chunks()) for i in range(view): print('Chunk ',i) From 3a256ab4f6b163dee03c979c9b5918df7fd2e832 Mon Sep 17 00:00:00 2001 From: Bryan 
Lawrence Date: Mon, 22 Apr 2024 17:18:37 +0100 Subject: [PATCH 024/105] All tests for new functionality pass, but I've broken something old --- pyfive/h5d.py | 22 ++++++++++++++-------- pyfive/high_level.py | 6 ++---- tests/test_h5d.py | 9 +++++---- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 2e9d5ef..fc5f0a9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -1,5 +1,6 @@ import numpy as np from collections import namedtuple +from operator import mul from .indexing import OrthogonalIndexer, ZarrArrayStub from .btree import BTreeV1RawDataChunks @@ -20,7 +21,6 @@ def __init__(self, dataobject): """ self.parent_object = dataobject self._chunks = self.parent_object.chunks - self._ichunks = [1/c for c in self._chunks] self._order = self.parent_object.order self.index = None @@ -32,8 +32,6 @@ def __init__(self, dataobject): # empty function calls, even if they are cheap cf I/O. self.__build_index() - - def __hash__(self): """ H5py says this is hasable, we haven't implemented that. @@ -120,13 +118,21 @@ def _iter_chunks(self, args): Provides internal support for iter_chunks method on parent. Errors should be trapped there. """ - raise NotImplementedError - # FIXME: This isn't it! + def convert_selection(tuple_of_slices): + # while a slice of the form slice(a,b,None) is equivalent + # in funtion to a slice of form (a,b,1) it is not the same. + # For compatability I've gone for "the same" + def convert_slice(aslice): + if aslice.step is None: + return slice(aslice.start,aslice.stop,1) + return aslice + return tuple([convert_slice(a) for a in tuple_of_slices]) + array = ZarrArrayStub(self.shape, self.parent_object.chunks) indexer = OrthogonalIndexer(args, array) for chunk_coords, chunk_selection, out_selection in indexer: - yield out_selection - + yield convert_selection(out_selection) + def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. @@ -148,7 +154,7 @@ def _get_selection_via_chunks(self, args): filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: - chunk_coords = tuple([int(i*d) for i,d in zip(list(chunk_coords),self._chunks)]) + chunk_coords = tuple(map(mul,chunk_coords,self._chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if filter_pipeline is not None: chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) diff --git a/pyfive/high_level.py b/pyfive/high_level.py index e02a944..6db9159 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -305,10 +305,8 @@ def len(self): """ Return the size of the first axis. 
""" return self.shape[0] - def iter_chunks(self, sel=None): - if sel == None: - sel = self.shape - return self._dataobjects.id._iter_chunks(sel) + def iter_chunks(self, *args): + return self._dataobjects.id._iter_chunks(args) @property def id(self): diff --git a/tests/test_h5d.py b/tests/test_h5d.py index a7075aa..2a9d1db 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -41,7 +41,7 @@ def get_chunks(ff, vv, view=3): for i in range(view): print('Chunk ',i) print(chunks[i]) - return list(var.iter_chunks()) + return chunks def test_h5d_chunking_details(): @@ -60,11 +60,12 @@ def test_iter_chunks(): h5chunks = get_chunks(f, variable_name) with pyfive.File(mypath/filename) as g: - p5chunks = get_chunks(f, variable_name) + p5chunks = get_chunks(g, variable_name) - assert (h5chunks == p5chunks).all() + assert h5chunks == p5chunks if __name__ == "__main__": - test_h5d_chunking_details() + #test_h5d_chunking_details() + test_iter_chunks() From 32d83dd8871ed6f62014832c6cf73cc09bf2e0a7 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:23:15 +0100 Subject: [PATCH 025/105] Now passing all tests --- pyfive/dataobjects.py | 9 +++------ pyfive/h5d.py | 35 ++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 67edb07..5508da7 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -641,7 +641,7 @@ def id(self): self._id = H5Dataset(self) return self._id - def get_data(self, args=None): + def get_data(self, args): """ Return the data pointed to in the DataObject. """ @@ -651,12 +651,9 @@ def get_data(self, args=None): elif self.layout_class == 1: # contiguous storage return self._get_contiguous_data(self.property_offset,args) if self.layout_class == 2: # chunked storage - # If reading all chunks, use the (hopefully faster) "do it one go" method. # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version - if args is None: - return self._get_chunked_data(self.msg_offset) - elif isinstance(self.dtype, tuple): - return self._get_chunked_data(self.msg_offset)[args] + if isinstance(self.dtype, tuple): + return self.id._get_reference_chunks(self.msg_offset)[args] else: return self.id._get_selection_via_chunks(args) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index fc5f0a9..7f2b08c 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -22,6 +22,8 @@ def __init__(self, dataobject): self.parent_object = dataobject self._chunks = self.parent_object.chunks self._order = self.parent_object.order + self._fh = self.parent_object.fh + self.filter_pipeline = self.parent_object.filter_pipeline self.index = None # Should we read this at instantiation? @@ -53,7 +55,12 @@ def rank(self): return self.parent_object.rank @property def dtype(self): - return np.dtype(self.parent_object.dtype) + # FIXME: Not sure what H5Py is doing here need to find out, + # but I'm sure it's not this. 
+ if self.parent_object.dtype == ('REFERENCE',8): + return self.parent_object.dtype + else: + return np.dtype(self.parent_object.dtype) def get_chunk_info(self, index): """ @@ -92,7 +99,7 @@ def __build_index(self): return chunk_btree = BTreeV1RawDataChunks( - self.parent_object.fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) self.index = {} # we do this to avoid either using an iterator or many @@ -128,7 +135,7 @@ def convert_slice(aslice): return aslice return tuple([convert_slice(a) for a in tuple_of_slices]) - array = ZarrArrayStub(self.shape, self.parent_object.chunks) + array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) for chunk_coords, chunk_selection, out_selection in indexer: yield convert_selection(out_selection) @@ -138,8 +145,19 @@ def _get_raw_chunk(self, storeinfo): Obtain the bytes associated with a chunk. """ - self.parent_object.fh.seek(storeinfo.byte_offset) - return self.parent_object.fh.read(storeinfo.size) + self._fh.seek(storeinfo.byte_offset) + return self._fh.read(storeinfo.size) + + def _get_reference_chunks(self, offset): + """ + Return reference data which is chunked. At the moment + we re-read the b-tree to do this, since we didn't cache + it at index construction. #FIXME + """ + chunk_btree = BTreeV1RawDataChunks( + self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + return chunk_btree.construct_data_from_chunks( + self._chunks, self.shape, self.dtype, self.filter_pipeline) def _get_selection_via_chunks(self, args): """ @@ -151,14 +169,13 @@ def _get_selection_via_chunks(self, args): indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape out = np.empty(out_shape, dtype=self.dtype, order=self._order) - filter_pipeline = self.parent_object.filter_pipeline for chunk_coords, chunk_selection, out_selection in indexer: chunk_coords = tuple(map(mul,chunk_coords,self._chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) - if filter_pipeline is not None: - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, filter_pipeline, self.dtype.itemsize) + if self.filter_pipeline is not None: + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) out[out_selection] = chunk_data.reshape(self._chunks, order=self._order)[chunk_selection] - return out + return out \ No newline at end of file From f5f89c53874dd669edb1abdb14d4425913dbaf96 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:34:35 +0100 Subject: [PATCH 026/105] Checking coverage of get_chunk_info_by_coord(method) --- tests/test_h5d.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 2a9d1db..730f379 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -19,23 +19,21 @@ def chunk_down(ff, vv): """ var = ff[vv] v = var[2,2] - print(v) varid = var.id n = varid.get_num_chunks() c = varid.get_chunk_info(4) - for i in range(varid.get_num_chunks()): - print(varid.get_chunk_info(i)) with pytest.raises(OSError): # This isn't on the chunk boundary, so should fail address = breaking_address d = varid.read_direct_chunk(address) address = c.chunk_offset d = varid.read_direct_chunk(address) - + dd = varid.get_chunk_info_by_coord(address) + return n, c.chunk_offset, c.filter_mask, c.byte_offset, 
c.size, d, v -def get_chunks(ff, vv, view=3): +def get_chunks(ff, vv, view=0): var = ff[vv] chunks = list(var.iter_chunks()) for i in range(view): @@ -66,6 +64,6 @@ def test_iter_chunks(): if __name__ == "__main__": - #test_h5d_chunking_details() + test_h5d_chunking_details() test_iter_chunks() From 2c8f59c6e68a199d6908b872d07121c844b78b00 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 22 Apr 2024 20:37:06 +0100 Subject: [PATCH 027/105] Missing docstring --- pyfive/h5d.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7f2b08c..bde427b 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -70,9 +70,16 @@ def get_chunk_info(self, index): return self.index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): + """ + Retrieve information about a chunk specified by the array address of the chunk’s + first element in each dimension. + """ return self.index[coordinate_index] def get_num_chunks(self): + """ + Return total number of chunks in dataset + """ return len(self.index) def read_direct_chunk(self, chunk_position, **kwargs): From 013ce6252a2f82fd8e1ec07802eb3a00bf657ad9 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 26 Apr 2024 11:42:47 +0100 Subject: [PATCH 028/105] Cleaning up --- pyfive/btree.py | 14 -------------- pyfive/h5d.py | 7 +++---- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/pyfive/btree.py b/pyfive/btree.py index fedbc64..38518fc 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -199,20 +199,6 @@ def construct_data_from_chunks( non_padded_region = tuple([slice(i) for i in data_shape]) return data[non_padded_region] - - def get_one_chunk_buffer(self, addr, size, itemsize, filter_pipeline, filter_mask): - """ - Used when getting data chunk by chunk for reading partial data arrays - All the shaping and positioning is done in the calling function. - """ - self.fh.seek(addr) - chunk_buffer = self.fh.read(size) - if filter_pipeline is not None: - #chunk_buffer = self.fh.read(size) - #filter_mask = filter_mask - chunk_buffer = self._filter_chunk( - chunk_buffer, filter_mask, filter_pipeline, itemsize) - return chunk_buffer @classmethod def _filter_chunk(cls, chunk_buffer, filter_mask, filter_pipeline, itemsize): diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bde427b..7c9f838 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -12,8 +12,8 @@ class H5Dataset: Also, many H5D* functions which take a dataset instance as their first argument are presented as methods of this class. This is a subset of those supported - by H5Py's module H5D. - + by H5Py's module H5D, but includes all the low level methods for working with + chunked data. """ def __init__(self, dataobject): """ @@ -36,7 +36,7 @@ def __init__(self, dataobject): def __hash__(self): """ - H5py says this is hasable, we haven't implemented that. + H5py says this is hashable, we haven't implemented that. """ raise NotImplementedError @@ -151,7 +151,6 @@ def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. 
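        (No filters are applied here; callers push the returned bytes through the filter pipeline when one is defined.)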
""" - self._fh.seek(storeinfo.byte_offset) return self._fh.read(storeinfo.size) From c21ee637857320cab6e1ea3fd511c9c72ff23b81 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 9 Jul 2024 09:50:33 +0100 Subject: [PATCH 029/105] Ok, these were the pull request fixes that I thought I'd merged --- pyfive/__init__.py | 2 +- pyfive/dataobjects.py | 55 ++++++++++++++++++++++++------------------- pyfive/h5d.py | 10 ++++---- tests/test_h5d.py | 5 ---- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 9fa6180..c11dfb0 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -3,6 +3,6 @@ """ from .high_level import File -from .indexing import ZarrArrayStub, OrthogonalIndexer + __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 5508da7..4be83ea 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -9,18 +9,18 @@ import numpy as np -from .datatype_msg import DatatypeMessage -from .core import _padded_size, _structure_size -from .core import _unpack_struct_from, _unpack_struct_from_file -from .core import InvalidHDF5File -from .core import Reference -from .core import UNDEFINED_ADDRESS -from .btree import BTreeV1Groups, BTreeV1RawDataChunks -from .btree import BTreeV2GroupNames, BTreeV2GroupOrders -from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER -from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from .h5d import H5Dataset -from .indexing import OrthogonalIndexer, ZarrArrayStub +from pyfive.datatype_msg import DatatypeMessage +from pyfive.core import _padded_size, _structure_size +from pyfive.core import _unpack_struct_from, _unpack_struct_from_file +from pyfive.core import InvalidHDF5File +from pyfive.core import Reference +from pyfive.core import UNDEFINED_ADDRESS +from pyfive.btree import BTreeV1Groups, BTreeV1RawDataChunks +from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders +from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER +from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from pyfive.h5d import H5Dataset +from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... UNLIMITED_SIZE = UNDEFINED_ADDRESS @@ -602,15 +602,15 @@ class DatasetDataObject(DataObjects): Subclass of DataObjects associated with one Dataset, handles actual data access. """ - def __init__(self,*args,**kwargs): + def __init__(self, *args, **kwargs): """ Initialise via super class """ - super().__init__(*args,**kwargs) + super().__init__(*args, **kwargs) self._id = None # make this explicit, but controllable - self.order='C' + self.order = 'C' ########################################################################## # pseudo chunking control. @@ -636,6 +636,8 @@ def id(self): Represents a PyFive approximation of an HDF5 dataset identifier. Objects of this class provides methods for working directly with chunked data. 
""" + # When instantiated self._id is None, this property is called when the + # class instance is first used in anger to actually get the chunk indices etc if self._id is None: self._get_chunk_params() self._id = H5Dataset(self) @@ -649,7 +651,7 @@ def get_data(self, args): if self.layout_class == 0: # compact storage raise NotImplementedError("Compact storage") elif self.layout_class == 1: # contiguous storage - return self._get_contiguous_data(self.property_offset,args) + return self._get_contiguous_data(self.property_offset, args) if self.layout_class == 2: # chunked storage # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if isinstance(self.dtype, tuple): @@ -700,13 +702,13 @@ def _get_contiguous_data(self, property_offset, args): if dtype_class == 'REFERENCE': size = self.dtype[1] if size != 8: - raise NotImplementedError('Unsupported Reference type') + raise NotImplementedError('Unsupported Reference type - size {size}') ref_addresses = np.memmap( self.fh, dtype=(' Date: Wed, 10 Jul 2024 11:39:59 +0100 Subject: [PATCH 030/105] Adding Datatype and check_enum_dtype in a minimal manner - closes #8 --- pyfive/__init__.py | 6 ++++-- pyfive/h5t.py | 15 +++++++++++++++ pyfive/high_level.py | 22 ++++++++++++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 pyfive/h5t.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index c11dfb0..6cba821 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -1,8 +1,10 @@ """ pyfive : a pure python HDF5 file reader. +This is the public API exposed by pyfive, +which is a small subset of the H5PY API. """ -from .high_level import File - +from pyfive.high_level import File, Group, Dataset, Datatype +from pyfive.h5t import check_enum_dtype __version__ = '0.4.0.dev' diff --git a/pyfive/h5t.py b/pyfive/h5t.py new file mode 100644 index 0000000..9eae216 --- /dev/null +++ b/pyfive/h5t.py @@ -0,0 +1,15 @@ +def check_enum_dtype(dt): + """ + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. + Returns None if the dtype does not represent an HDF5 enumerated type. + + Note that currently pyfive does not support HDF5 enumerated types, + so this will always return None (see datatype_msg), and AFIK, should + never get called in anger. It is only included so h5netcdf wont + barf at its absence when pyfive is used as a backend. + """ + try: + return dt.metadata.get('enum', None) + except AttributeError: + return None \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 6db9159..7561ab1 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -7,9 +7,10 @@ import numpy as np -from .core import Reference -from .dataobjects import DataObjects, DatasetDataObject -from .misc_low_level import SuperBlock +from pyfive.core import Reference +from pyfive.dataobjects import DataObjects, DatasetDataObject +from pyfive.misc_low_level import SuperBlock +from pyfive.datatype_msg import DatatypeMessage class Group(Mapping): @@ -172,8 +173,10 @@ class File(Group): """ - def __init__(self, filename): + def __init__(self, filename, mode='r'): """ initalize. 
""" + if mode != 'r': + raise NotImplementedError('pyfive only provides support for reading and treats all reads as binary') self._close = False if hasattr(filename, 'read'): if not hasattr(filename, 'seek'): @@ -443,3 +446,14 @@ def __enter__(self): def __exit__(self, *args): self._dset._astype = None + + +class Datatype(DatatypeMessage): + """ + Class provided for compatbility with the H5PY API. + It's not yet clear where and how this might be used + by that name (if at all), but the existence of a + class with this name is required by h5netcdf. + """ + def __init__(self, *args, **kw): + super().__init__(self, *args, **kw) \ No newline at end of file From 99945989d215cd9fbd893d62dd2f0d06939631ab Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 12 Jul 2024 10:33:45 +0100 Subject: [PATCH 031/105] Basic support for elements of h5netcdf and what it expects to be able to import from it's backend (which it expects to be h5py). Also added some better handling for situations where we find datatypes which pyfive doesn't understand. --- pyfive/__init__.py | 3 ++- pyfive/dataobjects.py | 12 ++++++++++++ pyfive/h5py.py | 38 ++++++++++++++++++++++++++++++++++++++ pyfive/high_level.py | 32 +++++++++++++++++++------------- 4 files changed, 71 insertions(+), 14 deletions(-) create mode 100644 pyfive/h5py.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 6cba821..8e43221 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -4,7 +4,8 @@ which is a small subset of the H5PY API. """ -from pyfive.high_level import File, Group, Dataset, Datatype +from pyfive.high_level import File, Group, Dataset from pyfive.h5t import check_enum_dtype +from pyfive.h5py import Datatype, Empty __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 4be83ea..318cdce 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -596,6 +596,18 @@ def is_dataset(self): """ True when DataObjects points to a dataset, False for a group. """ return len(self.find_msg_type(DATASPACE_MSG_TYPE)) > 0 + @property + def is_datatype(self): + """ Is this a standalone datatype definition?""" + if self.msgs[0]['type'] == DATATYPE_MSG_TYPE: + #I'm thinking that for the moment, this almost certainly means + #an unimplemented user datatype. If so, let's tell the higher + #level now, as the following will raise a NotImplementedError + x = DatatypeMessage(self.msg_data, self.msgs[0]['offset_to_message']) + return True + else: + return False + class DatasetDataObject(DataObjects): """ diff --git a/pyfive/h5py.py b/pyfive/h5py.py new file mode 100644 index 0000000..62c2076 --- /dev/null +++ b/pyfive/h5py.py @@ -0,0 +1,38 @@ +### This file contains H5Py classes which are not used by +### pyfive, but which are included in the public API for +### htnetcdf which expects to see these H5PY classes. + + +from pyfive.datatype_msg import DatatypeMessage +import numpy as np + +class Datatype: + """ + Class provided for compatability with the H5PY API, + to allow applications such as h5netcdf to import it, + but not use it. + """ + def __init__(self,*args,**kw): + raise NotImplementedError + +class Empty: + + """ + Proxy object to represent empty/null dataspaces (a.k.a H5S_NULL). + This can have an associated dtype, but has no shape or data. This is not + the same as an array with shape (0,). This class provided for compatibility + with the H5Py API to support h5netcdf. It is not used by pyfive. 
+ """ + shape = None + size = None + + def __init__(self, dtype): + self.dtype = np.dtype(dtype) + + def __eq__(self, other): + if isinstance(other, Empty) and self.dtype == other.dtype: + return True + return False + + def __repr__(self): + return "Empty(dtype={0!r})".format(self.dtype) \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 7561ab1..fbb2f95 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -4,13 +4,14 @@ from collections.abc import Mapping, Sequence import os import posixpath +import warnings import numpy as np from pyfive.core import Reference from pyfive.dataobjects import DataObjects, DatasetDataObject from pyfive.misc_low_level import SuperBlock -from pyfive.datatype_msg import DatatypeMessage + class Group(Mapping): @@ -91,7 +92,19 @@ def __getitem__(self, y): if additional_obj != '.': raise KeyError('%s is a dataset, not a group' % (obj_name)) return Dataset(obj_name, DatasetDataObject(self.file._fh, link_target), self) - return Group(obj_name, dataobjs, self)[additional_obj] + + try: + # if true, this may well raise a NotImplementedError, if so, we need + # to warn the user, who may be able to use other parts of the data. + is_datatype = dataobjs.is_datatype + except NotImplementedError as e: + warnings.warn(f'Found datatype {obj_name} but pyfive cannot read this data: {e}') + is_datatype = True + + if is_datatype: + pass + else: + return Group(obj_name, dataobjs, self)[additional_obj] def __iter__(self): for k in self._links.keys(): @@ -334,7 +347,10 @@ def ndim(self): def dtype(self): """ dtype attribute. """ # In the HDF5 implementation this is a numpy dtype - return self._dataobjects.dtype + try: + return self._dataobjects.dtype + except NotImplementedError as e: + raise NotImplementedError(f'{e} (for {self.name})') @property def value(self): @@ -447,13 +463,3 @@ def __enter__(self): def __exit__(self, *args): self._dset._astype = None - -class Datatype(DatatypeMessage): - """ - Class provided for compatbility with the H5PY API. - It's not yet clear where and how this might be used - by that name (if at all), but the existence of a - class with this name is required by h5netcdf. 
- """ - def __init__(self, *args, **kw): - super().__init__(self, *args, **kw) \ No newline at end of file From c12b5b30fd1099cca7b4d677f9ee023a4da0227f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 12 Jul 2024 10:56:19 +0100 Subject: [PATCH 032/105] Test support for graceful enum failure --- tests/make_enum_file.py | 13 +++++++++++++ tests/test_enum_var.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 tests/make_enum_file.py create mode 100644 tests/test_enum_var.py diff --git a/tests/make_enum_file.py b/tests/make_enum_file.py new file mode 100644 index 0000000..cdba322 --- /dev/null +++ b/tests/make_enum_file.py @@ -0,0 +1,13 @@ +""" Create an HDF5 files with an enum datatype using the netcdf interface """ +from netCDF4 import Dataset +import numpy as np + +ncd = Dataset('enum_variable.hdf5','w') +enum_dict = dict(stratus=1, cumulus=2, nimbus=3, missing=255) +enum_type = ncd.createEnumType(np.uint8,'enum_t', enum_dict) + +dim = ncd.createDimension('axis',5) +enum_var = ncd.createVariable('enum_var',enum_type,'axis', + fill_value=enum_dict['missing']) +enum_var[:] = [enum_dict[k] for k in ['stratus','stratus','missing','nimbus','cumulus']] +ncd.close() diff --git a/tests/test_enum_var.py b/tests/test_enum_var.py new file mode 100644 index 0000000..e5851be --- /dev/null +++ b/tests/test_enum_var.py @@ -0,0 +1,23 @@ +""" Unit tests for pyfive dealing with an enum variable """ + +import os +import pytest + +import pyfive + +DIRNAME = os.path.dirname(__file__) +ENUMVAR_HDF5_FILE = os.path.join(DIRNAME, 'enum_variable.hdf5') + +def test_read_enum_variable(): + + with pyfive.File(ENUMVAR_HDF5_FILE) as hfile: + + for x in hfile: + if x == 'enum_t': + with pytest.warns(UserWarning,match='^Found '): + print(x, hfile[x]) + elif x == 'enum_var': + with pytest.raises(NotImplementedError): + print(x, hfile[x]) + else: + print(x, hfile[x]) \ No newline at end of file From c80ed92fdb11f4f8b2dec924ec0e5187ec31ec1b Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 14 Jul 2024 08:59:06 +0100 Subject: [PATCH 033/105] Committing to the dtype returned as a numpy dtype, and the extra h5t interface. --- pyfive/__init__.py | 2 +- pyfive/dataobjects.py | 2 +- pyfive/h5t.py | 66 ++++++++++++++++++++++++++++++++++++++++++- pyfive/high_level.py | 3 +- tests/test_repr.py | 2 +- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 8e43221..8e67261 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -5,7 +5,7 @@ """ from pyfive.high_level import File, Group, Dataset -from pyfive.h5t import check_enum_dtype +from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype from pyfive.h5py import Datatype, Empty __version__ = '0.4.0.dev' diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 318cdce..774d942 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -188,7 +188,7 @@ def unpack_attribute(self, offset): # read in the dataspace information shape, maxshape = determine_data_shape(self.msg_data, offset) - items = int(np.product(shape)) + items = int(np.prod(shape)) offset += _padded_size(attr_dict['dataspace_size'], padding_multiple) # read in the value(s) diff --git a/pyfive/h5t.py b/pyfive/h5t.py index 9eae216..3379e4c 100644 --- a/pyfive/h5t.py +++ b/pyfive/h5t.py @@ -1,9 +1,15 @@ +# +# These are provided to support h5netcdf, and are not used +# by the pyfive package itself. 
+# + + def check_enum_dtype(dt): """ If the dtype represents an HDF5 enumerated type, returns the dictionary mapping string names to integer values. Returns None if the dtype does not represent an HDF5 enumerated type. - + --- Note that currently pyfive does not support HDF5 enumerated types, so this will always return None (see datatype_msg), and AFIK, should never get called in anger. It is only included so h5netcdf wont @@ -12,4 +18,62 @@ def check_enum_dtype(dt): try: return dt.metadata.get('enum', None) except AttributeError: + return None + + +def check_string_dtype(dt): + """ + If the dtype represents an HDF5 string, returns a string_info object. + The returned string_info object holds the encoding and the length. + The encoding can only be 'utf-8' or 'ascii'. The length may be None + for a variable-length string, or a fixed length in bytes. + Returns None if the dtype does not represent an HDF5 string. + --- + It's not obvious what this is used for yet, so we just return None + for now. + """ + #vlen_kind = check_vlen_dtype(dt) + # return string_info('utf-8', None) + ##if vlen_kind is unicode: + #elif vlen_kind is bytes: + # return string_info('ascii', None) + #elif dt.kind == 'S': + # enc = (dt.metadata or {}).get('h5py_encoding', 'ascii') + # return string_info(enc, dt.itemsize) + #else: + # return None + return None +def check_dtype(**kwds): + """ Check a dtype for h5py special type "hint" information. Only one + keyword may be given. + + vlen = dtype + If the dtype represents an HDF5 vlen, returns the Python base class. + Currently only built-in string vlens (str) are supported. Returns + None if the dtype does not represent an HDF5 vlen. + + enum = dtype + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. Returns None if the dtype does + not represent an HDF5 enumerated type. + + ref = dtype + If the dtype represents an HDF5 reference type, returns the reference + class (either Reference or RegionReference). Returns None if the dtype + does not represent an HDF5 reference type. + """ + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, dt = kwds.popitem() + + if name not in ('vlen', 'enum', 'ref'): + raise TypeError('Unknown special type "%s"' % name) + + try: + return dt.metadata[name] + except TypeError: + return None + except KeyError: return None \ No newline at end of file diff --git a/pyfive/high_level.py b/pyfive/high_level.py index fbb2f95..486a8a3 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -346,9 +346,8 @@ def ndim(self): @property def dtype(self): """ dtype attribute. 
""" - # In the HDF5 implementation this is a numpy dtype try: - return self._dataobjects.dtype + return np.dtype(self._dataobjects.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') diff --git a/tests/test_repr.py b/tests/test_repr.py index d4895a4..cad532f 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -19,7 +19,7 @@ def test_repr(): assert str(group1) == group1_str dataset1 = hfile['dataset1'] - dataset1_str = '' + dataset1_str = '' assert str(dataset1) == dataset1_str subgroup1 = group1['subgroup1'] From 04bbef69a3fd99c9586541635309f2d2d35fe50c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 19 Jul 2024 11:36:39 +0100 Subject: [PATCH 034/105] Test for reference_list --- tests/test_reference_list.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/test_reference_list.py diff --git a/tests/test_reference_list.py b/tests/test_reference_list.py new file mode 100644 index 0000000..a1aef3c --- /dev/null +++ b/tests/test_reference_list.py @@ -0,0 +1,23 @@ +""" Unit tests for pyfive's ability to deal with reference lists """ +import os +import warnings + +import numpy as np +from numpy.testing import assert_array_equal, assert_almost_equal + +import pyfive + +DIRNAME = os.path.dirname(__file__) +NETCDF4_CLASSIC_FILE = os.path.join(DIRNAME, 'netcdf4_classic.nc') + + +def test_read_netcdf4_dims(): + + with pyfive.File(NETCDF4_CLASSIC_FILE) as hfile: + + with warnings.catch_warnings(record=True) as caught_warnings: + dimensions_x = hfile['x'].dims + if caught_warnings: + for warning in caught_warnings: + print('Caught warning ', warning) + raise NotImplementedError('We need to fix this warning!') \ No newline at end of file From 552c463336d71205bd81bb3725ed408aee53a167 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 19 Dec 2024 17:30:45 +0000 Subject: [PATCH 035/105] (New reference list test still broken) H5D has been disconnected from the parent dataobjects so once instantiated it does not need to touch the file metadata. --- bnl/opening_speed.py | 30 ++++++++++ pyfive/dataobjects.py | 9 +-- pyfive/h5d.py | 124 +++++++++++++++++++++++------------------- pyfive/high_level.py | 6 +- 4 files changed, 109 insertions(+), 60 deletions(-) create mode 100644 bnl/opening_speed.py diff --git a/bnl/opening_speed.py b/bnl/opening_speed.py new file mode 100644 index 0000000..db38d96 --- /dev/null +++ b/bnl/opening_speed.py @@ -0,0 +1,30 @@ +import h5py +import pyfive +from pathlib import Path +import time + +mypath = Path(__file__).parent +fname = 'da193o_25_day__grid_T_198807-198807.nc' +p = mypath/fname + +h1 = time.time() +f1 = h5py.File(p,'r') +h2 = time.time() +f2 = pyfive.File(p) +h3 = time.time() + + + +v = f2['tos'] +d = v._dataobjects +h4 = time.time() +d._get_chunk_addresses() +h5 = time.time() + + +print("File Opening Time Comparison") +print(f'h5py: {h2-h1:9.6f}') +print(f'pyfive: {h3-h2:9.6f}') +print(f'Additional times: {h4-h3:9.6f}, {h5-h4:9.6f}') +print(f'Total times: H5 {h4-h3:9.6f}, P5 {h5-h4:9.6f}') + diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 774d942..8c68bde 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -648,8 +648,7 @@ def id(self): Represents a PyFive approximation of an HDF5 dataset identifier. Objects of this class provides methods for working directly with chunked data. 
""" - # When instantiated self._id is None, this property is called when the - # class instance is first used in anger to actually get the chunk indices etc + # We want to make sure that this is lazy and cached if self._id is None: self._get_chunk_params() self._id = H5Dataset(self) @@ -667,10 +666,12 @@ def get_data(self, args): if self.layout_class == 2: # chunked storage # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version if isinstance(self.dtype, tuple): - return self.id._get_reference_chunks(self.msg_offset)[args] + # references need to read all the chunks for now + return self.id._get_selection_via_chunks(())[args] else: + # this is lazily reading only the chunks we need return self.id._get_selection_via_chunks(args) - + def _get_data_message_properties(self, msg_offset): """ Return the message properties of the DataObject. """ dims, layout_class, property_offset = None, None, None diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 0d57f92..532d1af 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -3,6 +3,7 @@ from operator import mul from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks +from pyfive.core import Reference StoreInfo = namedtuple('StoreInfo',"chunk_offset filter_mask byte_offset size") @@ -13,55 +14,54 @@ class H5Dataset: Also, many H5D* functions which take a dataset instance as their first argument are presented as methods of this class. This is a subset of those supported by H5Py's module H5D, but includes all the low level methods for working with - chunked data. + chunked data, lazily or not. This class has been deliberately implemented in + such as way so that once you have an instance, it is completely independent + of the parent file, and it can be used efficiently in threads without rereading + the btree etc. """ def __init__(self, dataobject): """ - Instantiated with the pyfive datasetdataobject - """ - self.parent_object = dataobject - self._chunks = self.parent_object.chunks - self._order = self.parent_object.order - self._fh = self.parent_object.fh - self.filter_pipeline = self.parent_object.filter_pipeline + Instantiated with the pyfive datasetdataobject, we copy and cache everything + we want so it can be used after the parent file is closed, without needing + to go back to storage. + """ + self._chunks = dataobject.chunks + self._order = dataobject.order + self._filename = dataobject.fh.name + self.filter_pipeline = dataobject.filter_pipeline + self.shape = dataobject.shape + self.rank = len(self.shape) + self._msg_offset = dataobject.msg_offset + self._unique = (self._filename, self.shape, self._msg_offset) + + if dataobject.dtype == ('REFERENCE', 8): + # this may not behave the same as h5py, do we care? #FIXME + self.dtype = dataobject.dtype + else: + self.dtype = np.dtype(dataobject.dtype) self.index = None - # Should we read this at instantiation? - # I figure yes, given folks will likely only - # go this low if they want to manipulate chunks. - # Otherwise we'd call the (cached) build routine on - # each chunk manipulation. That could be a lot of - # empty function calls, even if they are cheap cf I/O. - self.__build_index() + + # This reads the b-tree and caches it in a form suitable for use with + # the zarr indexer we use to lazily get chunks. + + self.__build_index(dataobject) def __hash__(self): - """ - H5py says this is hashable, we haven't implemented that. 
+ """ The hash is based on assuming the file path, the location + of the data in the file, and the data shape are a unique + combination. """ - raise NotImplementedError + return hash(self.unique) def __eq__(self, other): """ - H5Py says that equality is determined by true HDF5 identity. - """ - # We kick that upstairs. - return self.parent_object == other.parent_object - - @property - def shape(self): - return self.parent_object.shape - @property - def rank(self): - return self.parent_object.rank - @property - def dtype(self): - # FIXME: Not sure what H5Py is doing here need to find out, - # but I'm sure it's not this. - if self.parent_object.dtype == ('REFERENCE', 8): - return self.parent_object.dtype - else: - return np.dtype(self.parent_object.dtype) + Equality is based on the filename, location of the data in the file + and the shape of the data. + """ + return self._unique == other._unique + def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. @@ -97,7 +97,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): # third parties to use them. They are not H5Py methods. ###### - def __build_index(self): + def __build_index(self, dataobject): """ Build the chunk index if it doesn't exist """ @@ -106,7 +106,7 @@ def __build_index(self): return chunk_btree = BTreeV1RawDataChunks( - self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) + dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) self.index = {} # we do this to avoid either using an iterator or many @@ -144,44 +144,58 @@ def convert_slice(aslice): array = ZarrArrayStub(self.shape, self._chunks) indexer = OrthogonalIndexer(args, array) - for chunk_coords, chunk_selection, out_selection in indexer: + for _, _, out_selection in indexer: yield convert_selection(out_selection) def _get_raw_chunk(self, storeinfo): """ Obtain the bytes associated with a chunk. """ - self._fh.seek(storeinfo.byte_offset) - return self._fh.read(storeinfo.size) - - def _get_reference_chunks(self, offset): - """ - Return reference data which is chunked. At the moment - we re-read the b-tree to do this, since we didn't cache - it at index construction. #FIXME - """ - chunk_btree = BTreeV1RawDataChunks( - self._fh, self.parent_object._chunk_address, self.parent_object._chunk_dims) - return chunk_btree.construct_data_from_chunks( - self._chunks, self.shape, self.dtype, self.filter_pipeline) + with open(self._filename,'rb') as open_file: + open_file.seek(storeinfo.byte_offset) + return open_file.read(storeinfo.size) def _get_selection_via_chunks(self, args): """ Use the zarr orthogonal indexer to extract data for a specfic selection within the dataset array and in doing so, only load the relevant chunks. """ + # need a local dtype as we may override it for a reference read. + dtype = self.dtype + + if isinstance(self.dtype, tuple): + # this is a reference and we're returning that + true_dtype = tuple(dtype) + dtype_class = dtype[0] + if dtype_class == 'REFERENCE': + size = dtype[1] + if size != 8: + raise NotImplementedError('Unsupported Reference type') + dtype = ' Date: Fri, 20 Dec 2024 14:09:49 +0000 Subject: [PATCH 036/105] Interim commit so we have something to point to in a discussion around the problem with references and caching everything we need in a variable. 
--- .gitignore | 1 + pyfive/dataobjects.py | 170 ++++++++++++---------------------- pyfive/h5d.py | 206 +++++++++++++++++++++++++++++++++--------- pyfive/high_level.py | 42 ++++----- 4 files changed, 241 insertions(+), 178 deletions(-) diff --git a/.gitignore b/.gitignore index 737af19..bcc99b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .coverage +.pyc build diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 8c68bde..8a8c371 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,7 @@ from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap -from pyfive.h5d import H5Dataset +from pyfive.h5d import DatasetID from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub # these constants happen to have the same value... @@ -247,6 +247,7 @@ def _vlen_size_and_data(self, buf, offset): gheap = self._global_heaps[gheap_address] vlen_data = gheap.objects[gheap_id['object_index']] return vlen_size, vlen_data + @property def shape(self): @@ -575,6 +576,28 @@ def _iter_links_btree_v2(self, name_btree_address, order_btree_address, heap_add for creationorder, value in sorted(adict.items()): yield value + + def _get_data_message_properties(self, msg_offset): + """ Return the message properties of the DataObject. """ + dims, layout_class, property_offset = None, None, None + version, arg1, arg2 = struct.unpack_from( + '= 1) and (version <= 4) + return version, dims, layout_class, property_offset + @staticmethod def _decode_link_info_msg(data, offset): version, flags = struct.unpack_from('= 1) and (version <= 4) - return version, dims, layout_class, property_offset + def _get_contiguous_data(self, property_offset, args): data_offset, = struct.unpack_from('' % info def __getitem__(self, args): - data = self._dataobjects.get_data(args) + data = self.id.get_data(args) if self._astype is None: return data return data.astype(self._astype) @@ -325,22 +324,16 @@ def len(self): def iter_chunks(self, *args): return self._dataobjects.id._iter_chunks(args) - @property - def id(self): - # we want to make sure that this is lazy and cached - if self._id is None: - self._id = self._dataobjects.id - return self._id @property def shape(self): """ shape attribute. """ - return self._dataobjects.shape + return self.id.shape @property def maxshape(self): """ maxshape attribute. (None for unlimited dimensions) """ - return self._dataobjects.maxshape + return self.id._meta.maxshape @property def ndim(self): @@ -351,7 +344,7 @@ def ndim(self): def dtype(self): """ dtype attribute. """ try: - return np.dtype(self._dataobjects.dtype) + return np.dtype(self.id.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') @@ -370,17 +363,17 @@ def size(self): @property def chunks(self): """ chunks attribute. """ - return self._dataobjects.chunks + return self.id.chunks @property def compression(self): """ compression attribute. """ - return self._dataobjects.compression + return self.id._meta.compression @property def compression_opts(self): """ compression_opts attribute. """ - return self._dataobjects.compression_opts + return self.id._meta.compression_opts @property def scaleoffset(self): @@ -390,17 +383,17 @@ def scaleoffset(self): @property def shuffle(self): """ shuffle attribute. 
""" - return self._dataobjects.shuffle + return self.id._meta.shuffle @property def fletcher32(self): """ fletcher32 attribute. """ - return self._dataobjects.fletcher32 + return self.id._meta.fletcher32 @property def fillvalue(self): """ fillvalue attribute. """ - return self._dataobjects.fillvalue + return self.id._meta.fillvalue @property def dims(self): @@ -410,9 +403,8 @@ def dims(self): @property def attrs(self): """ attrs attribute. """ - if self._attrs is None: - self._attrs = self._dataobjects.get_attributes() - return self._attrs + return self.id._meta.attributes + class DimensionManager(Sequence): From 2d881014963a578ec5134828d0964163b7998e83 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 21 Dec 2024 09:01:26 +0000 Subject: [PATCH 037/105] Transition to H5D cached backend is complete, though we still have the test_reference_list fail --- pyfive/dataobjects.py | 4 +++- pyfive/h5d.py | 18 +++++++++++------- pyfive/high_level.py | 9 ++++++++- tests/test_filter_pipeline_v2.py | 8 ++++++++ 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 8a8c371..fc43e99 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -326,7 +326,9 @@ def compression_opts(self): if GZIP_DEFLATE_FILTER in self._filter_ids: gzip_entry = [d for d in self.filter_pipeline if d['filter_id'] == GZIP_DEFLATE_FILTER][0] - return gzip_entry['client_data'][0] + #raise ValueError(f'gzip_entry {gzip_entry.keys()}') + key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] + return gzip_entry[key][0] return None @property diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ada759c..f0a4be8 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -19,7 +19,7 @@ class DatasetID: chunked data, lazily or not. This class has been deliberately implemented in such as way so as to cache all the relevant metadata, so that once you have an instance, it is completely independent of the parent file, and it can be used - efficiently in distributed threads without rereading the btree etc. + efficiently in distributed threads without thread contention to the b-tree etc. """ def __init__(self, dataobject): """ @@ -34,7 +34,7 @@ def __init__(self, dataobject): self.shape = dataobject.shape self.rank = len(self.shape) self.chunks = dataobject.chunks - + self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) @@ -45,7 +45,7 @@ def __init__(self, dataobject): # not a posix file on a posix filesystem self.avoid_mmap = True - if dataobject.dtype == ('REFERENCE', 8): + if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? 
#FIXME self.dtype = dataobject.dtype else: @@ -221,11 +221,10 @@ def _get_contiguous_data(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') with open(self._filename,'rb') as open_file: - view = np.memmap( - self.fh, dtype=('' % info @@ -322,7 +329,7 @@ def len(self): return self.shape[0] def iter_chunks(self, *args): - return self._dataobjects.id._iter_chunks(args) + return self.id.iter_chunks(args) @property diff --git a/tests/test_filter_pipeline_v2.py b/tests/test_filter_pipeline_v2.py index 9eeb9a3..b2e3af6 100644 --- a/tests/test_filter_pipeline_v2.py +++ b/tests/test_filter_pipeline_v2.py @@ -17,3 +17,11 @@ def test_filter_pipeline_descr_v2(): d = hfile['data'] assert d.shape == (10,10,10) assert_almost_equal(d[0,0,0], 1.0) + +def test_filter_pipeline_compression_opts_v2(): + + with pyfive.File(FILTER_PIPELINE_V2_FILE) as hfile: + assert 'data' in hfile + d = hfile['data'] + # the point of this test is to ensure we can actually retrieve the compression opts + x = d.compression_opts \ No newline at end of file From 26780222c8565192f53b58cd97b3cab3f51047d3 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 21 Dec 2024 09:10:26 +0000 Subject: [PATCH 038/105] Removed obsolete DatasetDataObjects --- pyfive/dataobjects.py | 120 ------------------------------------------ pyfive/high_level.py | 3 +- 2 files changed, 1 insertion(+), 122 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index fc43e99..3c56490 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -641,127 +641,7 @@ def is_datatype(self): else: return False -class DatasetDataObject(DataObjects): - """ - Subclass of DataObjects associated with one Dataset, - handles actual data access. - """ - def __init__(self, *args, **kwargs): - """ - Initialise via super class - """ - super().__init__(*args, **kwargs) - self._id = None - - # make this explicit, but controllable - self.order = 'C' - - try: - self.fh.fileno() - self.avoid_mmap = False - except (AttributeError, OSError): - # We can't use mmaps on S3 - self.avoid_mmap = True - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - self.msg_offset = msg['offset_to_message'] - version, dims, self.layout_class, self.property_offset = ( - self._get_data_message_properties(self.msg_offset)) - - @property - def id(self): - """ - Represents a PyFive approximation of an HDF5 dataset identifier. - Objects of this class provides methods for working directly with chunked data. - """ - # We want to make sure that this is lazy and cached - if self._id is None: - self._get_chunk_params() - self._id = DatasetID(self) - return self._id - - def get_data(self, args): - """ - Return the data pointed to in the DataObject. 
- """ - if self.layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif self.layout_class == 1: # contiguous storage - return self._get_contiguous_data(self.property_offset, args) - if self.layout_class == 2: # chunked storage - # If the dtype is a tuple, we don't really know how to deal with it chunk by chunk in this version - if isinstance(self.dtype, tuple): - # references need to read all the chunks for now - return self.id._get_selection_via_chunks(())[args] - else: - # this is lazily reading only the chunks we need - return self.id._get_selection_via_chunks(args) - - - - def _get_contiguous_data(self, property_offset, args): - data_offset, = struct.unpack_from(' Date: Sun, 22 Dec 2024 07:45:34 +0000 Subject: [PATCH 039/105] Expose package version in code, and separate testing requirements out in package metadata. --- pyfive/__init__.py | 3 ++- setup.cfg | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 8e67261..0fb5696 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -7,5 +7,6 @@ from pyfive.high_level import File, Group, Dataset from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype from pyfive.h5py import Datatype, Empty +from importlib.metadata import version -__version__ = '0.4.0.dev' +__version__ = version diff --git a/setup.cfg b/setup.cfg index 3c6e79c..d942621 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,13 @@ [bdist_wheel] universal=1 + +[metadata] +name = pyfive +version = '0.4.0.dev_h5nc' + +[options.extras_require] +testing = + pytest + h5py + netCDF4 + From 503cb45648094dd7d412f0019cbe157d6fff05ff Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sun, 22 Dec 2024 07:46:00 +0000 Subject: [PATCH 040/105] Attempt to get b-tree logging in h5d --- pyfive/h5d.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index f0a4be8..3ea20e4 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -6,6 +6,8 @@ from pyfive.core import Reference, UNDEFINED_ADDRESS from io import UnsupportedOperation import struct +import logging +from importlib.metadata import version StoreInfo = namedtuple('StoreInfo',"chunk_offset filter_mask byte_offset size") @@ -167,12 +169,19 @@ def index(self): def _build_index(self, dataobject): """ - Build the chunk index if it doesn't exist + Build the chunk index if it doesn't exist. This is only + called for chunk data, and only when the variable is accessed. + That is, it is not called when we an open a file, or when + we list the variables in a file, but only when we do + v = open_file['var_name'] where 'var_name' is chunked. 
+ """ if self._index is not None: return + logging.info(f'Building chunk index in pyfive {version}') + chunk_btree = BTreeV1RawDataChunks( dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) From ac96f46194686b564dd876d5554e2c36361b1ce5 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Sat, 28 Dec 2024 18:04:13 +0000 Subject: [PATCH 041/105] Cleared a few bugs and misunderstandings which arose from workign with h5netcdf (closed #15) --- pyfive/__init__.py | 2 +- pyfive/h5d.py | 9 +++++- pyfive/high_level.py | 2 ++ setup.cfg | 28 ++++++++++++++++- setup.py | 40 ++---------------------- tests/h5netcdf_test.hdf5 | Bin 0 -> 19528 bytes tests/netcdf4_classic.nc | Bin 6796 -> 8330 bytes tests/test_h5netcdf.py | 58 +++++++++++++++++++++++++++++++++++ tests/test_reference_list.py | 33 +++++++++++++++----- 9 files changed, 124 insertions(+), 48 deletions(-) create mode 100644 tests/h5netcdf_test.hdf5 create mode 100644 tests/test_h5netcdf.py diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 0fb5696..8b1b869 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -9,4 +9,4 @@ from pyfive.h5py import Datatype, Empty from importlib.metadata import version -__version__ = version +__version__ = version('pyfive') diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 3ea20e4..62044af 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -180,7 +180,12 @@ def _build_index(self, dataobject): if self._index is not None: return - logging.info(f'Building chunk index in pyfive {version}') + # look out for an empty dataset, which will have no btree + if np.prod(self.shape) == 0: + self._index = {} + return + + logging.info(f'Building chunk index in pyfive {version("pyfive")}') chunk_btree = BTreeV1RawDataChunks( dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) @@ -287,6 +292,8 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') else: true_dtype = None + if np.prod(self.shape) == 0: + return np.zeros(self.shape) array = ZarrArrayStub(self.shape, self.chunks) indexer = OrthogonalIndexer(args, array) diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 9b174e3..23b30f9 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -353,6 +353,8 @@ def dtype(self): return np.dtype(self.id.dtype) except NotImplementedError as e: raise NotImplementedError(f'{e} (for {self.name})') + except TypeError as e: + return self.id.dtype @property def value(self): diff --git a/setup.cfg b/setup.cfg index d942621..7d2df77 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,11 +3,37 @@ universal=1 [metadata] name = pyfive -version = '0.4.0.dev_h5nc' +version = 0.4.0+h5nc +description = A pure python HDF5 reader +long_descriptino = file: README.rst +long_description_content_type = text/x-rst +url = https://github.com/ncas-cms/pyfive +author = Jonathan J. 
Helmus +author_email = jjhelmus@gmail.com (with additions from Bryan Lawrence) +maintainer = Valeriu Predoi +maintainer_email = V.Predoi@ncas.ac.uk +license = BSD +classifiers = + Development Status :: 4 - Beta + Intended Audience :: Developer + Intended Audience :: Science/Research + License :: OSI Approved :: BSD License + Operating System :: OS Independent + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Topic :: Scientific/Engineering + +[options] +packages = pyfive +python_requires >= 3.10 +install_requires = + numpy [options.extras_require] testing = pytest h5py netCDF4 + h5netcdf diff --git a/setup.py b/setup.py index 30701a8..deaace8 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,3 @@ """ Setup script for pyfive. """ -from setuptools import setup, find_packages - -# get the long descriptions from the README.rst file -with open('README.rst') as f: - long_description = f.read() - -# get the version from the __init__.py file -with open('pyfive/__init__.py') as f: - for line in f: - if line.startswith('__version__'): - version = line.strip().split()[-1][1:-1] - break - -setup( - name='pyfive', - version=version, - description='A pure python HDF5 file reader', - long_description=long_description, - url='https://github.com/jjhelmus/pyfive', - author='Jonathan J. Helmus', - author_email='jjhelmus@gmail.com', - license='BSD', - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Topic :: Scientific/Engineering', - ], - packages=['pyfive'], - install_requires=['numpy'], -) +from setuptools import setup +setup() diff --git a/tests/h5netcdf_test.hdf5 b/tests/h5netcdf_test.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..e6cdbfb98ea64ebcbf0f7574655b8a0d4cdfced4 GIT binary patch literal 19528 zcmeHPeQZ?65ubM-=8O;H0I|(S=+UGqMZ~hP9YKPL@!5w>jqUhcO4_RDv;E9nbG~!# zo&i&peyF74kBFp7+K5O|l}2iyfs)p!5TztlqozS^N))83ZB!Cesc9mmDE*@lRk$-d zJJ09C#`eWHpuUxT`|)Py?f!OVKW2B|f#$~0g89ql138}$JW!&7@-wPkb1SDvT%LDJ zU=qK*xv|r=_JtoE?9cvx+x2Nh?_Q*{{kE0^f1bzwj=R{O?*fBDC8|_AU7Xs;9{4LM zV8A@^gAaBlQa#ZG()zF_ilScqDh3!@qhbL5X3Aqg5iCKvlod16k*H;*`SDW2P!P}y zIVbZHE^Z>@B=ww2wl>JQ``Cl?&v<{Ms=fdWqnr{A!T+wmrG^j_xw~N@Q~(m>k5_4?E`6;T;E?qqy`i#j3RBO)!Wx!(-VEURE<;se+XT|sQ|0%gJA3Ca2Qh;`5WD@ra@5M^uK`0c!nieKo$&iA-8*##tdR0l=HgCgXa7pXY*F?}2 zi)RA(7Y!s$E7%wc^u-6vWG0?U26~39Kr&^GJ=zsb$D=(7vmzjlU@Q()u9&!0z4(-z z_-;SFHd_+8{c3Jwy0_Q^kD?=Sl6Lbcm2eyXUd<%hgQ?NWucZkcF58ji{Kie}>C|B@ zut}YYKr8pBZSLPoQa(PlrXPEOiQA1{BJDN|{`DlKq_L#ukx)F5*d9#`nf%^tlqp5> z38D`lId_Izt<*3SoCd2DN)kJz(-_Q+qgo-e#rRJ=y@lkFK2mIl$4gNaWc*fEGq-m3 zu3L){HvGmdy1>#Kkj4_8Gt^jU=sJB!7dlT*wPpvwVW1NGp&X>A38qr%zIZZfnHk0V zsvbx8i(0Hiyp&JcL^9$=u2<`Jwv6nBUw`0=OstzyC6R|Idt7=QI>xwO_1*yA(`-Z! 
z-L}RGU-9$PLz2PS>Tx$Lhu0grni~-`1gQu)_pLswN7NE~fM0&-R~KMQu)URQ%k@)X z42nt&>t>VYp{|PDmo=f#<)+IpC+St1P&!CawE@#I(?B>{;tT&m zO(*F>&l4Pb^J`$NtQ*V`^)uxP?x?S!gE15eLfr>-s9I+5iE+<^T zceTA;EbE-QNOOFG8T0S+yMOji)vS4m$>NT}r&aE5;%2}?Gy#@pZQQiBgu4Ki;xn>J zQPrp#vSB5bKk=2}j|k9_buK$j$;A~f4$|ASnKf{)MzbDjilAl`4PpKc$~9YtVk(DH zAryHse$QOw2k&{eEffwn!9HyxHE_S-cT}%4drrMc{Kq@0owtN4PMZN#XX*I-C)rrN z5G+KvvnkZn+0+(niZH^Z2q7Wd7qy}p(;Clof|#^~y-%fSH24$b;3Rqn>qI~$NK2}g z!W8kJ(jA<{OOK8Wvg*%%3(-5sFrHrvU3uj1zOK%uCXyP#TKrp79Vh>}uzLKn+}>g; zG{@(5Vh#ch0uBNW0uBNW0uBNW0yjDWd%oJ+U0#0CC_4VC3kF-uj!J=L+A9ki*`k+; zWfY(Xpcer5?0cf=p6Fi8SufPYg*Z^0a;n^VE-9P-m^^J~xZI=DCUyuN1RMk$1RMk$ z1RMmejldMP$a=Qe6{yPqL&>;B$5f}f1fWhU>cO7OzD8-|CEHf5P7sRCO(qq#KcnOR zPpsp*$pK$6EzkDhQ>w#k9X>sG>dw>3+Kkq|{I?*8gB*pI37MS9oQrLxYS{Vhg%`6+ zvHWw~7IJ09k8wVfPS+!H52eeYahYW9IA8eA0b4|r=*S~6luva7MBunn7eox1fcrH! zmpgLmj`+te@g!hgb@5{SH5vi2N`q)ye@8xcfj%JycP+bkBGlR3p*CY&_UUdcjpz;T zW+*4y8*0r$DcNfojMBcrbQ}tvos)$)fSTUCRq~#Fud+lPKQACUS`6Xwb6tqB=@sK)5`9vYa7k(b-fISevC_nM(OEh}>0>p25VC3w;b1 zg8MMF9KQ0P=V^EngUjIAzZ^LYk0HTQc;mj-m*F!QTm>V+$6to`kRSjjK6~L6cpZZ) z;0LjT@4$WxmV(h1z5u(Bb|w5||8pM!iQy7B{>SgijUbFWE`J16`^#kl(D2IOt6S0T zI9UK_@KcUk$j;32BNq^Kv)lazi&UZHWvP){2D;tla)Fiz8Nak#$yEX^I`)BomQdS< zz`|ndCX1ZC>BWNkg@fxtQj-5~-yRiK6Arm21_T?C0O=ul>wWtQe*SY!rbiBaV!+<> zmrRNYYBGt$G$v-<#C0||-$~f;^;`h-l(dPcZ!~{+D0}uIg;ua2@y)eUUD81H9CP6h zo;^MmFib=mW!TSfdwf65D<RG_~B_16Q_^uK2g}s=pYAeO&do1v}EIK{IW|)je5S^;aM@YaqZbyxWT37pPme zHdc!t9l$Ry)NR=pp=HbP$dblu4h&kuf6%N9j|gAkP%?p^F|y1) z`DlZIBE-&(XEvmohDU5T^D_A5e*mr~j}~=*3J_gp09A!c0c6iqc+&N941>Ob!i1tmF4eHnAW1 P)C0_$+onu7{onrqKFWoL literal 0 HcmV?d00001 diff --git a/tests/netcdf4_classic.nc b/tests/netcdf4_classic.nc index 89d6cdda5ab01b889e280f064c3cbbc6c0b86bac..260b2ee37d90547e13d81c7caba72b16bab37171 100644 GIT binary patch literal 8330 zcmeI1O-vI}5P+xMwlsAS5DaP*R}RDgX`obyF$74v{Ag_rMU7mVLYJ^cT5K0Wh>3>y z6Y!=d<3SUnM-C<)QEwXcU}A_zebRz*9hICd0xjRDnusJ@Q9yljtJfbTIQ2HX?l_a9~hG%svqBj5D~>PkEJ zQ`bnWE185!tRiMaEZTS0j3@gN@l-O>6*C3Qivs2-_8_5Y+&Il%1c=k5;+8X<83{-2 zF+0OpNV~;2z@ER!(`Vfe5|Q3B(rRr>tNCu<3kT>}GgIB=!6uqRALNJP<@<_voi*9E zIr*|ye@bAL{Xf8NFnqwy|8F;p2pm$L}4gHn*tVABk7ZsWIWT!(RS2E^47d7S`#uyM1-UoDcdbrWzA{eAQHNd2qK(JczUT zRnZRsvdfod*^$Ngk(HtEVI-_NUaT)U00(g}`6SIoQMuWE*b+Qs2`(&8w0@+SC1mOJ z>^{JDG8lcxz#_^VR{$4eVdwSK;`wi@sJ@6RNII}?$|z#Nn2JZv<#Xd!-m&p8DJX04 zL*yo{Jh^Ins1dyv1FbzAJQ)l%j%G$qv^NKv!@(9Z{l()qHyK!*pMukeSjFwu0u@F_ z@pcREf$$1~*o^6X&)4ay`Qx4X5n@%mDCmj|Gn>dB`t;fhrKfK1WK5L*-9KoL*`b|V4$%}>AYQyeCQ45Ao84x=8IPDLmi24JuLhJH$fTWg!E#p9f5mhy_^F-eR}7ZHlGy^O T*XA0@R+(d=GRfS;{g=N0G}BJp delta 518 zcmeBj>@l4nqtYYA00ssSieZz{m+k%@Ed_Gbz>cXo3Wo8F+w@ zg@GYHFWET0xTGjExr9N0g8?K21PTmHK#~!N1q49+4rUO^z+exdprTA5MLS?#Xbqyd#atMKxGcy961P0*n0*dhD0Npp4Q^9U>BD1_KOc%`P{mnnu zfV~I|QO3zRtS*d9lQ*);vFx#MT)jDvV>9a{4h5n2Fx61{20uuOfAR(vc^;6nK`sG> FBmhZnS@!?{ diff --git a/tests/test_h5netcdf.py b/tests/test_h5netcdf.py new file mode 100644 index 0000000..140f11d --- /dev/null +++ b/tests/test_h5netcdf.py @@ -0,0 +1,58 @@ +# tests the variables found in the file h5netcdf_test.hdf5, +# which is produced by the write_h5netcdf test routine in the h5netcdf package +# +import pyfive +import h5py +import warnings + + +def test_file_contents(): + p5file = pyfive.File('h5netcdf_test.hdf5') + h5file = h5py.File('h5netcdf_test.hdf5') + + expected_variables = [ + "foo", + "z", + "intscalar", + "scalar", + "mismatched_dim", + "foo_unlimited", + "var_len_str", + "enum_var", + ] + + cannot_handle = ['var_len_str', 'enum_var'] + + p5contents = set([a for a in p5file]) + h5contents = set([a for a in h5file]) + + assert 
p5contents == h5contents + + for x in list(set(expected_variables) - set(cannot_handle)): + try: + # check we can get the variable + p5x, h5x = p5file[x], h5file[x] + if p5x is None: + warnings.warn(f'Had to skip {x}') + + if isinstance(h5x,h5py.Dataset): + # check the dtype + assert p5x.dtype == h5x.dtype + # check the shape + assert p5x.shape == h5x.shape + # now look into the details + if h5x.shape != (): + # do the values match + sh5x = str(h5x[:]) + sp5x = str(p5x[:]) + assert sh5x == sp5x + # what about the dimensions? + dh5x = h5x.dims + dp5x = p5x.dims + assert len(dh5x) == len(dp5x) + print(p5x) + except: + print('Attempting to compare ',x) + print(h5file[x]) + print(p5file[x]) + raise \ No newline at end of file diff --git a/tests/test_reference_list.py b/tests/test_reference_list.py index a1aef3c..9080184 100644 --- a/tests/test_reference_list.py +++ b/tests/test_reference_list.py @@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal, assert_almost_equal import pyfive +import h5py DIRNAME = os.path.dirname(__file__) NETCDF4_CLASSIC_FILE = os.path.join(DIRNAME, 'netcdf4_classic.nc') @@ -13,11 +14,29 @@ def test_read_netcdf4_dims(): - with pyfive.File(NETCDF4_CLASSIC_FILE) as hfile: + # We want to know about this warning and deal with it + #warnings.simplefilter('error', UserWarning) - with warnings.catch_warnings(record=True) as caught_warnings: - dimensions_x = hfile['x'].dims - if caught_warnings: - for warning in caught_warnings: - print('Caught warning ', warning) - raise NotImplementedError('We need to fix this warning!') \ No newline at end of file + # not using a context manager so we can compare and contrast in debugging + hfile1 = h5py.File(NETCDF4_CLASSIC_FILE) + dimensions_x1 = hfile1['x'].dims + dimensions_v1 = hfile1['var1'].dims + + hfile2 = pyfive.File(NETCDF4_CLASSIC_FILE) + dimensions_x2 = hfile2['x'].dims + dimensions_v2 = hfile2['var1'].dims + + # as we created no explicit data for this dimension, this is the case where + # getitme goes to storage and finds an UNDEFINED_ADDRESS and returns zeros. + y = hfile2['x'][:] + + assert len(dimensions_v1) == len(dimensions_v2) + assert len(dimensions_x1) == len(dimensions_x2) + + # The dimension scale spec is here: https://support.hdfgroup.org/documentation/hdf5-docs/hdf5_topics/H5DS_Spec.pdf + # The issue is that we don't support reference_lists. + # But we don't know if this matters or not, given this is failing on the dimensions of a dimension + # ChatGPT says: + # + # - NetCDF4 Classic files often abstract away dimensions, so their behavior in HDF5 tools + # like h5py may not always align with expectations for standard HDF5 datasets. 
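
The comment block above describes the dimension-scale / reference-list gap without showing what is actually stored in the file, so here is a minimal sketch of how one might inspect the attributes involved. It is illustrative only and not part of any patch in this series: it assumes h5py is installed, that it is run from the repository root so the relative path to tests/netcdf4_classic.nc resolves, and the helper name show_dimension_scale_attrs is invented for this example.

import h5py

def show_dimension_scale_attrs(path, varname):
    # DIMENSION_LIST lives on the data variable; REFERENCE_LIST lives on
    # each dimension-scale dataset (see the HDF5 dimension scale spec
    # linked in the comment above).
    with h5py.File(path, 'r') as f:
        var = f[varname]
        print('DIMENSION_LIST:', var.attrs.get('DIMENSION_LIST'))
        for dim in var.dims:
            for _, scale in dim.items():
                print(scale.name, '->', scale.attrs.get('REFERENCE_LIST'))

if __name__ == '__main__':
    # 'var1' and netcdf4_classic.nc are the same objects exercised by
    # test_read_netcdf4_dims above.
    show_dimension_scale_attrs('tests/netcdf4_classic.nc', 'var1')
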
From b586db0b8710323f1fdefb535c752a758b0d887c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 30 Dec 2024 17:04:07 +0000 Subject: [PATCH 042/105] Continue to use open file in h5d, closes #18 --- MANIFEST.in | 1 + pyfive/h5d.py | 52 +++++++++++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index e02cfac..9ae33e4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ # Include the license file include LICENSE.txt +include README.rst diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 62044af..39932c9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -26,12 +26,19 @@ class DatasetID: def __init__(self, dataobject): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything - we want so it can be used after the parent file is closed, without needing - to go back to storage. + we want so that the only file operations are now data accesses. """ self._order = dataobject.order - self._filename = dataobject.fh.name + self._fh = dataobject.fh + try: + self._filename = self._fh.name + dataobject.fh.fileno() + self.avoid_mmap = False + except (AttributeError, OSError): + # maybe this is an S3File instance? + self._filename = self._fh.path + self.avoid_mmap = True self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -40,13 +47,6 @@ def __init__(self, dataobject): self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) - try: - dataobject.fh.fileno() - self.avoid_mmap = False - except (AttributeError,OSError): - # not a posix file on a posix filesystem - self.avoid_mmap = True - if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? #FIXME self.dtype = dataobject.dtype @@ -219,11 +219,10 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - with open(self._filename,'rb') as open_file: - # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(open_file, dtype=self.dtype, mode='c', - offset=self.data_offset, shape=self.shape, order=self._order) + # return a memory-map to the stored array + # I think this would mean that we only move the sub-array corresponding to result! 
+ view = np.memmap(self._fh, dtype=self.dtype, mode='c', + offset=self.data_offset, shape=self.shape, order=self._order) result = view[args] return result except UnsupportedOperation: @@ -234,11 +233,11 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - with open(self._filename,'rb') as open_file: - ref_addresses = np.memmap( - open_file, dtype=(' Date: Wed, 1 Jan 2025 16:04:46 +0000 Subject: [PATCH 043/105] Test for true bytes-io testing (needed for h5netcdf test compatability) --- tests/test_file_like.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_file_like.py b/tests/test_file_like.py index 0933fc0..6d46970 100644 --- a/tests/test_file_like.py +++ b/tests/test_file_like.py @@ -2,6 +2,7 @@ import io import os +import h5py import numpy as np from numpy.testing import assert_array_equal, assert_almost_equal @@ -57,3 +58,25 @@ def test_read_latest_fileobj(): ref_attr6 = u'Test' + b'\xc2\xa7'.decode('utf-8') assert dset3.attrs['attr6'] == ref_attr6 assert isinstance(dset3.attrs['attr6'], string_type) + + +def write_compressed_tobytes(file_like): + """ Make an HDF file for testing """ + + f = h5py.File(file_like, 'w', libver='earliest') + + # gzip compressed dataset + f.create_dataset('dataset1', shape=(21, 16), chunks=(2, 2), dtype=' Date: Thu, 2 Jan 2025 08:38:48 +0000 Subject: [PATCH 044/105] Deals with filename issues (closes #19) (and deals with another iobytes isseue around scalars on contigous storage). --- bnl/opening_speed.py | 118 ++++++++++++++++++++++++++++++++++++------- pyfive/h5d.py | 11 ++-- pyfive/high_level.py | 2 +- 3 files changed, 108 insertions(+), 23 deletions(-) diff --git a/bnl/opening_speed.py b/bnl/opening_speed.py index db38d96..9928bc8 100644 --- a/bnl/opening_speed.py +++ b/bnl/opening_speed.py @@ -2,29 +2,111 @@ import pyfive from pathlib import Path import time +import s3fs -mypath = Path(__file__).parent -fname = 'da193o_25_day__grid_T_198807-198807.nc' -p = mypath/fname +S3_URL = 'https://uor-aces-o.s3-ext.jc.rl.ac.uk/' +S3_BUCKET = 'bnl' -h1 = time.time() -f1 = h5py.File(p,'r') -h2 = time.time() -f2 = pyfive.File(p) -h3 = time.time() +def test_speed(s3=False): + mypath = Path(__file__).parent + fname1 = 'da193o_25_day__grid_T_198807-198807.nc' + vname1 = 'tos' + p1 = mypath/fname1 + fname2 = 'ch330a.pc19790301-def-short.nc' + vname2 = 'UM_m01s16i202_vn1106' + p2 = Path.home()/'Repositories/h5netcdf/h5netcdf/tests/'/fname2 + + do_run(p1, fname1, vname1, s3) + + do_run(p2, fname2, vname2, s3) + + +def do_s3(package, fname, vname): + + fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) + uri = S3_BUCKET + '/' + fname + with fs.open(uri,'rb') as p: + t_opening, t_var, t_calc, t_tot = do_inner(package, p, vname) + + return t_opening, t_var, t_calc, t_tot + +def do_inner(package, p, vname, withdask=False): + h0 = time.time() + pf1 = package.File(p) + h3 = time.time() + t_opening = 1000* (h3-h0) + + h5a = time.time() + vp = pf1[vname] + h5 = time.time() + t_var = 1000* (h5-h5a) + + h6a = time.time() + sh = sum(vp) + h6 = time.time() + t_calc = 1000* (h6-h6a) + + t_tot = t_calc+t_var+t_opening + + pf1.close() + return t_opening, t_var, t_calc, t_tot + + + +def do_run(p, fname, vname, s3): + + if s3: + import s3fs + + + # for posix force this to be a comparison from memory + # by ensuring file is in disk cache and ignore first access + # but we then do an even number of accesses to make sure we are not + # 
biased by caching. + n = 0 + datanames = ['h_opening','p_opening','h_var','p_var','h_calc','p_calc','h_tot','p_tot'] + results = {x:0.0 for x in datanames} + while n <2: + n+=1 + + if s3: + h_opening, h_var, h_calc, h_tot = do_s3(h5py, fname, vname) + p_opening, p_var, p_calc, p_tot = do_s3(pyfive, fname, vname) + + else: + h_opening, h_var, h_calc, h_tot = do_inner(h5py, p, vname) + p_opening, p_var, p_calc, p_tot = do_inner(pyfive, p, vname) + + if n>1: + for x,r in zip(datanames,[h_opening,p_opening,h_var,p_var,h_calc,p_calc,h_tot,p_tot]): + results[x] += r + + for v in results.values(): + v = v/(n-1) + + + print("File Opening Time Comparison ", fname, f' (ms, S3={s3})') + print(f"h5py: {results['h_opening']:9.6f}") + print(f"pyfive: {results['p_opening']:9.6f}") + + print(f'Variable instantiation for [{vname}]') + print(f"h5py: {results['h_var']:9.6f}") + print(f"pyfive: {results['p_var']:9.6f}") + + print('Access and calculation time for summation') + print(f"h5py: {results['h_calc']:9.6f}") + print(f"pyfive: {results['p_calc']:9.6f}") + + print('Total times') + print(f"h5py: {results['h_tot']:9.6f}") + print(f"pyfive: {results['p_tot']:9.6f}") + +if __name__=="__main__": + test_speed() + test_speed(s3=True) -v = f2['tos'] -d = v._dataobjects -h4 = time.time() -d._get_chunk_addresses() -h5 = time.time() -print("File Opening Time Comparison") -print(f'h5py: {h2-h1:9.6f}') -print(f'pyfive: {h3-h2:9.6f}') -print(f'Additional times: {h4-h3:9.6f}, {h5-h4:9.6f}') -print(f'Total times: H5 {h4-h3:9.6f}, P5 {h5-h4:9.6f}') diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 39932c9..7920ffa 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -32,12 +32,12 @@ def __init__(self, dataobject): self._order = dataobject.order self._fh = dataobject.fh try: - self._filename = self._fh.name + self._filename = dataobject.filename dataobject.fh.fileno() self.avoid_mmap = False except (AttributeError, OSError): # maybe this is an S3File instance? - self._filename = self._fh.path + self._filename = getattr(self._fh,'path','None') self.avoid_mmap = True self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape @@ -247,14 +247,16 @@ def _get_direct_from_contiguous(self, args=None): We read the entire contiguous array, and pull out the selection (args) from that. This is a fallback situation if we can't use a memory map which would otherwise be lazy. This will normally be when we don't have a true Posix file. + # FIXME: We can probably make this lazy by using the indexer to work out which bytes + # are where ... """ itemsize = np.dtype(self.dtype).itemsize - num_elements = np.prod(self.shape) + # need to impose type in case self.shape is () in which case numpy would return a float + num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize # we need it all, let's get it all (i.e. this really does read the lot) - self._fh.seek(self.data_offset) chunk_buffer = self._fh.read(num_bytes) chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) @@ -302,6 +304,7 @@ def _get_selection_via_chunks(self, args): chunk_coords = tuple(map(mul, chunk_coords, self.chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: + # FIXME: Why do I assume it's always a V1 Btree? 
chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype) out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 23b30f9..d1350dd 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -196,7 +196,7 @@ def __init__(self, filename, mode='r'): raise ValueError( 'File like object must have a seek method') self._fh = filename - self.filename = getattr(filename, 'name', None) + self.filename = getattr(filename, 'name', "None") else: self._fh = open(filename, 'rb') self._close = True From 32ad75de01eeb7f12a095f1fb5e5a8474b9e24be Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 2 Jan 2025 15:57:00 +0000 Subject: [PATCH 045/105] Addressing, I think, upstream issue 53, and includes a test case I should have included previously --- pyfive/h5d.py | 4 ++-- tests/test_file_like.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7920ffa..35ef508 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -259,7 +259,7 @@ def _get_direct_from_contiguous(self, args=None): # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) chunk_buffer = self._fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() chunk_data = chunk_data.reshape(self.shape, order=self._order) return chunk_data[args] @@ -306,7 +306,7 @@ def _get_selection_via_chunks(self, args): if self.filter_pipeline is not None: # FIXME: Why do I assume it's always a V1 Btree? chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) - chunk_data = np.frombuffer(chunk_buffer, dtype=dtype) + chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] if true_dtype is not None: diff --git a/tests/test_file_like.py b/tests/test_file_like.py index 6d46970..6c0f0cf 100644 --- a/tests/test_file_like.py +++ b/tests/test_file_like.py @@ -69,6 +69,9 @@ def write_compressed_tobytes(file_like): f.create_dataset('dataset1', shape=(21, 16), chunks=(2, 2), dtype=' Date: Fri, 3 Jan 2025 10:31:54 +0000 Subject: [PATCH 046/105] Fix location of files so tests run properly from parent directory. 
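
For reference, the idiom applied in this patch resolves test data relative
to the test module itself rather than the current working directory, so the
tests still pass when pytest is invoked from the repository root. A minimal
sketch of the pattern (names as used in the diff below):

    from pathlib import Path
    import pyfive

    DIRNAME = Path(__file__).parent
    p5file = pyfive.File(DIRNAME / 'h5netcdf_test.hdf5')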
--- tests/test_h5netcdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_h5netcdf.py b/tests/test_h5netcdf.py index 140f11d..a92c56c 100644 --- a/tests/test_h5netcdf.py +++ b/tests/test_h5netcdf.py @@ -4,11 +4,13 @@ import pyfive import h5py import warnings +from pathlib import Path +DIRNAME = Path(__file__).parent def test_file_contents(): - p5file = pyfive.File('h5netcdf_test.hdf5') - h5file = h5py.File('h5netcdf_test.hdf5') + p5file = pyfive.File(DIRNAME/'h5netcdf_test.hdf5') + h5file = h5py.File(DIRNAME/'h5netcdf_test.hdf5') expected_variables = [ "foo", From 22333955595d571f6e892037f5b492f9937d491d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 7 Jan 2025 21:08:38 +0000 Subject: [PATCH 047/105] Well, I think this is a fix for #23, and it's so complex I'm committing it now in case my laptop dies and I can't repeat this :-) --- pyfive/btree.py | 37 ++++++++++++++++ pyfive/dataobjects.py | 91 +++++++++++++++++++++++++++++++-------- pyfive/h5d.py | 11 +++-- pyfive/misc_low_level.py | 27 ++++++++++++ tests/data/issue23_A.nc | Bin 0 -> 23279 bytes tests/data/issue23_B.nc | Bin 0 -> 44746 bytes 6 files changed, 144 insertions(+), 22 deletions(-) create mode 100644 tests/data/issue23_A.nc create mode 100644 tests/data/issue23_B.nc diff --git a/pyfive/btree.py b/pyfive/btree.py index 38518fc..5d34258 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -8,6 +8,7 @@ from .core import _padded_size from .core import _unpack_struct_from_file +from .core import _unpack_struct_from from .core import Reference @@ -440,6 +441,25 @@ def _parse_record(self, record): creationorder = struct.unpack_from(" 1: + raise NotImplementedError('Multiple Attribute Info Messages not supported') + offset = attr_info[0]['offset_to_message'] + data = _unpack_struct_from(ATTR_INFO_MESSAGE, self.msg_data, offset) + heap_address = data['fractal_heap_address'] + # I can't find any documentation on this, but at least some + # files seem to use this to indicate no attribute info. + if heap_address == EMPTY: + return {} + name_btree_address = data['name_btree_address'] + order_btree_address = data['creation_order_btree_address'] + heap = FractalHeap(self.fh, heap_address) + ordered = (order_btree_address is not None) + if ordered: + btree = BTreeV2AttrCreationOrder(self.fh, order_btree_address) + else: + btree = BTreeV2AttrNames(self.fh, name_btree_address) + adict = dict() + for record in btree.iter_records(): + data = heap.get_data_v2(record) + name, value = self._parse_attribute_msg(data,0) + adict[name] = value + return adict + + def unpack_attribute(self, offset): """ Return the attribute name and value. """ + return self._parse_attribute_msg(self.msg_data, offset) - # read in the attribute message header - # See section IV.A.2.m. The Attribute Message for details - version = struct.unpack_from('> 4) & 3 # bit 4-5 + version = firstbyte >> 6 # bit 6-7 + match idtype: + case 0: # managed + if version != 0: + raise ValueError('Unexpected version in fractal heap') + obytes = self._managed_object_offset_size + offset = _unpack_integer(obytes, record['heapid'],1) + sbytes = self._managed_object_length_size + size = _unpack_integer(sbytes, record['heapid'], obytes+1) + return self.managed[offset:offset+size] + case 1: # tiny + raise NotImplementedError + case 2: # huge + raise NotImplementedError + case _: + raise NotImplementedError + def _min_size_integer(self, integer): """ Calculate the minimal required bytes to contain an integer. 
""" return self._min_size_nbits(integer.bit_length()) diff --git a/tests/data/issue23_A.nc b/tests/data/issue23_A.nc new file mode 100644 index 0000000000000000000000000000000000000000..494a90c8fccd29adb9dc718900551a4021e053a2 GIT binary patch literal 23279 zcmeHP3s6+o89sM+fd!UF6s0~YtHej;QCL=^78GV#A4R|cCW(@;F6@=v6n5$ELL{Rm zZPR8FlT0SVkV+}BO{OMws?dO%q{#%^ns%zSO;t=WX(yN%(ni~~2}vo~^PhA6U0z#Y zYMjxYGu-{}dH?4-|2ccP|F`Gijrr?iqUJ_v^m>Bfst8xM@stda@#D0oGK)9nmuL)~ znnf&48!57|%2*ktAqo_kHJ|9H&}zuo5*RlugK}D8AbKcj@wgjQx8LP)`^Ze15kW5v zy)5g~GSf3n@$vMY^g^37Sh*lPb)A++S%Yv3^=Rn&B10pQ z2mNYE0>y%=X)99wRgN8Ii^t=2xZHNX>Lc+~2R?Qq8wAyX1(Xmmq!6lzmV^sHH|Ld> zGAIE$fFyrYp|z-VQ*n{4)RMQE>G#PSjp^omSFOu2x(jHJ1@DP3*KiEU2n+?l2s9BG zE=&q2nW!g;_tO@_m_2BtI5sQ8d8>A0k#|L5+zd^G9jN{BjcdVv@8`G zxp!zza&Y9g!M;gD2N%S!0J<4(gRq%E_Y?>yd8j?62EiPS2`ysV+6*1&XFy4D^4o0C$UvS3R^{l7HU&jb-(1&z}1 z3Kwj`v{T($pbmWT>1YM9rPaG#ey(i{%>->7YPDBYeKxnp>vw`Zgayaw^V{7HyVqfJ z+iO)eMrLTV7M)3#zuuwJ+s#d>Vm4@GOxR#4-ppp2eWCIWp$N|A6;U1+2BzcYhH;c1 z3pnt>0MoI2v}h00v3-p5sZpn|rc68x!b2cD_!+3*ESx&2BDf=t`@_e2r~AnPO<@`ZxV20fWw zi+Qe96bhdWH;N8je_HBNM0Xu8QhVXUoM8PH;bfdIJjdXwoU$hrbfw{VIZ7JZ<7CLbuQb$m_yvfX>hvPso)A)@}gI zk;4FM4uZbM<7P|hd0}dN_{YD*v&*>}(tSo}0ZK1VkxE^eeq-|%*1DgQ@ z8B`B#$zsD^E=-S>ZQFp{l~S?SVIHpM|3XvDfN@U1)|f?;8SwT-pV8-XSE)u*M&_!t zOjBA$CQR1|X3BWh28;u0`zhs%h#iYYCkr#3oqcRG5L>n2IT02!UAuy#)5!x9+RIE0f~S_Kq4R!kO)WwBmy@&0`N$= z-{d6UjEI{%Hz8p`_WAZ7e53nRT59UTg=uL^5A5H6?AW8-3VmreeBX@Kp`J47kL4x| zE5vI<%*3Zjl$b81L_i`S5s(N-1SA4C4+6~j6>HAkaz6fV-In92#wDw>4nKEBi6Xn( z^k?`u*X8c4IN17+gKxYv_>`k*#U1%)`fO`k)-75PU6}o1fypxX*t1<9x7yAf{nMTI zp6=My`0V9(hg#P(Hx2frl{TN-_qwh;y>?Ied%rey!V?!)G$t+ed~mAok3BEjKPr7J zd2r>i6a7a$rlH~cN-HW(T>ROSJ&j3O@Aagfe1Bi3^TdVzz0HShuRdS-^ET~ZTy1aH zk=n~I{c>K9`<|9NJDeXK-?Vpj!<#cB$+6_tLpP6!$dO0{BmxoviNOCI0-T(buTdBg zZ{!(F_I+WL0d~_gjvhmg8cJvY z?hT7M{WfL-CN}}n4wDG+@A65s9QvvzB;wawZm0qbJo)% z%3S_N1O1sAP)x_2odZNV&>|Dm0HT~YR<3HqA!@*_8_Byunp_OmxJs?5vDK=6r^mt1 zj>yMZJLJm+h3{1Y6F!w7)FOn}Na>+R?|Ab0^KQO|n%yeWyaQx=d1 zhH>vNw(Yw1SA3y Q0f~S_Kq4R!xRDX~4@yw}H2?qr literal 0 HcmV?d00001 diff --git a/tests/data/issue23_B.nc b/tests/data/issue23_B.nc new file mode 100644 index 0000000000000000000000000000000000000000..bcc505c8133d80a4549892af0dd02c52d39999db GIT binary patch literal 44746 zcmeHQ3v?7!n!cS*VrUW`3Wx}LL3Rn6GkJmdOVKch>Sb4M;%>PSJxF4XB-iAXV-O)iZkF*M$hUvqvx!TJ+l9QAJtuX zK?FG?-V5nlb>H{9|NZZ+|Nn2*=7w;6L10=SD=&{ocTl(&AK)P+B7#2m(g!OV!mF|h zUeC^&#RC2LBCh#zelhoS?!PPU16jm}A(N+*Z1DUO$w@gp#eKrxEqrn%CK%S`PABAq zVPi2O_v8_xb5A`)_S`AMCpfaqrwm^(Huxw05t^-{6aor?V~4W**#vbSzP@NOEm^l74TGoqo-4yS$o+H_03Qg`6(}eNZuoT5e!XQNqV6r zgmZ6lkznidL;~qg`NDr&n09$ zc#~%wSV5*>3)}nzAuIgl{?()pvz*EKW%$5LYskGAaDk5v$D1EKauvA;TY3&Kd+@$@ z_4C(|Fa|7yl?A*9P7G}!S7R-IPslj%#=Q3QR&p*}5QDEnd*-dXmjI`S`tXO410}`3 zA{#LCY(5b6dpdqaQds=o5rSRXv9S9A@(Vzf{0vEgkV`k5@mq2yruY+w`lCx4e^0)| zZib;s@ZS5k?QfIwFvS8w0^oHvk-GjJi2t9=689EQOcnH)(&+DIk`sc^rz#XL?@h#!WKHnrPFw5uj;VeNXeg7n6 zBNOR)Y#tL*Ln_9Sii0rga>ikd|14w#+}*u+>=X>k%EB4TXFmEq#wO)9GM}7L$Cc**ieQ}dD)O|8wL+QtaUUyI9v9~e8!&{VrT!qV4>^evr+L*c(h zO+6V3*VCBMrJIgnnY1mHq^6ZjM|Eqqp|!>JAeF9+V1u%t{Bf`_K!aMF^kg*FUJ*9B zj2M{!V+n!aC~F*$3b>;8s_)|%uJie3V@MVdA_Ez(F#y~^9}y#@QczGc-hdk|P3E!Z z4u6(^Cj*jKgH<8bN%;N4!w+jFE?<+Nw=jcbZ$p-4#|%?T>W;V1L(DHM0CC9wJ|CFz zFmzB#deI~|yx=094!08M2Bv3BAR}D1hq0)P za9QtUECie^N0!e+37IM@k^#zB2q**;0tx|zfI>hapb$_9C!o?-%#dE@^yroJIu=PJjUAY#kD>(BFp zwuDL9wCH-%>WS;I4v?iKnY`^>C=UfMwASpSDMzPPJ6#!CPGee9qfV!lipOX=n@*FR zI*nRp(l$U0htfE!J-`AnQ^K|`)}tVqGh?g}Ct;b6WmBy~H=~B`&>kb%$x^pl@wnAv zm>mHcHIf>ZR2`smnAFeK9bL1do!A_QHv|gqHez~=w)OGKG+T?u^*9a1jV=&x(dAZ5 
zj|bca6!!G=1fw0XU@EHFK|Pk5mxy-FEuu~;k+AG!x}X*9?M4(k9iVlF6}LK!s1|4K zkC@$tZJ7uZgshTn0(cOD)xqW<4eJTbPGSmHK*ZGTjy{$dlY|W$u&``wUORxWShpMi z!HhzuL@7;b6_AK zs~EaAof>8|ovu;ZY0-*IvmbI-865!--3e~Qng_0U+2)d-(jA(0xl{m10jwfg zSXNR}GPjxrv}6~prlHXC^5BBt0*L6cVn(~6$6&U%RFpNARt2o62}xG2SXmk@@x+%` zRrHosRrQva1&|3qF-t;`B4lqcvM{O3D%W=boj6vUv$_u3h@Blk9YZy3)^!>Q(om2# zq|Eg?n|e!wbQy4k)Oz4-(My9sW|XC-RF7+be6+aRaH3YqDNgWv3XKIOhLaIk9>qrR z46!~_>oTHFkV2U&JNw{bMO4>JtHTZ!(b9zrD}#YJFxhI_*t{IzfS$S}sP_W-p|T{| z3aSniV$YQ0VC%((83XDDeMy|NaVt*K8T_y=v9S@1To!=JZXBN_D?=eF-HST1BR-h3{g#-Al!6{&A%ft%4d@BGJ{w>3WYOLzWWf=LITzvb*W{tMV2jLY<`-Srdj2|(yGdmV9CPr zl7&@4GYWINFACeA08D9Uswjg`=z60UT2orSpc1&m8roG{7FgV{*uhBGRa21)N~PUe zJf%Bl)0p0Fn2;3+Bns0RHW|Pg1gQkNt$0*3fqBZhszG&W>Vf7Jy|A(w4lW6-j%wJO z#M=SO*3yMl^R+hCS_d{GP%^Ax*bh;otO(Er#=!O~l4CizWrD!~egfRt=!R=txnO72 z*i8#zD&VZ>)HU3%P`z7&y%3uoz(~u2^Gizg1#~H0-%(r&8QKSBXop<^j21e`P#!EV zuhi$e8SINX%d8*J!We34`O@7371|s;$WRLN7zzhu%UwbPnt~ltA0xDlk{E4hAoc=D ziV;v$gMBH|&VsT4?f?y-46Q~kD6kzcX}xU#hXj^gMoUYp%POj&u9e{yWFBn()&nH4 zV`Oj?;id{IVBE~sFM(0S`sCgSXA9G!LObHdr3|1!+Ee~WARi65@AfJQ+_%+AuwKb2 zror+Bn=MyE4(o87Pxx7)-<+XYsU*^Q!p6NNq5utN4Xs^O0SOP`>Km41?Ef)w{~VsB z{|w;^BP^-Fk8EG|Ov|cB1dStPnDnReEW`d5FY-nqn?NUhsR-qv>Q^D45Kssx1QY@a z0fm4<;M{!usD9ZU8K<5WwS!rmr)*J!s~Q3vw18%UuVyEFv>71w7xN$mE6Id8^2eFaVBR zpap2zd-wK{tKovc6AwO-v|e{rKe-13pbVr9J^$QW$x9dz6nGhT|C-#7Wn9iRG*dqP z@JW6qp%#88gp#dB-Y~fIO7op(GC#dFaD4 z&~qODRNA_KjV`Gyqd=hdhtYL4Ilgh@rnK$fe_~P1AM>)wbJ0m3uU&!5p^qdCK8%3O30VgUbfBceI!oZGG%W~6IXo5>)A~@13R*x4@-UE; z7K1Q2z(vM!%j{@%1;NO{W#z8C7!?yS5S9*?ObDM*Ziwla2ueznO)IWz3AdM8ui=%^ zx45dAHq=7$NMxn56|HR$iqifPJQ)D;>By=`Qz+8PP6jN1RJh!sz|ql@gvcWT?!gU` zhw*%2qC=h>*e%aLLOOJdFXO)YJ^m3GP6#PaVGKlI*HUi?5-B<*bmTawUxk1|Kp~(I zPzW542r$dYhs3d49~g7~6?^&o$CEx9KhO#^x0?fyHpV>bxv`623ktz}+whx>kM?nC z*3B75f29;lZY>7gA2sQgd;Irxv8%zVuv*;B9Fya}SQ$Py9o^%<*nBKi9tXy!iBTE$ zVzXu!ta|4t+6p7IMNenrQ;~3gc>lkf@Ue8`#*;4Fwd=wQrqA#dEX`Up+>R84R3p0a zKk!BmBpbCu_2Q%M6aoqXg@8gpA)pXY2z=WSz`bhTiP#IjPnuH7JzxIysPa6TGIY7S zl)jXqOC`u}8aSTvK5BUH*Y3Wem_18L<+Bg8y|dXP&$AQ$_b-RgT#M%?_q|lE-kFQ} zc~$T$UghK74|f6C+?>3{sJSk8chuW=7=s?~3YcOT>yWkVoO9#31NGYl7xFsOsb0S0 zntfcqEqfs`lD=%^K>c>RP{d2Uw)7+Gwdq2?{R!p;-L$LU1`AuJ&PIdaq1D;bFa_wQ zU3K<441lK%B%ZW78y+LrmJ72r!o|!Xep0JaUyKX5cV%84Ql8u$H<y^p7lk9T zc0kg=Lg1Xx;Tk)5>P#Cu$b+9Ty~EaZr`3cr*PX+dIUq5LTH$DF3{J?1qqKuf9R?>A zZ;u>g%5dROrVJU`zi~?jx$A~!-N5{xjA{!s6-FDd(i7Xx${D<_?9(`2Cyx0W+ej^-`Cy7W2Kl<`tzOf@N!%B z2|s3rU6`TxuE)Y5;UN^?MHz~34YSN1z2Y16?X2T?j{C=0#z>0q;-@ZRXNY-^)bEiZ zaHNWFIe3`_#1-FyG)N{TkxUx#4EBSMZvWkp_C@6%Jpx=rG2iE7Q;=x}4|P7`%=B~b z-HWBk)ODMA-7-!}8FB`_pH3QAi}S{qFWZpVD%M%rU@6krEwOQBkp|ciXmy z_PzO-m6cUi_TrZxemJ8BKBQX`7|RR-q!kCtZB97l$46ukfWzfZim&tf0~~9_1>Q>l z1s&Gg4?K0ICt|)RNg4#`@H*sS<^_(KsI=oy1_48)9d0sSFrr0(Wq#up0n+Xgy2qTf z2yh8cMkxQgh`?}?WqgaqP%gRI8!DFuxoUc2)ty2>A)pXY2q**;0>=Xa+{k#^w9`r+ z+B~mNI5`jqJn7ko_jT@`xnYJb5AQ_H@(?4N^M zownEZ{Pc{!{v!2XPyg|I1#54QPU&l_TO7S_(SZX`etyLpZ=Dt#x8SQAcT|70>EXBj zrRJvFPk6Sk@AHP8K6or!w||~EPIEk9srsN0PzWdl6aor?Z#e=?36pcj6^(`D7U;?p za`$KB>&RrdJemCV^L1g;1DB`r)1;tQA*YZ%5AD5yXe{lQyRLtRMBs8dF>;D_k+)cA z_UpTUPo86GzgkpqfZPw4r;-7tQN~^(UZ3+YjzI>toPF_YjRpS5NIh)&{$X-aMC4E`zKpT`D z!`Ptg+Aw~WutB*D^TJF0k-*u<1!iHPVQ!T2Y5(c7$vnIp*%x4ZkC=U_hMfN29mi{67D5z!C@q zho27E`g!C9ey4uF5d<=APz3JYam%C(4-t6An`F|9XPikUop=U3XX^(uzL7quLJuDS zrbU+q2`t;bF%NDZl@?v@j--#JARobdSDT*JY~z&2X!yYGkY*leEP(NvVYkA^qT$oL zS~6vibjp2%9Fy?9`|PWs@Sy&7_3}esdc71Rn~853qK@XQVsftJ34)n*z}1@RH?`rsgFpn)tT{PJz7m zEdf-H;szBvEXPaw*l~Hf=Onn!yoVA!_qY*O?l?ez&Hv`h(>*!=X8%%Owxo${B)E`x zXz^vsxsl7icX6$FeDP%?ZK4~)zT}Euy>Q>E<-Rx853kAiY(7ut%a#vOc@WA>N#qM4 zeA#mMy)5coDB@c(CM>5u>b literal 0 
HcmV?d00001 From 1e2c424615b46d633d1e42e034df4dc41292c6a2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 07:47:01 +0000 Subject: [PATCH 048/105] Cleaned up issue23 fixes, all tests pass --- pyfive/dataobjects.py | 5 ++- pyfive/misc_low_level.py | 41 ++++------------------ tests/test_issue23_nc_attrs.py | 63 ++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 37 deletions(-) create mode 100644 tests/test_issue23_nc_attrs.py diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index fdaa86d..6386b6f 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -156,7 +156,6 @@ def get_attributes(self): return attrs def _get_attributes_from_attr_info(self, attrs, attr_info): - EMPTY = 18446744073709551615 #assume we only have one of these if len(attr_info) > 1: raise NotImplementedError('Multiple Attribute Info Messages not supported') @@ -165,7 +164,7 @@ def _get_attributes_from_attr_info(self, attrs, attr_info): heap_address = data['fractal_heap_address'] # I can't find any documentation on this, but at least some # files seem to use this to indicate no attribute info. - if heap_address == EMPTY: + if heap_address == UNDEFINED_ADDRESS: return {} name_btree_address = data['name_btree_address'] order_btree_address = data['creation_order_btree_address'] @@ -177,7 +176,7 @@ def _get_attributes_from_attr_info(self, attrs, attr_info): btree = BTreeV2AttrNames(self.fh, name_btree_address) adict = dict() for record in btree.iter_records(): - data = heap.get_data_v2(record) + data = heap.get_data(record['heapid']) name, value = self._parse_attribute_msg(data,0) adict[name] = value return adict diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 6772084..e4c8786 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -249,42 +249,14 @@ def get_data(self, heapid): idtype = (firstbyte >> 4) & 3 # bit 4-5 version = firstbyte >> 6 # bit 6-7 data_offset = 1 - if idtype == 0: # managed - assert version == 0 - nbytes = self._managed_object_offset_size - offset = _unpack_integer(nbytes, heapid, data_offset) - data_offset += nbytes - - nbytes = self._managed_object_length_size - size = _unpack_integer(nbytes, heapid, data_offset) - - return self.managed[offset:offset+size] - elif idtype == 1: # tiny - raise NotImplementedError - elif idtype == 2: # huge - raise NotImplementedError - else: - raise NotImplementedError - - - def get_data_v2(self,record): - """ - Updated version utilises the full structure view of the - record, currently only used for attribute fractal heaps. - This is transition code. 
- """ - firstbyte = record['flags'] - reserved = firstbyte & 15 # bit 0-3 - idtype = (firstbyte >> 4) & 3 # bit 4-5 - version = firstbyte >> 6 # bit 6-7 match idtype: case 0: # managed - if version != 0: - raise ValueError('Unexpected version in fractal heap') - obytes = self._managed_object_offset_size - offset = _unpack_integer(obytes, record['heapid'],1) - sbytes = self._managed_object_length_size - size = _unpack_integer(sbytes, record['heapid'], obytes+1) + assert version == 0 + nbytes = self._managed_object_offset_size + offset = _unpack_integer(nbytes, heapid, data_offset) + data_offset += nbytes + nbytes = self._managed_object_length_size + size = _unpack_integer(nbytes, heapid, data_offset) return self.managed[offset:offset+size] case 1: # tiny raise NotImplementedError @@ -293,6 +265,7 @@ def get_data_v2(self,record): case _: raise NotImplementedError + def _min_size_integer(self, integer): """ Calculate the minimal required bytes to contain an integer. """ return self._min_size_nbits(integer.bit_length()) diff --git a/tests/test_issue23_nc_attrs.py b/tests/test_issue23_nc_attrs.py new file mode 100644 index 0000000..41749ce --- /dev/null +++ b/tests/test_issue23_nc_attrs.py @@ -0,0 +1,63 @@ +import pyfive +import h5py +import numpy as np + +## Test files provided as part of https://github.com/NCAS-CMS/pyfive/issues/23 + + +def _compare_var_attrs(p5file, h5file): + """ Compare attributes, which ought to bet he same thing except for the + dimension lists which have package dependent attributes """ + assert p5file.keys() == h5file.keys() + for k,v in p5file.items(): + vv = h5file[k] + if k == 'DIMENSION_LIST': + assert len(v) == len(vv) + else: + if isinstance(v, np.ndarray): + assert np.all(v == vv) + else: + assert v == vv + + + +def test_A_works(): + """ This file behaves.""" + with pyfive.File('tests/data/issue23_A.nc') as hfile: + + file_attrs = hfile.attrs + q_attrs = hfile['q'].attrs + + + with h5py.File('tests/data/issue23_A.nc') as hfile: + + file_attrs2 = dict(hfile.attrs) + q_attrs2 = dict(hfile['q'].attrs) + # note that unless we explicitly copy these to dicts, + # they cannot be referenced after the file is closed. + + assert file_attrs == file_attrs2 + _compare_var_attrs(q_attrs, q_attrs2) + + +def test_B_fails(): + """ This file fails """ + + with pyfive.File('tests/data/issue23_B.nc') as hfile: + + file_attrs = hfile.attrs + t_attrs = hfile['tas'].attrs + + with h5py.File('tests/data/issue23_B.nc') as hfile: + + file_attrs2 = dict(hfile.attrs) + t_attrs2 = dict(hfile['tas'].attrs) + # note that unless we explicitly copy these to dicts, + # they cannot be referenced after the file is closed. 
+ + assert file_attrs == file_attrs2 + _compare_var_attrs(t_attrs, t_attrs2) + + + + From 6da5fda5a39f3a9072c6f09c37fadef72b28e59f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 09:54:45 +0000 Subject: [PATCH 049/105] Test localisation, and a new test for laziness outside a context manager --- .gitignore | 4 +++ tests/test_issue23_nc_attrs.py | 14 +++++------ tests/test_lazy_variables.py | 46 ++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 7 deletions(-) create mode 100644 tests/test_lazy_variables.py diff --git a/.gitignore b/.gitignore index bcc99b3..2b42de6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ .coverage .pyc build +__pycache__ +*.egg-info +.idea +.DS_Store diff --git a/tests/test_issue23_nc_attrs.py b/tests/test_issue23_nc_attrs.py index 41749ce..2d1eb18 100644 --- a/tests/test_issue23_nc_attrs.py +++ b/tests/test_issue23_nc_attrs.py @@ -1,9 +1,11 @@ import pyfive import h5py -import numpy as np +import numpy as np +from pathlib import Path ## Test files provided as part of https://github.com/NCAS-CMS/pyfive/issues/23 +HERE = Path(__file__).parent def _compare_var_attrs(p5file, h5file): """ Compare attributes, which ought to bet he same thing except for the @@ -19,17 +21,15 @@ def _compare_var_attrs(p5file, h5file): else: assert v == vv - - def test_A_works(): """ This file behaves.""" - with pyfive.File('tests/data/issue23_A.nc') as hfile: + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: file_attrs = hfile.attrs q_attrs = hfile['q'].attrs - with h5py.File('tests/data/issue23_A.nc') as hfile: + with h5py.File(HERE/'data/issue23_A.nc') as hfile: file_attrs2 = dict(hfile.attrs) q_attrs2 = dict(hfile['q'].attrs) @@ -43,12 +43,12 @@ def test_A_works(): def test_B_fails(): """ This file fails """ - with pyfive.File('tests/data/issue23_B.nc') as hfile: + with pyfive.File(HERE/'data/issue23_B.nc') as hfile: file_attrs = hfile.attrs t_attrs = hfile['tas'].attrs - with h5py.File('tests/data/issue23_B.nc') as hfile: + with h5py.File(HERE/'data/issue23_B.nc') as hfile: file_attrs2 = dict(hfile.attrs) t_attrs2 = dict(hfile['tas'].attrs) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py new file mode 100644 index 0000000..0ca3feb --- /dev/null +++ b/tests/test_lazy_variables.py @@ -0,0 +1,46 @@ + +import pyfive +from pathlib import Path +import numpy as np + +HERE = Path(__file__).parent + +def test_attributes_outside_context(): + """ Pyfive should be able to access attributes outside the context manager. """ + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + + file_attrs = hfile.attrs + fdict = dict(file_attrs) + q_attrs = hfile['q'].attrs + qdict = dict(q_attrs) + + fdict1 = dict(file_attrs) + qdict1 = dict(q_attrs) + + assert fdict1 == fdict + assert qdict1 == qdict + +def test_file_data_oustside_context(): + """ Pyfive should be able to access data outside the context manager. + The data variable should be capable of reopening a closed file when + it needs access to the data. This mode should support thread + parallelism without the need for a lock. + """ + + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + + qdata = hfile['q'] + qdata1 = qdata[...] + qsum1 = np.sum(qdata1) + + qdata2 = qdata[...] 
+ qsum2 = sum(qdata2) + + assert qsum1 == qsum2 + + + + + + + From 59e8667807e3dd4e3e7c871bcad234cbad3709d9 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 10:09:15 +0000 Subject: [PATCH 050/105] Changes to support out of context variable access as described in #24 --- pyfive/h5d.py | 56 +++++++++++++++++++++++++++++++----- tests/test_lazy_variables.py | 2 +- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ccf91a1..bf02185 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,18 +23,27 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject): + def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. + + if pseudo_chunking_size_MB is set to a value greater than zero, and + if the storage is not local posix (and hence np.mmap is not available) then + when accessing contiguous variables, we attempt to find a suitable + chunk shape to approximate that volume and read the contigous variable + as if were chunked. This is to facilitate lazy loading of partial data + from contiguous storage. """ self._order = dataobject.order - self._fh = dataobject.fh + self.__fh = dataobject.fh + try: dataobject.fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False + self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? @@ -44,6 +53,7 @@ def __init__(self, dataobject): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True + self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -116,6 +126,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + + + match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -249,17 +262,31 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - We read the entire contiguous array, and pull out the selection (args) from that. - This is a fallback situation if we can't use a memory map which would otherwise be lazy. - This will normally be when we don't have a true Posix file. - # FIXME: We can probably make this lazy by using the indexer to work out which bytes - # are where ... + If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks + otherwise we have to read the entire array. This is a fallback situation if we + can't use a memory map which would otherwise be lazy. This will normally be when + we don't have a true Posix file. We should never end up here with compressed + data. 
""" + def __getstride(): + """ Determine an appropriate chunk and stride for a given pseudo chunk size """ + stride = 1 + chunk_shape = np.ones(self.rank, dtype=int) + for i in range(self.rank): + stride *= self.shape[i] + chunk_shape = box[:i] = self.shape[:i] + if stride*self.dtype.itemsize > self.pseudo_chunking_size: + stride //= self.shape[i] + chunk_shape = box[:i] = self.shape[:i-1] + return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize + + if self.pseudo_chunking_size: + stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) @@ -323,6 +350,21 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') return out + + @property + def _fh(self): + """ + When the parent file has been closed, we will need to reopen it + to continue to access data. This facility is provided to support + thread safe data access. However, now the file is open outside + a context manager, the user is responsible for closing it, + though it should get closed when the variable instance is + garbage collected. + """ + if self.__fh.closed: + self.__fh = open(self._filename, 'rb') + return self.__fh + class DatasetMeta: diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0ca3feb..0d22d73 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -34,7 +34,7 @@ def test_file_data_oustside_context(): qsum1 = np.sum(qdata1) qdata2 = qdata[...] - qsum2 = sum(qdata2) + qsum2 = np.sum(qdata2) assert qsum1 == qsum2 From 34a684ab57f6a3d6f7c7ffd782123c6929bee052 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 10:34:48 +0000 Subject: [PATCH 051/105] Removing the pseudo chunking stuff that snuck into the last commit --- pyfive/h5d.py | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..7fd7c11 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,17 +23,11 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject, pseudo_chunking_size_MB=4): + def __init__(self, dataobject): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. - if pseudo_chunking_size_MB is set to a value greater than zero, and - if the storage is not local posix (and hence np.mmap is not available) then - when accessing contiguous variables, we attempt to find a suitable - chunk shape to approximate that volume and read the contigous variable - as if were chunked. This is to facilitate lazy loading of partial data - from contiguous storage. """ self._order = dataobject.order @@ -42,8 +36,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): try: dataobject.fh.fileno() self._filename = dataobject.fh.name - self.avoid_mmap = False - self.pseudo_chunking_size = 0 + self.avoid_mmap = False except (AttributeError, OSError): try: # maybe this is an S3File instance? 
@@ -53,7 +46,6 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True - self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -126,9 +118,6 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ - - - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -262,31 +251,14 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks - otherwise we have to read the entire array. This is a fallback situation if we - can't use a memory map which would otherwise be lazy. This will normally be when - we don't have a true Posix file. We should never end up here with compressed - data. + This is a fallback situation if we can't use a memory map which would otherwise be lazy. + At the moment it is very unlazy. """ - def __getstride(): - """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - stride = 1 - chunk_shape = np.ones(self.rank, dtype=int) - for i in range(self.rank): - stride *= self.shape[i] - chunk_shape = box[:i] = self.shape[:i] - if stride*self.dtype.itemsize > self.pseudo_chunking_size: - stride //= self.shape[i] - chunk_shape = box[:i] = self.shape[:i-1] - return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize - - if self.pseudo_chunking_size: - stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) From 64827c4ebb0f0b8adce2136fbfbfa2f9fde14f69 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 12:45:00 +0000 Subject: [PATCH 052/105] catching up to the main trunk in h5netcdf --- tests/test_lazy_variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0ca3feb..0d22d73 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -34,7 +34,7 @@ def test_file_data_oustside_context(): qsum1 = np.sum(qdata1) qdata2 = qdata[...] - qsum2 = sum(qdata2) + qsum2 = np.sum(qdata2) assert qsum1 == qsum2 From 20693b9ea53697fca7135369ed50c904f6f4d3d1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 8 Jan 2025 12:45:35 +0000 Subject: [PATCH 053/105] Starting to sketch out the pseudo chunking --- pyfive/h5d.py | 56 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index ccf91a1..bf02185 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -23,18 +23,27 @@ class DatasetID: instance, it is completely independent of the parent file, and it can be used efficiently in distributed threads without thread contention to the b-tree etc. """ - def __init__(self, dataobject): + def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ Instantiated with the pyfive datasetdataobject, we copy and cache everything we want so that the only file operations are now data accesses. 
+ + if pseudo_chunking_size_MB is set to a value greater than zero, and + if the storage is not local posix (and hence np.mmap is not available) then + when accessing contiguous variables, we attempt to find a suitable + chunk shape to approximate that volume and read the contigous variable + as if were chunked. This is to facilitate lazy loading of partial data + from contiguous storage. """ self._order = dataobject.order - self._fh = dataobject.fh + self.__fh = dataobject.fh + try: dataobject.fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False + self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? @@ -44,6 +53,7 @@ def __init__(self, dataobject): # failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') self.avoid_mmap = True + self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -116,6 +126,9 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + + + match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -249,17 +262,31 @@ def _get_contiguous_data(self, args): def _get_direct_from_contiguous(self, args=None): """ - We read the entire contiguous array, and pull out the selection (args) from that. - This is a fallback situation if we can't use a memory map which would otherwise be lazy. - This will normally be when we don't have a true Posix file. - # FIXME: We can probably make this lazy by using the indexer to work out which bytes - # are where ... + If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks + otherwise we have to read the entire array. This is a fallback situation if we + can't use a memory map which would otherwise be lazy. This will normally be when + we don't have a true Posix file. We should never end up here with compressed + data. """ + def __getstride(): + """ Determine an appropriate chunk and stride for a given pseudo chunk size """ + stride = 1 + chunk_shape = np.ones(self.rank, dtype=int) + for i in range(self.rank): + stride *= self.shape[i] + chunk_shape = box[:i] = self.shape[:i] + if stride*self.dtype.itemsize > self.pseudo_chunking_size: + stride //= self.shape[i] + chunk_shape = box[:i] = self.shape[:i-1] + return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize # need to impose type in case self.shape is () in which case numpy would return a float num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize + + if self.pseudo_chunking_size: + stride = __getstride() # we need it all, let's get it all (i.e. this really does read the lot) self._fh.seek(self.data_offset) @@ -323,6 +350,21 @@ def _get_selection_via_chunks(self, args): raise NotImplementedError('datatype not implemented') return out + + @property + def _fh(self): + """ + When the parent file has been closed, we will need to reopen it + to continue to access data. This facility is provided to support + thread safe data access. However, now the file is open outside + a context manager, the user is responsible for closing it, + though it should get closed when the variable instance is + garbage collected. 
+ """ + if self.__fh.closed: + self.__fh = open(self._filename, 'rb') + return self.__fh + class DatasetMeta: From 4126e2b6410fe20ff3cda2e575eaa297602e26f5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 14:55:43 +0000 Subject: [PATCH 054/105] threadsafe data access --- pyfive/h5d.py | 58 ++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..717abb2 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -37,21 +37,21 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): """ self._order = dataobject.order - self.__fh = dataobject.fh + fh = dataobject.fh try: - dataobject.fh.fileno() + fh.fileno() self._filename = dataobject.fh.name self.avoid_mmap = False self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: # maybe this is an S3File instance? - self._filename = getattr(self._fh,'path') + self._filename = getattr(fh,'path') except: # maybe a remote https file opened as bytes? # failing that, maybe a memory file, return as None - self._filename = getattr(self._fh,'full_name','None') + self._filename = getattr(fh,'full_name','None') self.avoid_mmap = True self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline @@ -128,7 +128,6 @@ def get_data(self, args): - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") @@ -142,6 +141,7 @@ def get_data(self, args): # this is lazily reading only the chunks we need return self._get_selection_via_chunks(args) + def iter_chunks(self, args): """ Iterate over chunks in a chunked dataset. @@ -239,9 +239,11 @@ def _get_contiguous_data(self, args): try: # return a memory-map to the stored array # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(self._fh, dtype=self.dtype, mode='c', + fh = self._fh + view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - result = view[args] + result = view[args].copy() + fh.close() return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) @@ -251,15 +253,19 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - + fh = self._fh ref_addresses = np.memmap( - self._fh, dtype=(' Date: Thu, 9 Jan 2025 14:17:18 +0000 Subject: [PATCH 055/105] add deps for mock s3 test --- setup.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 7d2df77..dbb98e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,7 @@ url = https://github.com/ncas-cms/pyfive author = Jonathan J. 
Helmus author_email = jjhelmus@gmail.com (with additions from Bryan Lawrence) maintainer = Valeriu Predoi -maintainer_email = V.Predoi@ncas.ac.uk +maintainer_email = valeriu.predoi@ncas.ac.uk license = BSD classifiers = Development Status :: 4 - Beta @@ -28,7 +28,9 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + moto numpy + s3fs [options.extras_require] testing = From c7058b6fdb16f583971d50fb8c40a5afedce7218 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 9 Jan 2025 14:17:38 +0000 Subject: [PATCH 056/105] add mock s3 test --- tests/test_mock_s3fs.py | 129 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 tests/test_mock_s3fs.py diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py new file mode 100644 index 0000000..931355a --- /dev/null +++ b/tests/test_mock_s3fs.py @@ -0,0 +1,129 @@ +import os +import s3fs +import pathlib +import json +import moto +import pyfive +import pytest + +import h5netcdf + +from moto.moto_server.threaded_moto_server import ThreadedMotoServer + + +# some spoofy server parameters +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port +test_bucket_name = "test" +versioned_bucket_name = "test-versioned" +secure_bucket_name = "test-secure" + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + +@pytest.fixture(scope="module") +def s3_base(): + # writable local S3 system + + # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests + ##### + # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + ##### + server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) + server.start() + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + os.environ.pop("AWS_PROFILE", None) + + print("server up") + yield + print("moto done") + server.stop() + + +@pytest.fixture() +def s3fs_s3(s3_base): + """ + Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. 
+ Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + """ + client = get_boto3_client() + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") + client.put_bucket_versioning( + Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} + ) + + # initialize secure bucket + client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") + policy = json.dumps( + { + "Version": "2012-10-17", + "Id": "PutObjPolicy", + "Statement": [ + { + "Sid": "DenyUnEncryptedObjectUploads", + "Effect": "Deny", + "Principal": "*", + "Action": "s3:PutObject", + "Resource": "arn:aws:s3:::{bucket_name}/*".format( + bucket_name=secure_bucket_name + ), + "Condition": { + "StringNotEquals": { + "s3:x-amz-server-side-encryption": "aws:kms" + } + }, + } + ], + } + ) + + client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) + s3fs.S3FileSystem.clear_instance_cache() + s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + + yield s3 + + +def test_s3file_with_s3fs(s3fs_s3): + """ + This test spoofs a complete s3fs FileSystem via s3fs_s3, + creates a mock bucket inside it, then puts a REAL netCDF4 file in it, + then it loads it as if it was an S3 file. This is proper + Wild Weasel stuff right here. + """ + # set up physical file and Path properties + ncfile = "./tests/data/issue23_A.nc" + file_path = pathlib.Path(ncfile) + file_name = pathlib.Path(ncfile).name + + # use mocked s3fs + bucket = "MY_BUCKET" + s3fs_s3.mkdir(bucket) + s3fs_s3.put(file_path, bucket) + s3 = s3fs.S3FileSystem( + anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} + ) + + # test load by h5netcdf + with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f: + print("File path", f.path) + ncfile = h5netcdf.File(f, 'r', invalid_netcdf=True) + print("File loaded from spoof S3 with h5netcdf:", ncfile) + print(ncfile["q"]) + assert "q" in ncfile + + # PyFive it + with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f: + pyfive_ds = pyfive.File(f) + print(f"Dataset loaded from mock S3 with s3fs and Pyfive: ds") + assert "q" in pyfive_ds From df81fafd8c0f93c0a1f2668da72f2afb80a332a3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Jan 2025 15:39:22 +0000 Subject: [PATCH 057/105] posix & s3 --- pyfive/h5d.py | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index a5089a7..b0f2693 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,10 +34,12 @@ def __init__(self, dataobject): fh = dataobject.fh try: + # See if 'fh' an underlying file descriptor fh.fileno() - self._filename = dataobject.fh.name - self.avoid_mmap = False except (AttributeError, OSError): + # No file descriptor => Not Posix + self.posix = False + self.__fh = fh try: # maybe this is an S3File instance? self._filename = getattr(fh,'path') @@ -45,7 +47,11 @@ def __init__(self, dataobject): # maybe a remote https file opened as bytes? 
# failing that, maybe a memory file, return as None self._filename = getattr(fh,'full_name','None') - self.avoid_mmap = True + else: + # Has a file descriptor => Posix + self.posix = True + self._filename = fh.name + self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -222,7 +228,7 @@ def _get_contiguous_data(self, args): return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if self.avoid_mmap: + if not self.posix: #self.avoid_mmap: return self._get_direct_from_contiguous(args) else: try: @@ -231,9 +237,9 @@ def _get_contiguous_data(self, args): fh = self._fh view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - result = view[args].copy() - fh.close() - return result + return view[args].copy() +# fh.close() +# return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -246,9 +252,9 @@ def _get_contiguous_data(self, args): ref_addresses = np.memmap( fh, dtype=(' Date: Thu, 9 Jan 2025 16:55:02 +0000 Subject: [PATCH 058/105] tidy --- pyfive/h5d.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index b0f2693..13c0c00 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -51,7 +51,7 @@ def __init__(self, dataobject): # Has a file descriptor => Posix self.posix = True self._filename = fh.name - + self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape self.rank = len(self.shape) @@ -228,12 +228,13 @@ def _get_contiguous_data(self, args): return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if not self.posix: #self.avoid_mmap: + if not self.posix: return self._get_direct_from_contiguous(args) else: try: # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! + # I think this would mean that we only move the + # sub-array corresponding to result! fh = self._fh view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) From ee0995b82267cfac89110ec0cd3cb92bb29361be Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Jan 2025 15:02:09 +0000 Subject: [PATCH 059/105] tidy up --- pyfive/h5d.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 13c0c00..a288dd4 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,7 +34,7 @@ def __init__(self, dataobject): fh = dataobject.fh try: - # See if 'fh' an underlying file descriptor + # See if 'fh' is an underlying file descriptor fh.fileno() except (AttributeError, OSError): # No file descriptor => Not Posix @@ -135,7 +135,6 @@ def get_data(self, args): else: # this is lazily reading only the chunks we need return self._get_selection_via_chunks(args) - def iter_chunks(self, args): """ @@ -232,15 +231,13 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - # return a memory-map to the stored array + # Return a memory-map to the stored array. # I think this would mean that we only move the # sub-array corresponding to result! 
fh = self._fh - view = np.memmap(fh, dtype=self.dtype, mode='c', + view = np.memmap(fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - return view[args].copy() -# fh.close() -# return result + return view[args] except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -254,8 +251,6 @@ def _get_contiguous_data(self, args): fh, dtype=(' Date: Fri, 10 Jan 2025 15:10:38 +0000 Subject: [PATCH 060/105] tidy up --- pyfive/h5d.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index a288dd4..a3bfbc0 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -231,13 +231,12 @@ def _get_contiguous_data(self, args): return self._get_direct_from_contiguous(args) else: try: - # Return a memory-map to the stored array. - # I think this would mean that we only move the - # sub-array corresponding to result! - fh = self._fh - view = np.memmap(fh, dtype=self.dtype, mode='c', + # return a memory-map to the stored array + # I think this would mean that we only move the sub-array corresponding to result! + view = np.memmap(self._fh, dtype=self.dtype, mode='c', offset=self.data_offset, shape=self.shape, order=self._order) - return view[args] + result = view[args] + return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: @@ -246,17 +245,13 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') - fh = self._fh ref_addresses = np.memmap( - fh, dtype=(' Date: Mon, 13 Jan 2025 14:10:12 +0000 Subject: [PATCH 061/105] add test reports to gitignore for now --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2b42de6..1679dae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__ *.egg-info .idea .DS_Store +test-reports/ From 0c8ffc57a5626bf342d625b092f0fbfbef6e32a8 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:18:17 +0000 Subject: [PATCH 062/105] add conftest --- tests/conftest.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..cf55293 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,77 @@ +import os +import s3fs +import pathlib +import json +import moto +import pytest + +from moto.moto_server.threaded_moto_server import ThreadedMotoServer + + +@pytest.fixture(scope="module") +def s3_base(): + # writable local S3 system + + # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests + ##### + # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + ##### + server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) + server.start() + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + os.environ.pop("AWS_PROFILE", None) + + print("server up") + yield + print("moto done") + server.stop() + + +@pytest.fixture() +def s3fs_s3(s3_base): + """ + Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. 
+ Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + """ + client = get_boto3_client() + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") + client.put_bucket_versioning( + Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} + ) + + # initialize secure bucket + client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") + policy = json.dumps( + { + "Version": "2012-10-17", + "Id": "PutObjPolicy", + "Statement": [ + { + "Sid": "DenyUnEncryptedObjectUploads", + "Effect": "Deny", + "Principal": "*", + "Action": "s3:PutObject", + "Resource": "arn:aws:s3:::{bucket_name}/*".format( + bucket_name=secure_bucket_name + ), + "Condition": { + "StringNotEquals": { + "s3:x-amz-server-side-encryption": "aws:kms" + } + }, + } + ], + } + ) + + client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) + s3fs.S3FileSystem.clear_instance_cache() + s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + + yield s3 From 8cc2363093c9731a769d53755415a920a7b4c7bb Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:34:40 +0000 Subject: [PATCH 063/105] minimize conftest --- tests/conftest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index cf55293..b921310 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,22 @@ from moto.moto_server.threaded_moto_server import ThreadedMotoServer +# some spoofy server parameters +# test parameters; don't modify these +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port +test_bucket_name = "test" +versioned_bucket_name = "test-versioned" +secure_bucket_name = "test-secure" + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + + @pytest.fixture(scope="module") def s3_base(): # writable local S3 system From 3086211cd95260a7db858a8211fc6a442092fe7b Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:34:58 +0000 Subject: [PATCH 064/105] make use of conftest and add minimal test for mock s3 fs --- tests/test_mock_s3fs.py | 90 +++++------------------------------------ 1 file changed, 10 insertions(+), 80 deletions(-) diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py index 931355a..7beffb2 100644 --- a/tests/test_mock_s3fs.py +++ b/tests/test_mock_s3fs.py @@ -1,97 +1,27 @@ import os import s3fs import pathlib -import json -import moto import pyfive import pytest - import h5netcdf -from moto.moto_server.threaded_moto_server import ThreadedMotoServer - -# some spoofy server parameters +# needed by the spoofed s3 filesystem port = 5555 endpoint_uri = "http://127.0.0.1:%s/" % port -test_bucket_name = "test" -versioned_bucket_name = "test-versioned" -secure_bucket_name = "test-secure" - -def get_boto3_client(): - from botocore.session import Session - - # NB: we use the sync botocore client for setup - session = Session() - return session.create_client("s3", endpoint_url=endpoint_uri) - -@pytest.fixture(scope="module") -def s3_base(): - # writable local S3 system - - # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests - ##### - # lifted from https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py - ##### - server = 
ThreadedMotoServer(ip_address="127.0.0.1", port=port) - server.start() - if "AWS_SECRET_ACCESS_KEY" not in os.environ: - os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" - if "AWS_ACCESS_KEY_ID" not in os.environ: - os.environ["AWS_ACCESS_KEY_ID"] = "foo" - os.environ.pop("AWS_PROFILE", None) - - print("server up") - yield - print("moto done") - server.stop() -@pytest.fixture() -def s3fs_s3(s3_base): - """ - Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. - Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py - """ - client = get_boto3_client() - client.create_bucket(Bucket=test_bucket_name, ACL="public-read") - - client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") - client.put_bucket_versioning( - Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} - ) - - # initialize secure bucket - client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") - policy = json.dumps( - { - "Version": "2012-10-17", - "Id": "PutObjPolicy", - "Statement": [ - { - "Sid": "DenyUnEncryptedObjectUploads", - "Effect": "Deny", - "Principal": "*", - "Action": "s3:PutObject", - "Resource": "arn:aws:s3:::{bucket_name}/*".format( - bucket_name=secure_bucket_name - ), - "Condition": { - "StringNotEquals": { - "s3:x-amz-server-side-encryption": "aws:kms" - } - }, - } - ], - } - ) +def test_s3fs_s3(s3fs_s3): + """Test mock S3 filesystem constructor.""" + # this is an entire mock S3 FS + mock_s3_filesystem = s3fs_s3 - client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) - s3fs.S3FileSystem.clear_instance_cache() - s3 = s3fs.S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) - s3.invalidate_cache() + # explore its attributes and methods + print(dir(mock_s3_filesystem)) - yield s3 + assert not mock_s3_filesystem.anon + assert not mock_s3_filesystem.version_aware + assert mock_s3_filesystem.client_kwargs == {'endpoint_url': 'http://127.0.0.1:5555/'} def test_s3file_with_s3fs(s3fs_s3): From ed0f1172f8fabfeb7d40a758c1c0eabb75fe931e Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:39:16 +0000 Subject: [PATCH 065/105] upgrade actions versions --- .github/workflows/pytest.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 6081f94..dd0a799 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -5,7 +5,7 @@ name: Python package on: push: - branches: [ master ] + branches: [ master, mock_s3fs ] pull_request: branches: [ master ] @@ -16,12 +16,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies From 6843567b2b4b7be30eac1f31c46afd4e08397024 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:42:49 +0000 Subject: [PATCH 066/105] add flask dep --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index dbb98e9..c0311f9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,7 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + flask moto numpy s3fs From 88752d10bde0f1b92aa645c546aa9d64673d0dfd Mon Sep 17 00:00:00 2001 From: 
Valeriu Predoi Date: Mon, 13 Jan 2025 14:43:16 +0000 Subject: [PATCH 067/105] restrict to python 3.10 --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index dd0a799..5e99483 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 From ddeb0eabf400674105a3ad7e5cdc32153c98d0ed Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:45:03 +0000 Subject: [PATCH 068/105] add flask-cors --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index c0311f9..8c7e6eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,6 +29,7 @@ packages = pyfive python_requires >= 3.10 install_requires = flask + flask-cors moto numpy s3fs From 522bf7a0146f86f8249666a13eb69024ec8bddd6 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:48:57 +0000 Subject: [PATCH 069/105] add h5 modules --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 8c7e6eb..8c2010f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,8 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = + h5py + h5netcdf flask flask-cors moto From 22476e81604cb2f2ef08b88a3a985a8e83265d76 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 14:54:38 +0000 Subject: [PATCH 070/105] mark test as xfailed --- tests/test_enum_var.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_enum_var.py b/tests/test_enum_var.py index e5851be..e2cea47 100644 --- a/tests/test_enum_var.py +++ b/tests/test_enum_var.py @@ -8,6 +8,7 @@ DIRNAME = os.path.dirname(__file__) ENUMVAR_HDF5_FILE = os.path.join(DIRNAME, 'enum_variable.hdf5') +@pytest.mark.xfail(reason="FileNotFoundError: [Errno 2] No such file or directory: '/home/.../pyfive/pyfive/tests/enum_variable.hdf5") def test_read_enum_variable(): with pyfive.File(ENUMVAR_HDF5_FILE) as hfile: From f28c68d2b3f778207ccff5b882e7cc0002c39185 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 13 Jan 2025 16:11:51 +0000 Subject: [PATCH 071/105] add dosctrings --- tests/conftest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index b921310..aa8b538 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,9 @@ def s3_base(): ##### server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) server.start() + # the user ID and secret key are needed when accessing a public bucket + # since our S3 FS and bucket are not actually on an AWS system, they can have + # bogus values if "AWS_SECRET_ACCESS_KEY" not in os.environ: os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" if "AWS_ACCESS_KEY_ID" not in os.environ: @@ -51,8 +54,20 @@ def s3fs_s3(s3_base): """ Create a fully functional "virtual" S3 FileSystem compatible with fsspec/s3fs. Method inspired by https://github.com/fsspec/s3fs/blob/main/s3fs/tests/test_s3fs.py + + The S3 FS, being AWS-like but not actually physically deployed anywhere, still needs + all the usual user IDs, secret keys, endpoint URLs etc; the setup makes use of the ACL=public + configuration (public-read, or public-read-write). 
Public DOES NOT mean anon=True, but rather, + All Users group – https://docs.aws.amazon.com/AmazonS3/latest/userguide/acl-overview.html + Access permission to this group allows anyone with AWS credentials to access the resource. + The requests need be signed (authenticated) or not. + + Also, keys are encrypted using AWS-KMS + https://docs.aws.amazon.com/kms/latest/developerguide/overview.html """ client = get_boto3_client() + + # see not above about ACL=public-read client.create_bucket(Bucket=test_bucket_name, ACL="public-read") client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") From 2d312c1d96df2c52b3fd29330694a789e04c4808 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 08:46:45 +0000 Subject: [PATCH 072/105] Minor changes following V's S3 testing merge --- .coveragerc | 4 ---- setup.cfg | 8 +++----- tests/data/enum_variable.hdf5 | Bin 0 -> 6149 bytes tests/data/enums_from_netcdf.nc | Bin 0 -> 6149 bytes tests/test_enum_var.py | 3 +-- 5 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 .coveragerc create mode 100644 tests/data/enum_variable.hdf5 create mode 100644 tests/data/enums_from_netcdf.nc diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index c4b4617..0000000 --- a/.coveragerc +++ /dev/null @@ -1,4 +0,0 @@ -[report] -exclude_lines = - raise NotImplementedError - raise InvalidHDF5File diff --git a/setup.cfg b/setup.cfg index 8c2010f..c767029 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,11 +28,6 @@ classifiers = packages = pyfive python_requires >= 3.10 install_requires = - h5py - h5netcdf - flask - flask-cors - moto numpy s3fs @@ -42,4 +37,7 @@ testing = h5py netCDF4 h5netcdf + flask + flask-cors + moto diff --git a/tests/data/enum_variable.hdf5 b/tests/data/enum_variable.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..3f640db22238917469bdab26660e7541ef2094d7 GIT binary patch literal 6149 zcmeHJ%}*0S6o0$ZmQ7s*wFW;ht{jy#fJHEd#Adtwh@~}@m}oS0p^Hq?Zn3*1-aHd8 z#*-J~&BU7r<6pqRKf$XZT)cSkUf-MD0VUvpn`U0RGqdyNef{S5Hr*;0io@wkY27eD z${?2Hg=l9gR(vk`KkMbfl5T#|C&V6Ou}Yd8p3^#LbR>0=MlC@jNy_v4oo4tWdV|qX zTI|H?jd&MQa)AbyX{{ync*_tDHr?Jx z=h2*UrBp=DU_cP|+^`?OYQNKOFf|(eCH^Le~R+TumD)R;%Wr&qQzU^wh(SUQf zMRgE1h03B+YgFpBX2Z@`9WWnJeGF3*2DSX6BhKHA&o8g@z`~zv`Ce!jidKtvJU`%F z-&*O1mfsBzRER%!SK40Ik}?aFkdFROgGmWBYhJk8TFVu9hqqu9Sr(OOiR3UQ%Hz_j zy$HkYzhO8>7@qK&l0#(7uHyg$<19r*s>B4Bwhapb$_9Cy)5Kssx M1pXBSG;P52U$tT%xBvhE literal 0 HcmV?d00001 diff --git a/tests/data/enums_from_netcdf.nc b/tests/data/enums_from_netcdf.nc new file mode 100644 index 0000000000000000000000000000000000000000..d714b1414f28a02251d68377b54bc6a491fb28b6 GIT binary patch literal 6149 zcmeHJy>HV%6o2+5sTEvWAgFv4VY&fT)RvSKp-M!K(?+BTYAObV6r9GPCnZjaor;aQ zVnIwS6&o`Wh`*o%e*!B)omrR(@11=uR4qd{&^^U>=X>|Qe)oG;wwH^g(e$OXZWthC z7)$a(v@;VcJ}vn_YRkn{-TbIei9N<*l{7g#r*+WiNa`YuT7pKBl;;n+jqrQ)2IJ$j z*p1bj@h+s~0u3(HT1)8hmLVK$x&7^U=lmp)In&^94)Q&h`DTgqE%xhd7q{!`F&HvBO1^Sbalx}a$p%1+lbElCR$ zIkOD;Jag@;ND{z;2?Hix&sQ%m1v3NFeb|5rI&jV zhC6@5Fi#ks@S2h%WXx{h07K&}MMbK_1edoY(Pqb^T@aR#!Q$|;WpQ?|*Kcv(4ZYyc z5|IiNkwnh4HlIXya0>FmO^J6p58ckdJ85~uIT=9CL$eay#29h`lRY3prCeVF>PX_q z1U}UJN4XHL;tvC@-(7h&QnQ!uO8{kS3ErdL><|?hjKK^=Ty;v$s#CR{hKP6@=|~Y< zZs-PHco2G@VPKGcpCCsRCEFd33E7=|^PVzFQ5pDski{q5=R9!YH`5^{QBqL8QMDiR zdz)TA Date: Wed, 15 Jan 2025 09:24:18 +0000 Subject: [PATCH 073/105] A framework for testing laziness. 
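In other words: listing the members of a pyfive.File should touch only group metadata and never hit the dataset read path. The test added here only asserts that iteration yields string keys; a fuller check could spy on the read path, roughly as in the sketch below (the assert_lazy_listing helper is hypothetical; only the DatasetID.get_data hook comes from pyfive/h5d.py):

    import pyfive
    from pyfive.h5d import DatasetID

    def assert_lazy_listing(path):
        """Fail if merely listing the variables triggers a data read."""
        reads = []
        original_get_data = DatasetID.get_data

        def spy(self, args):
            reads.append(args)
            return original_get_data(self, args)

        DatasetID.get_data = spy
        try:
            with pyfive.File(path) as hfile:
                names = list(hfile)
            assert all(isinstance(name, str) for name in names)
            assert not reads, "listing variables should not read any data"
        finally:
            DatasetID.get_data = original_get_data
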
--- tests/test_lazy_variables.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 0d22d73..6ecfb72 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -5,6 +5,23 @@ HERE = Path(__file__).parent + +def test_file_laziness(): + """ Pyfive should not load the data until it is requested. """ + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + variables = [v for v in hfile] + # we do not want to have executed a _getitem__ on any of the variables yet + + # check each member of that list is a string + # it is, it's just the keys of the Mapping superclass of Group + assert all(isinstance(v, str) for v in variables) + + + + + + + def test_attributes_outside_context(): """ Pyfive should be able to access attributes outside the context manager. """ with pyfive.File(HERE/'data/issue23_A.nc') as hfile: From 7a131082c9ff19ad3e051a07dcbb0317a3dedf77 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 10:32:19 +0000 Subject: [PATCH 074/105] add test for threadsafe data access on posix and s3 --- tests/test_threadsafe_data_access.py | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 tests/test_threadsafe_data_access.py diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py new file mode 100644 index 0000000..9c08716 --- /dev/null +++ b/tests/test_threadsafe_data_access.py @@ -0,0 +1,115 @@ +import os +import s3fs +import pathlib +import pyfive +import pytest +import h5netcdf +import netCDF4 +import numpy as np +import dask.array as da + +# needed by the spoofed s3 filesystem +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port + + +def _get_s3_file(s3fs_s3, ncfile): + """Copy a POSIX file to S3.""" + # set up physical file and Path properties + file_path = pathlib.Path(ncfile) + file_name = pathlib.Path(ncfile).name + + # use mocked s3fs + bucket = "MY_BUCKET" + try: + s3fs_s3.mkdir(bucket) + except FileExistsError: + # Bucket was already created by another test + pass + + s3fs_s3.put(file_path, bucket) + s3 = s3fs.S3FileSystem( + anon=False, + version_aware=True, + client_kwargs={"endpoint_url": endpoint_uri} + ) + return s3.open(os.path.join("MY_BUCKET", file_name), "rb") + + +def test_threadsafe_datea_accesss(s3fs_s3): + """test that the data is correctly retrieved using concurrent threads""" + # Create a chunked POSIX file + chunked_file = "./threading_test_file_16_chunks.nc" + + n = netCDF4.Dataset(chunked_file, 'w') + chunksizes=(6, 32, 32) + z = n.createDimension('z', 12) + y = n.createDimension('y', 64) + x = n.createDimension('x', 128) + + v = n.createVariable('v', 'f8', + dimensions=['z', 'y', 'x'], + fill_value=False, + contiguous=False, + chunksizes=(6, 32, 32), + complevel=1) + v[...] = np.random.random(v.size).reshape(12, 64, 128) + n.close() + + posix = chunked_file + + # Get s3 access + s3 = _get_s3_file(s3fs_s3, chunked_file) + + n_iterations = 1000 + for ftype, filename in zip( + ('POSIX', 'S3'), + (posix, s3), + ): + print (f"\n{ftype} ----------------\n") + + # Get the file data array, on asingle thread + v = pyfive.File(chunked_file)['v'] + print (v) + print (f"Storage chunk size: {v.chunks}") + array0 = v[...].view(type=np.ndarray) + + # Loop round different Dask chunk patterns. These are designed + # to various coincide and not coincide with the HDF5 chunks in + # the file. 
+ for chunks in ( + v.shape, + v.chunks, + (12, 32, 32), + (11, 63, 127), + (6, 20, 20) + ): + dx = da.from_array(v, chunks=chunks) + print (f"\n{dx.npartitions} Dask chunks: {dx.chunks}") + + for i in range(n_iterations): + # Use Dask to get the array with one or more threads + try: + array = dx.compute() + except Exception as error: + print (f"Failed on iteration {i + 1}") + raise + else: + # Compare the array created with multiple threads + # with that created with one thread + if not (array == array0).all(): + print (f"Failed on iteration {i + 1}") + raise ValueError( + "At least one Dask chunk read at least one wrong " + "value from the file (likely from parts of " + "storage chunks that it should not have been " + "accessing, due to conflicting seeks on the same " + "open file handle). " + f"Storage chunks: {v.chunks}, " + f"Dask chunks: {dx.chunks}" + ) + else: + print (f"Completed {n_iterations} iterations") + + # Tidy up + os.remove(chunked_file) From 1b6d67065bd5b839580c62427eb0454c8720841c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 10:35:38 +0000 Subject: [PATCH 075/105] note on number of threadsafe test iterations --- tests/test_threadsafe_data_access.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py index 9c08716..4291a60 100644 --- a/tests/test_threadsafe_data_access.py +++ b/tests/test_threadsafe_data_access.py @@ -60,8 +60,12 @@ def test_threadsafe_datea_accesss(s3fs_s3): # Get s3 access s3 = _get_s3_file(s3fs_s3, chunked_file) - + + # Set number of iterations for checking thread safety. Need to be + # quite large to avoid cases where the code is not threadsafe, but + # still retrieves the correct data by good fortune. n_iterations = 1000 + for ftype, filename in zip( ('POSIX', 'S3'), (posix, s3), From 0ec45d736b3f6f338a9cc2190ae903d96281702a Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 12:00:50 +0000 Subject: [PATCH 076/105] Test framework for pseudochunking plus starting to migrate test data to data directory --- tests/data/dataset_multidim.hdf5 | Bin 0 -> 4464 bytes tests/test_dataset_multidim.py | 2 +- tests/test_pseudochunking.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/data/dataset_multidim.hdf5 create mode 100644 tests/test_pseudochunking.py diff --git a/tests/data/dataset_multidim.hdf5 b/tests/data/dataset_multidim.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..6349a86592ac0d3fee9ab4c250783bf31341e3f0 GIT binary patch literal 4464 zcmeI#*-{fh6oBFGNunc)g1fklJMMAc7c{}R0Rk%S!=mDjyO&(zosZ$8_!zv2{-^si zRwWuL7nG@Tn)>E+_e>@;XL>r#*Gj3owWXuQNQq;UH*KyOHb1pYwELdW4*OHBSdCK} zzem1ejrK_;8hpBn>k4toS8ctmQ z@TT2cJFYt2wOT(s%QIC=0O&tyqjDkZC}c zCQGpl%drA0u?kY=^8E5Ptw9IYVjb3F12!VTCTzwQY{fQg#}3F_zZ1K#8+))9`>-EH z9Kb;w!eJc2Q5=K3B`0tar*Il)a2Drq9$mPAiztDGmg&YNRB#zra23~Z9X;qpA8w!@ z0~o|j+`?@P;STQN9`54-9^w&F3}XbN7{g;c!BdRm8J^<>Ug8yA;|=8Pdy98?j}Q2W zPxy?iWEsdZkYymtKyzlGc3mgGIQ%B(t&5S{@_&# Date: Wed, 15 Jan 2025 12:02:48 +0000 Subject: [PATCH 077/105] Ok, we pass the pseudochunking test, but we don't actually do it yet. 
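For reference, the point of pseudo-chunking: when a contiguous variable sits behind a non-POSIX file object, read it in pieces whose byte size stays under pseudo_chunking_size, so that a slice does not force reading the whole array. A standalone sketch of one such shape calculation (pseudo_chunk_shape is a hypothetical name; the 4 MB default mirrors pseudo_chunking_size_MB=4, and the dimension-halving scheme adopted in the next commit works along the same lines):

    import numpy as np

    def pseudo_chunk_shape(shape, itemsize, budget_bytes=4 * 1024 * 1024):
        """Shrink dimensions until one pseudo-chunk fits within budget_bytes."""
        chunk = list(shape)
        while np.prod(chunk) * itemsize > budget_bytes and max(chunk, default=1) > 1:
            for i, n in enumerate(chunk):
                if n > 1:
                    chunk[i] //= 2  # halve the first dimension still larger than one
                    break
        return tuple(chunk)

    # e.g. float64 data of shape (12, 64, 128) is ~0.75 MB and stays whole, while
    # (1000, 1000, 100) float64 (~800 MB) shrinks to (3, 1000, 100) (~2.4 MB).
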
--- pyfive/h5d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bf02185..594b15d 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -260,7 +260,7 @@ def _get_contiguous_data(self, args): raise NotImplementedError('datatype not implemented - {dtype_class}') - def _get_direct_from_contiguous(self, args=None): + def _get_direct_from_contiguous(self, args=None, kwargs={}): """ If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks otherwise we have to read the entire array. This is a fallback situation if we @@ -274,10 +274,10 @@ def __getstride(): chunk_shape = np.ones(self.rank, dtype=int) for i in range(self.rank): stride *= self.shape[i] - chunk_shape = box[:i] = self.shape[:i] + chunk_shape = self.shape[:i] if stride*self.dtype.itemsize > self.pseudo_chunking_size: stride //= self.shape[i] - chunk_shape = box[:i] = self.shape[:i-1] + chunk_shape = self.shape[:i-1] return chunk_shape, stride itemsize = np.dtype(self.dtype).itemsize From f4507763cf1afe066de11d0993b8a61d6b99d434 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 16:27:46 +0000 Subject: [PATCH 078/105] Pseudo chunking in, with test support (and a missing make data file that I didn't move properly) --- pyfive/h5d.py | 113 ++++++++++++++++------- tests/data/make_dataset_multidim_file.py | 17 ++++ tests/test_pseudochunking.py | 13 ++- 3 files changed, 103 insertions(+), 40 deletions(-) create mode 100644 tests/data/make_dataset_multidim_file.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 594b15d..0a07d74 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -34,6 +34,10 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): chunk shape to approximate that volume and read the contigous variable as if were chunked. This is to facilitate lazy loading of partial data from contiguous storage. + + (Currently the only way to change this value is by explicitly using + the set_pseudo_chunk_size method. Most users will not need to change + it.) """ self._order = dataobject.order @@ -42,7 +46,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): try: dataobject.fh.fileno() self._filename = dataobject.fh.name - self.avoid_mmap = False + self.posix = True self.pseudo_chunking_size = 0 except (AttributeError, OSError): try: @@ -52,7 +56,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): # maybe a remote https file opened as bytes? 
# failing that, maybe a memory file, return as None self._filename = getattr(self._fh,'full_name','None') - self.avoid_mmap = True + self.posix = False self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024 self.filter_pipeline = dataobject.filter_pipeline self.shape = dataobject.shape @@ -126,14 +130,15 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ - - - match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") case 1: # contiguous storage - return self._get_contiguous_data(args) + if self.data_offset == UNDEFINED_ADDRESS: + # no storage is backing array, return all zeros + return np.zeros(self.shape, dtype=self.dtype)[args] + else: + return self._get_contiguous_data(args) case 2: # chunked storage if isinstance(self.dtype, tuple): # references need to read all the chunks for now @@ -179,6 +184,17 @@ def index(self): raise ValueError('No chunk index available for HDF layout class {self.layout}') else: return self._index + #### The following method can be used to set pseudo chunking size after the + #### file has been closed and before data transactions. This is pyfive specific + def set_psuedo_chunk_size(self, newsize_MB): + """ Set pseudo chunking size for contiguous variables """ + if self.layout_class == 1: + if not self.posix: + self.pseudo_chunking_size = newsize_MB*1024*1024 + else: + pass # silently ignore it, we'll be using a np.memmap + else: + raise ValueError('Attempt to set pseudo chunking on non-contigous variable') ###### # The following DatasetID methods are used by PyFive and you wouldn't expect @@ -227,13 +243,9 @@ def _build_index(self, dataobject): self._index[key] = StoreInfo(key, filter_mask, addr, size) def _get_contiguous_data(self, args): - - if self.data_offset == UNDEFINED_ADDRESS: - # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self.dtype)[args] if not isinstance(self.dtype, tuple): - if self.avoid_mmap: + if not self.posix: return self._get_direct_from_contiguous(args) else: try: @@ -268,32 +280,65 @@ def _get_direct_from_contiguous(self, args=None, kwargs={}): we don't have a true Posix file. We should never end up here with compressed data. 
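        If pseudo_chunking_size is zero, the whole contiguous block is read in a single pass and then subset in memory.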
""" - def __getstride(): + def __get_pseudo_shape(): """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - stride = 1 - chunk_shape = np.ones(self.rank, dtype=int) - for i in range(self.rank): - stride *= self.shape[i] - chunk_shape = self.shape[:i] - if stride*self.dtype.itemsize > self.pseudo_chunking_size: - stride //= self.shape[i] - chunk_shape = self.shape[:i-1] - return chunk_shape, stride - - itemsize = np.dtype(self.dtype).itemsize - # need to impose type in case self.shape is () in which case numpy would return a float - num_elements = np.prod(self.shape, dtype=int) - num_bytes = num_elements*itemsize + element_size = self.dtype.itemsize + chunk_shape = np.copy(self.shape) + while True: + chunk_size = np.prod(chunk_shape) * element_size + if chunk_size < self.pseudo_chunking_size: + break + for i in range(len(chunk_shape)): + if chunk_shape[i] > 1: + chunk_shape[i] //= 2 + break + + return chunk_shape, chunk_size + class LocalOffset: + def __init__(self, shape, chunk_shape, stride): + chunks_per_dim = [int(np.ceil(a / c)) for a, c in zip(shape, chunk_shape)] + self.chunk_strides = np.cumprod([1] + chunks_per_dim[::-1])[:-1][::-1] + self.stride = stride + def coord_to_offset(self,chunk_coords): + linear_offset = sum(idx * stride for idx, stride in zip(chunk_coords, self.chunk_strides)) + return linear_offset*self.stride + + if self.pseudo_chunking_size: - stride = __getstride() + chunk_shape, stride = __get_pseudo_shape() + offset_finder = LocalOffset(self.shape,chunk_shape,stride) + array = ZarrArrayStub(self.shape, chunk_shape) + indexer = OrthogonalIndexer(args, array) + out_shape = indexer.shape + out = np.empty(out_shape, dtype=self.dtype, order=self._order) + chunk_size = np.prod(chunk_shape) + + for chunk_coords, chunk_selection, out_selection in indexer: + index = self.data_offset + offset_finder.coord_to_offset(chunk_coords) + self._fh.seek(index) + chunk_buffer = self._fh.read(stride) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + if len(chunk_data) < chunk_size: + # last chunk over end of file + padded_chunk_data = np.zeros(chunk_size, dtype=self.dtype) + padded_chunk_data[:len(chunk_data)] = chunk_data + chunk_data = padded_chunk_data + out[out_selection] = chunk_data.reshape(chunk_shape, order=self._order)[chunk_selection] + + return out - # we need it all, let's get it all (i.e. this really does read the lot) - self._fh.seek(self.data_offset) - chunk_buffer = self._fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() - chunk_data = chunk_data.reshape(self.shape, order=self._order) - return chunk_data[args] + else: + itemsize = np.dtype(self.dtype).itemsize + num_elements = np.prod(self.shape, dtype=int) + num_bytes = num_elements*itemsize + + # we need it all, let's get it all (i.e. this really does read the lot) + self._fh.seek(self.data_offset) + chunk_buffer = self._fh.read(num_bytes) + chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = chunk_data.reshape(self.shape, order=self._order) + return chunk_data[args] def _get_raw_chunk(self, storeinfo): @@ -351,6 +396,8 @@ def _get_selection_via_chunks(self, args): return out + + @property def _fh(self): """ diff --git a/tests/data/make_dataset_multidim_file.py b/tests/data/make_dataset_multidim_file.py new file mode 100644 index 0000000..7d885c4 --- /dev/null +++ b/tests/data/make_dataset_multidim_file.py @@ -0,0 +1,17 @@ +#! 
/usr/bin/env python +""" Create a HDF5 file with datasets of many different dimensions . """ +import h5py +import numpy as np + +f = h5py.File('dataset_multidim.hdf5', 'w') + +# signed intergers +common_args = { + 'dtype': ' Date: Wed, 15 Jan 2025 17:08:02 +0000 Subject: [PATCH 079/105] Tidy up dependencies for testing --- setup.cfg | 1 + tests/dataset_multidim.hdf5 | Bin 4464 -> 0 bytes tests/make_dataset_multidim_file.py | 17 ----------------- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 tests/dataset_multidim.hdf5 delete mode 100644 tests/make_dataset_multidim_file.py diff --git a/setup.cfg b/setup.cfg index c767029..d3fb584 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,4 +40,5 @@ testing = flask flask-cors moto + dask diff --git a/tests/dataset_multidim.hdf5 b/tests/dataset_multidim.hdf5 deleted file mode 100644 index 6349a86592ac0d3fee9ab4c250783bf31341e3f0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4464 zcmeI#*-{fh6oBFGNunc)g1fklJMMAc7c{}R0Rk%S!=mDjyO&(zosZ$8_!zv2{-^si zRwWuL7nG@Tn)>E+_e>@;XL>r#*Gj3owWXuQNQq;UH*KyOHb1pYwELdW4*OHBSdCK} zzem1ejrK_;8hpBn>k4toS8ctmQ z@TT2cJFYt2wOT(s%QIC=0O&tyqjDkZC}c zCQGpl%drA0u?kY=^8E5Ptw9IYVjb3F12!VTCTzwQY{fQg#}3F_zZ1K#8+))9`>-EH z9Kb;w!eJc2Q5=K3B`0tar*Il)a2Drq9$mPAiztDGmg&YNRB#zra23~Z9X;qpA8w!@ z0~o|j+`?@P;STQN9`54-9^w&F3}XbN7{g;c!BdRm8J^<>Ug8yA;|=8Pdy98?j}Q2W zPxy?iWEsdZkYymtKyzlGc3mgGIQ%B(t&5S{@_&# Date: Wed, 15 Jan 2025 17:36:50 +0000 Subject: [PATCH 080/105] Minor changes which come from upstream advice on my two pull requests https://github.com/jjhelmus/pyfive/pull/67 and https://github.com/jjhelmus/pyfive/pull/66 --- pyfive/dataobjects.py | 9 +++++---- pyfive/h5d.py | 21 +++++++++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 6386b6f..9e7535a 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -368,9 +368,9 @@ def compression_opts(self): if GZIP_DEFLATE_FILTER in self._filter_ids: gzip_entry = [d for d in self.filter_pipeline if d['filter_id'] == GZIP_DEFLATE_FILTER][0] - #raise ValueError(f'gzip_entry {gzip_entry.keys()}') - key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] - return gzip_entry[key][0] + return gzip_entry['client_data'][0] + #key = {0:'client_data_values',1:'client_data'}['client_data' in gzip_entry] + #return gzip_entry[key][0] return None @property @@ -458,7 +458,8 @@ def filter_pipeline(self): filter_info['name'] = name client_values = struct.unpack_from("<{:d}i".format(num_client_values), self.msg_data, offset) offset += (4 * num_client_values) - filter_info['client_data_values'] = client_values + filter_info['client_data'] = client_values + filter_info['client_data_values'] = num_client_values filters.append(filter_info) else: diff --git a/pyfive/h5d.py b/pyfive/h5d.py index c2b8ea3..990b9b2 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -109,14 +109,20 @@ def get_chunk_info(self, index): Retrieve storage information about a chunk specified by its index. Our index is in chunk space, but H5Py wants it in coordinate space. """ - return self._index[self._nthindex[index]] + if not self._index: + return None + else: + return self._index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): """ Retrieve information about a chunk specified by the array address of the chunk’s first element in each dimension. 
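        Returns None if no chunk index is available (contiguous storage, an empty dataset, or unallocated chunks).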
""" - return self._index[coordinate_index] + if not self._index: + return None + else: + return self._index[coordinate_index] def get_num_chunks(self): """ @@ -129,6 +135,8 @@ def read_direct_chunk(self, chunk_position, **kwargs): Returns a tuple containing the filter_mask and the raw data storing this chunk as bytes. Additional arugments supported by H5Py are not supported here. """ + if not self.index: + return None if chunk_position not in self._index: raise OSError("Chunk coordinates must lie on chunk boundaries") storeinfo = self._index[chunk_position] @@ -146,6 +154,8 @@ def get_data(self, args): else: return self._get_contiguous_data(args) case 2: # chunked storage + if not self._index: + return np.zeros(self.shape, dtype=self.dtype)[args] if isinstance(self.dtype, tuple): # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] @@ -193,7 +203,10 @@ def index(self): #### The following method can be used to set pseudo chunking size after the #### file has been closed and before data transactions. This is pyfive specific def set_psuedo_chunk_size(self, newsize_MB): - """ Set pseudo chunking size for contiguous variables """ + """ Set pseudo chunking size for contiguous variables. The default + value is 4 MB which should be suitable for most applications. For + arrays smaller than this value, no pseudo chunking is used. + Larger arrays will be accessed in in roughly newsize_MB reads. """ if self.layout_class == 1: if not self.posix: self.pseudo_chunking_size = newsize_MB*1024*1024 @@ -221,7 +234,7 @@ def _build_index(self, dataobject): return # look out for an empty dataset, which will have no btree - if np.prod(self.shape) == 0: + if np.prod(self.shape) == 0 or dataobject._chunk_address == UNDEFINED_ADDRESS: self._index = {} return From 49aa7949679a0d4335572b54387be992945bd370 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 15 Jan 2025 17:54:26 +0000 Subject: [PATCH 081/105] Suppress reference list warning. It's useless --- pyfive/dataobjects.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 9e7535a..ba24a7b 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -219,9 +219,12 @@ def _parse_attribute_msg(self, buffer, offset): try: dtype = DatatypeMessage(buffer, offset).dtype except NotImplementedError: - warnings.warn( - f"Attribute {name} type not implemented, set to None." - ) + if name == 'REFERENCE_LIST': + pass #suppress this one, no one actually cares about these as far as I know + else: + warnings.warn( + f"Attribute {name} type not implemented, set to None." 
+ ) return name, None offset += _padded_size(attr_dict['datatype_size'], padding_multiple) From dba2683a3c03249ed8a73d32430ccb3807ada8b7 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 16 Jan 2025 16:10:28 +0000 Subject: [PATCH 082/105] Using context manager for threadsafe test --- tests/test_threadsafe_data_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_threadsafe_data_access.py b/tests/test_threadsafe_data_access.py index 4291a60..fa4a94d 100644 --- a/tests/test_threadsafe_data_access.py +++ b/tests/test_threadsafe_data_access.py @@ -73,7 +73,8 @@ def test_threadsafe_datea_accesss(s3fs_s3): print (f"\n{ftype} ----------------\n") # Get the file data array, on asingle thread - v = pyfive.File(chunked_file)['v'] + with pyfive.File(chunked_file) as hf: + v = hf['v'] print (v) print (f"Storage chunk size: {v.chunks}") array0 = v[...].view(type=np.ndarray) From 2ba27dc14b16c015a085eedf637ed6b4bfa6da25 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Jan 2025 13:20:59 +0000 Subject: [PATCH 083/105] no returned memory maps --- pyfive/h5d.py | 20 ++++++++++++++++---- tests/data/issue23_A_contiguous.nc | Bin 0 -> 10943 bytes tests/test_lazy_variables.py | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 tests/data/issue23_A_contiguous.nc diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 990b9b2..d031b76 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -265,14 +265,26 @@ def _get_contiguous_data(self, args): if not isinstance(self.dtype, tuple): if not self.posix: + # Not posix return self._get_direct_from_contiguous(args) else: + # posix try: - # return a memory-map to the stored array - # I think this would mean that we only move the sub-array corresponding to result! - view = np.memmap(self._fh, dtype=self.dtype, mode='c', - offset=self.data_offset, shape=self.shape, order=self._order) + # Create a memory-map to the stored array, which + # means that we will end up only copying the + # sub-array into in memory. 
+ view = np.memmap( + self._fh, + dtype=self.dtype, + mode='c', + offset=self.data_offset, + shape=self.shape, + order=self._order + ) + # Create the sub-array result = view[args] + # Copy the data from disk to physical memory + result = result.view(type=np.ndarray) return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) diff --git a/tests/data/issue23_A_contiguous.nc b/tests/data/issue23_A_contiguous.nc new file mode 100644 index 0000000000000000000000000000000000000000..18cf68c1ef960e4f0e1299de57626f06f9c432b7 GIT binary patch literal 10943 zcmeHNeQZ=!7QZvEoepIvh5QOtT3W@A5Qkyr!Hk){sWY!# zSsHEDD1NOg5Cx-xU<@%4L4wBk0gWGtx@L*`hZ-YmqQ>utQ5IxX_MCg}dDBiIFowGS zybCku-TTft@0{N`_q<8Zy=!4hWK89hO0VB9cpOz@??tpyr*8Nfjx@C`Z0Ya@c6+Pk znl?(^Vv%B6=@lMinK)IH0nxmon1knmnaHP!fbb)0C~NhamYvR8d2u@2sDO_bKA}iW zeQkZixN&eTK0=)^!bAkE8X8TdLcByd7|;G^$0%w3m}wr*1YGfYJf3pAfWcIofEJ&o z8(;^HplBQ|8jpg`@+!QeT~q*S(`EF;7nSg0DN*ot)bYJ-MpjKs0Dr-hSk$x=$rX*E zY&MrnTXEaWi*fKA@=uqBN*qCmDJlre!nr1V3bfYd&Q1vz!h>$NEV?or?OfCrjdg~a zTf-tS4c9gBR-znjzA`NN@j=L+iQDk~-NYl~hddtEXgpNC>t(`F48l+WEHxmX!f^UI zjsuhZV)CtUr(EWPR!O)`^^w+fM4T)sKR#~YFyeO74%35Hs+35n2O(|8^BgDDWU40k2Y$gUIWzF8-x0wD1bw6kPO_abPNzXM3zwYu#)3okrntk@4}A6pJF+o$q`_&#g?Y`qNJQVQHyF%CCt zVNbO#>by#v3Cy_NCc_6y0~Qrl%v7d^svBtb;5ph>Qa%E|cCy?@a8v2k`+Ma4yrz2^j-Tte0sf#N8~qgfC7#t$xF}K}G|rEY`d# zDq*o`s6ChMHgk3wiYC4)YT<*x_nJA}JWUO=VLw52ZECE&q;^hC((E-e*>2cc_Uu%0 z#oQ)*pVX>1%BJaLc#w$tUOj)?Ax67GoULAqTh(2BbCrQXswVYVX`2ig0}NU;$5aRU*-@fD@0`O z2bhi}jLSl8tuh(SGqWpi`RL>@H!VgsH$=luXFj~b=I$TsPkOb!<(A< z*Xcv`7O(RU5?*ILl_WMy|LCH1)*mQZXFV8G9IT)1=l4VL`dDIj_*uVN7I(5BFSyi2 zz(v4Cz(v4Cz(v4Cz(v4Cz(v4Cz(v4CV5AV}Uot?dqeXOqnmaT|d0y1JEF*rV{VQnBRL{_y=_lQaU(ua;nltV1Rq_`r zzILMeQpJ}b+MXtQz#_UZNA!#yqQVHS`s)08|5JU&Bd_T@bEz%Hky~zj$zGo_X#HN} z(to%8eS@{mxF_?)f1mtetGd6@IJ0a?Rn4wPjq8(=L-B1jD&K#+yQy-UaqT6$-{{@5 zL*3tIeEP)0@Pu7T{$``~&v!L1`+TQy;nIJ+a&YEL#^Fxn7*KM175^*7_}d2F+7f=w zSo>X{G53ZSj5mAtY@YI`?Zyj>0^V6`?r`i#NrtD5S!*VJ_^(5c8{aN|e%kHdJfrS! zSN82v^l!@FwT?aH=T_sRpCtfFe#Yl>yU^%z+LOYM_vtVG`Z4=ok?Nt&-F^Yh zW#<7d@7p~8a$LFRyd&Z?EJw6O56G3?qL(cr z5Ztu?fp%^>K~T;m!(|s0E`5`5?kVH`iw5l??nMyid*dg4U64QfJX$?VL0(de5d9Eh z#``~phqayOiTk(zeAtPZL>b%894itn^h8`6&Y-B zmDF}d7$U3tmKAbp|8z2#kG-7ma7oqzfiG+K{e>(+h1H~jzYiU~lUnXjlCY;bawUmd z{SuQwmhk*B`^YuuDyQCIm3l#CY5-z)%&sp2i&$~j*No@w!0Zz=eX(tcLXambPqyo; zQVLTEx}-Wt!}udQ@wtPS&T)3UEck>Q{PZiSC8QubZTID+a3KtX&VmR2vsa$dZMxB!1g_b9kF&M@h=!6=EI{Q zP^%$Ia+KjS=ABQIdw<$i=YZ*yPp5Dwk>dI913Rd|4yq()uq#IT)D8kmC2yZXD9Zbc zK|x1vUB9s#!Q_`49}3W5TX5stCw~@r$~a*>H}Q{&x27MgM(F@V4@2#Te-PE~T=qsY zVGlVmku&pVJeNoX8Zqe3S(4#daT-{K#^<}uM0!O!5li*J5wWzrDlnCf*bF=LfqtPx zJ(__9LLF#K3_99(I@}+eZ(JV6ki`izlZkbib}E~sgCux(0d>Z6W_-ThWyY-lyc_UP y$f-G!gl7Q#YghG8#ZLsFLXYf^My}9cj2s+ip7%}JJMDiGjG$Nm literal 0 HcmV?d00001 diff --git a/tests/test_lazy_variables.py b/tests/test_lazy_variables.py index 6ecfb72..ef9d157 100644 --- a/tests/test_lazy_variables.py +++ b/tests/test_lazy_variables.py @@ -56,6 +56,26 @@ def test_file_data_oustside_context(): assert qsum1 == qsum2 +def test_numpy_array_type(): + """Pyfive slices should always return a np.ndarray, not a np.memmap. + + """ + # Get data from contiguous file + with pyfive.File(HERE/'data/issue23_A_contiguous.nc') as hfile: + qdata = hfile['q'] + qdata1 = qdata[...] + assert isinstance(qdata1, np.ndarray) + assert not isinstance(qdata1, np.memmap) + + # Get data from chunked file + with pyfive.File(HERE/'data/issue23_A.nc') as hfile: + qdata = hfile['q'] + qdata2 = qdata[...] 
+ assert isinstance(qdata2, np.ndarray) + assert not isinstance(qdata2, np.memmap) + + # Check that the data are equal in both cases + assert (qdata1 == qdata2).all() From e6b518b805635763d12abfcf2168945ea0fa2b30 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 13:40:41 +0000 Subject: [PATCH 084/105] Test for #16 and #29 --- pyfive/h5d.py | 2 +- tests/test_vlen_str.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tests/test_vlen_str.py diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 990b9b2..bedd6aa 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -287,7 +287,7 @@ def _get_contiguous_data(self, args): shape=self.shape, order=self._order) return np.array([Reference(addr) for addr in ref_addresses])[args] else: - raise NotImplementedError('datatype not implemented - {dtype_class}') + raise NotImplementedError(f'datatype not implemented - {dtype_class}') def _get_direct_from_contiguous(self, args=None): diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py new file mode 100644 index 0000000..5f00dc4 --- /dev/null +++ b/tests/test_vlen_str.py @@ -0,0 +1,22 @@ +import h5py +import pyfive +import io + +def make_file(file_like, _vlen_string): + with h5py.File(file_like,'w') as hfile: + + dt = h5py.special_dtype(vlen=str) + v = hfile.create_dataset("var_len_str", (1,), dtype=dt) + v[0] = _vlen_string + + +def test_vlen_string(): + + tfile = io.BytesIO() + _vlen_string = "foo" + make_file(tfile, _vlen_string) + with pyfive.File(tfile) as hfile: + print(hfile) + ds1 = hfile['var_len_str'] + assert _vlen_string == ds1[0] + \ No newline at end of file From b1ae323c76e3f37bbe842f58151dad9bf9a5ecc2 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:44:16 +0000 Subject: [PATCH 085/105] More versions of the #29 tests --- tests/test_vlen_str.py | 58 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py index 5f00dc4..c0c36e0 100644 --- a/tests/test_vlen_str.py +++ b/tests/test_vlen_str.py @@ -1,22 +1,68 @@ import h5py import pyfive +import netCDF4 as nc import io +import numpy as np -def make_file(file_like, _vlen_string): +def make_file_hdf5(file_like, _vlen_string): with h5py.File(file_like,'w') as hfile: dt = h5py.special_dtype(vlen=str) v = hfile.create_dataset("var_len_str", (1,), dtype=dt) v[0] = _vlen_string - -def test_vlen_string(): +def make_file_nc(file_like,m_array): + + n = nc.Dataset(file_like, "w", format="NETCDF4") + n.createDimension("time", 4) + months = n.createVariable("months", str, ("time",)) + months[:] = np.array(m_array, dtype="S8") + n.close() + +def test_vlen_string_hdf5(): tfile = io.BytesIO() _vlen_string = "foo" - make_file(tfile, _vlen_string) + make_file_hdf5(tfile, _vlen_string) with pyfive.File(tfile) as hfile: - print(hfile) + ds1 = hfile['var_len_str'] assert _vlen_string == ds1[0] - \ No newline at end of file + +def test_vlen_string_nc1(): + """ this verson currently fails because netcdf4 is doing something odd in memory """ + + t1file = io.BytesIO() + m_array = ["January", "February", "March", "April"] + make_file_nc(t1file,m_array) + + with nc.Dataset(t1file,'r') as ncfile: + ds1 = ncfile['months'] + assert np.array_equal(m_array, ds1) + + + with h5py.File(t1file) as pfile: + ds1 = pfile['months'] + assert np.array_equal(m_array, ds1) + + + with pyfive.File(t1file) as hfile: + ds1 = hfile['months'] + assert np.array_equal(m_array, ds1) + +def 
test_vlen_string_nc2(tmp_path): + tfile = tmp_path / 'test_vlen_string.nc' + m_array = ["January", "February", "March", "April"] + make_file_nc(tfile, m_array) + + with nc.Dataset(tfile, 'r') as ncfile: + ds1 = ncfile.variables['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) + + with h5py.File(tfile, 'r') as pfile: + ds1 = pfile['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) + + with pyfive.File(tfile) as hfile: + ds1 = hfile['months'][:] + assert np.array_equal(m_array, ds1.astype(str)) \ No newline at end of file From c7e157c641011db58cd7fb486dc1db71c5310d13 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:46:52 +0000 Subject: [PATCH 086/105] Better .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1679dae..fbf3007 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ .coverage .pyc build -__pycache__ +__pycache__/ *.egg-info .idea .DS_Store test-reports/ +<_io.Bytes*> From 40c898b7700d0af92f137341087c1cd36034e6cb Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 20 Jan 2025 14:57:51 +0000 Subject: [PATCH 087/105] Giving up on in-memory netcdf tests for #29 --- tests/test_vlen_str.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_vlen_str.py b/tests/test_vlen_str.py index c0c36e0..28ec2c6 100644 --- a/tests/test_vlen_str.py +++ b/tests/test_vlen_str.py @@ -11,13 +11,17 @@ def make_file_hdf5(file_like, _vlen_string): v = hfile.create_dataset("var_len_str", (1,), dtype=dt) v[0] = _vlen_string -def make_file_nc(file_like,m_array): +def make_file_nc(file_like,m_array, inmemory=False): - n = nc.Dataset(file_like, "w", format="NETCDF4") + if inmemory: + n = nc.Dataset(file_like, 'w', diskless=True) + else: + n = nc.Dataset(file_like, "w", format="NETCDF4") n.createDimension("time", 4) months = n.createVariable("months", str, ("time",)) months[:] = np.array(m_array, dtype="S8") - n.close() + if not inmemory: + n.close() def test_vlen_string_hdf5(): @@ -29,12 +33,12 @@ def test_vlen_string_hdf5(): ds1 = hfile['var_len_str'] assert _vlen_string == ds1[0] -def test_vlen_string_nc1(): +def NOtest_vlen_string_nc1(): """ this verson currently fails because netcdf4 is doing something odd in memory """ t1file = io.BytesIO() m_array = ["January", "February", "March", "April"] - make_file_nc(t1file,m_array) + make_file_nc(t1file,m_array, inmemory=True) with nc.Dataset(t1file,'r') as ncfile: ds1 = ncfile['months'] From 8acf067af4e73487540956e8c7b3bc4c919eeb4f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Jan 2025 15:28:23 +0000 Subject: [PATCH 088/105] explicitly close POSIX files --- pyfive/h5d.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index d031b76..c33f206 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -273,8 +273,9 @@ def _get_contiguous_data(self, args): # Create a memory-map to the stored array, which # means that we will end up only copying the # sub-array into in memory. 
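+                    # keep a local handle so the file can be closed explicitly once the data has been copied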
+ fh = self._fh view = np.memmap( - self._fh, + fh, dtype=self.dtype, mode='c', offset=self.data_offset, @@ -285,6 +286,7 @@ def _get_contiguous_data(self, args): result = view[args] # Copy the data from disk to physical memory result = result.view(type=np.ndarray) + fh.close() return result except UnsupportedOperation: return self._get_direct_from_contiguous(args) @@ -294,10 +296,14 @@ def _get_contiguous_data(self, args): size = self.dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') + + fh = self._fh ref_addresses = np.memmap( - self._fh, dtype=(' Date: Tue, 21 Jan 2025 07:43:46 +0000 Subject: [PATCH 089/105] vlen strings data test case, vanilla version, and version with missing defined. tests fail. --- pyfive/h5d.py | 4 ++-- tests/test_vlen_str.py | 35 ++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index bedd6aa..79a7ab3 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -286,6 +286,8 @@ def _get_contiguous_data(self, args): self._fh, dtype=(' {our_view}') + with open(our_view,'r') as f: + for line in f.readlines(): + print(line) + + with pyfive.File(our_file) as hfile: - ds1 = hfile['var_len_str'] - assert _vlen_string == ds1[0] + ds1 = hfile['var_len_str'][:] + print(ds1) + assert np.array_equal(ds1,vlen_strings) def NOtest_vlen_string_nc1(): """ this verson currently fails because netcdf4 is doing something odd in memory """ @@ -44,12 +55,10 @@ def NOtest_vlen_string_nc1(): ds1 = ncfile['months'] assert np.array_equal(m_array, ds1) - with h5py.File(t1file) as pfile: ds1 = pfile['months'] assert np.array_equal(m_array, ds1) - with pyfive.File(t1file) as hfile: ds1 = hfile['months'] assert np.array_equal(m_array, ds1) From 0e4a45b5472ae70274db5fcdc34b4c30c19c7335 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 21 Jan 2025 10:06:09 +0000 Subject: [PATCH 090/105] add extra posix test for file closure --- pyfive/h5d.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index c33f206..4f8e5a9 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -302,7 +302,9 @@ def _get_contiguous_data(self, args): fh, dtype=(' Date: Tue, 21 Jan 2025 13:02:20 +0000 Subject: [PATCH 091/105] More on h5d and testing. The iter_chunks method is broken and we now have a test which shows that. --- pyfive/h5d.py | 29 ++++++++++++++++++----------- tests/test_h5d.py | 37 +++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 79a7ab3..7932f28 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -169,7 +169,7 @@ def iter_chunks(self, args): The optional sel argument is a slice or tuple of slices that defines the region to be used. If not set, the entire dataspace will be used for the iterator. For each chunk within the given region, the iterator yields a tuple of slices that gives the - intersection of the given chunk with the selection area. + intersection of the given chunk with the selection area. This can be used to read data in that chunk. """ if self.chunks is None: @@ -177,7 +177,7 @@ def iter_chunks(self, args): def convert_selection(tuple_of_slices): # while a slice of the form slice(a,b,None) is equivalent - # in funtion to a slice of form (a,b,1) it is not the same. + # in function to a slice of form (a,b,1) it is not the same. 
# For compatability I've gone for "the same" def convert_slice(aslice): if aslice.step is None: @@ -186,7 +186,19 @@ def convert_slice(aslice): return tuple([convert_slice(a) for a in tuple_of_slices]) array = ZarrArrayStub(self.shape, self.chunks) - indexer = OrthogonalIndexer(args, array) + + if args: + # convert to getitem type args + converted = [] + for s in args: + if isinstance(s, slice) and (s.stop - s.start) == 1: + converted.append(s.start) + else: + converted.append(s) + args = tuple(converted) + indexer = OrthogonalIndexer(*args, array) + else: + indexer = OrthogonalIndexer(args, array) for _, _, out_selection in indexer: yield convert_selection(out_selection) @@ -244,14 +256,8 @@ def _build_index(self, dataobject): dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) self._index = {} - # we do this to avoid either using an iterator or many - # temporary list creations if there are repeated chunk accesses. self._nthindex = [] - # The zarr orthogonal indexer returns the position in chunk - # space, whereas pyfive wants the position in array space. - # Here we index the pyfive chunk_index in zarr index space. - for node in chunk_btree.all_nodes[0]: for node_key, addr in zip(node['keys'], node['addresses']): start = node_key['chunk_offset'][:-1] @@ -398,16 +404,17 @@ def _get_selection_via_chunks(self, args): out = np.empty(out_shape, dtype=dtype, order=self._order) for chunk_coords, chunk_selection, out_selection in indexer: + # map from chunk coordinate space to array space which is how hdf5 keeps the index chunk_coords = tuple(map(mul, chunk_coords, self.chunks)) filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: - # FIXME: Why do I assume it's always a V1 Btree? + # we are only using the class method here, future filter pipelines may need their own function chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] if true_dtype is not None: - # no idea if this is going to work! 
+ if dtype_class == 'REFERENCE': to_reference = np.vectorize(Reference) out = to_reference(out) diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 6f126d3..24bbb6e 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -2,17 +2,13 @@ import pyfive from pathlib import Path import pytest +from operator import mul mypath = Path(__file__).parent filename = 'compressed.hdf5' variable_name = 'dataset3' breaking_address=(2,0) -#mypath = mypath.parent/'bnl/' -#filename = 'da193o_25_day__grid_T_198807-198807.nc' -#variable_name = 'tos' -# breaking_address=(2,0,3) - def chunk_down(ff, vv): """ Test the chunking stuff @@ -33,14 +29,18 @@ def chunk_down(ff, vv): return n, c.chunk_offset, c.filter_mask, c.byte_offset, c.size, d, v -def get_chunks(ff, vv, view=0): +def get_chunks(ff, vv): var = ff[vv] chunks = list(var.iter_chunks()) - for i in range(view): - print('Chunk ',i) - print(chunks[i]) return chunks +def get_slices(var, using_py5): + """ Return suitlable test slice from var """ + rank = len(var.shape) + assert rank == 2 + slice1 = slice(2,3) + slice2 = slice(2,3) + return (slice1, slice2) def test_h5d_chunking_details(): @@ -52,7 +52,7 @@ def test_h5d_chunking_details(): assert h5detail == p5detail -def test_iter_chunks(): +def test_iter_chunks_all(): with h5py.File(mypath/filename) as f: h5chunks = get_chunks(f, variable_name) @@ -62,3 +62,20 @@ def test_iter_chunks(): assert h5chunks == p5chunks +def test_iter_chunks_sel(): + """ I don't really understand what h5py is doing here, so + obviously I don't have the right method in pyfive and/ + or the right test #FIXME """ + + with h5py.File(mypath/filename) as f: + var = f[variable_name] + slices = get_slices(var, False) + h5chunks = list(var.iter_chunks(slices)) + print(h5chunks) + + with pyfive.File(mypath/filename) as g: + var = g[variable_name] + slices = get_slices(var, True) + p5chunks = list(var.iter_chunks(slices)) + + assert h5chunks == p5chunks \ No newline at end of file From 82dc2a9b75ed2d7a63b86d706b68e4029adccb47 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 14:19:35 +0000 Subject: [PATCH 092/105] Support for pyactivestorage via a bespoke `get_chunk_info_from_chunk_coord` method --- pyfive/h5d.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 7932f28..1ff1cf3 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -107,7 +107,6 @@ def __eq__(self, other): def get_chunk_info(self, index): """ Retrieve storage information about a chunk specified by its index. - Our index is in chunk space, but H5Py wants it in coordinate space. """ if not self._index: return None @@ -226,6 +225,20 @@ def set_psuedo_chunk_size(self, newsize_MB): pass # silently ignore it, we'll be using a np.memmap else: raise ValueError('Attempt to set pseudo chunking on non-contigous variable') + + def get_chunk_info_from_chunk_coord(self, chunk_coords): + """ + Retrieve storage information about a chunk specified by its index. + This index is in chunk space (as used by zarr) and needs to be converted + to hdf5 coordinate space. Additionaly, if this file is not chunked, the storeinfo + is returned for the contiguous data as if it were one chunk. 
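+        For example, with chunks of (6, 32, 32) the chunk coordinate (1, 0, 2) maps to the HDF5 coordinate key (6, 0, 64).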
+ """ + if not self._index: + dummy = StoreInfo(None, None, self.data_offset, self.dtype.itemsize*np.prod(self.shape)) + return dummy + else: + coord_index = tuple(map(mul, chunk_coords, self.chunks)) + return self.get_chunk_info_by_coord(coord_index) ###### # The following DatasetID methods are used by PyFive and you wouldn't expect From 838b0a52f4d258ccc82063fe9034bd46364e58a1 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 17:02:34 +0000 Subject: [PATCH 093/105] better ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fbf3007..94d92df 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ .DS_Store test-reports/ <_io.Bytes*> +tests/__pycache__/ From a6336839490935ea0efe8a6b0bc9ef935ece7181 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 21 Jan 2025 18:13:20 +0000 Subject: [PATCH 094/105] The first vlen data test passes with this code --- pyfive/core.py | 2 ++ pyfive/dataobjects.py | 10 +++------- pyfive/h5d.py | 12 +++++++++++- pyfive/misc_low_level.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/pyfive/core.py b/pyfive/core.py index b0d2718..4d9dba6 100644 --- a/pyfive/core.py +++ b/pyfive/core.py @@ -60,3 +60,5 @@ def _unpack_integer(nbytes, buf, offset=0): fmt = "{}s".format(nbytes) values = struct.unpack_from(fmt, buf, offset=offset) return int.from_bytes(values[0], byteorder="little", signed=False) + + diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index ba24a7b..2639ab5 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -19,7 +19,7 @@ from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders from pyfive.btree import BTreeV2AttrCreationOrder, BTreeV2AttrNames from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER -from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap +from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID from pyfive.h5d import DatasetID from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub @@ -686,9 +686,8 @@ def is_datatype(self): return True else: return False - - + def determine_data_shape(buf, offset): """ Return the shape of the dataset pointed to in a Dataspace message. """ @@ -723,10 +722,7 @@ def determine_data_shape(buf, offset): # all metadata fields are stored in little-endian byte order. -GLOBAL_HEAP_ID = OrderedDict(( - ('collection_address', 'Q'), # 8 byte addressing - ('object_index', 'I'), -)) + GLOBAL_HEAP_ID_SIZE = _structure_size(GLOBAL_HEAP_ID) # IV.A.2.m The Attribute Message diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 79a7ab3..5fcf1bd 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -4,7 +4,9 @@ from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks from pyfive.core import Reference, UNDEFINED_ADDRESS +from pyfive.misc_low_level import get_vlen_string_data from io import UnsupportedOperation + import struct import logging from importlib.metadata import version @@ -69,6 +71,13 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): self.shape = dataobject.shape self.rank = len(self.shape) self.chunks = dataobject.chunks + + # experimental code. We need to find out whether or not this + # is unnecessary duplication. At the moment it seems best for + # each variable to have it's own copy of those needed for + # data access. Though that's clearly not optimal if they include + # other data. To be determined. 
+ self._global_heaps={} self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params() self._unique = (self._filename, self.shape, self._msg_offset) @@ -287,7 +296,8 @@ def _get_contiguous_data(self, args): shape=self.shape, order=self._order) return np.array([Reference(addr) for addr in ref_addresses])[args] elif dtype_class == 'VLEN_STRING': - raise NotImplementedError('Variable length strings under construction') + fh = self._fh + return get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index e4c8786..e7993e7 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -11,6 +11,7 @@ from .core import _unpack_integer from .core import InvalidHDF5File from .core import UNDEFINED_ADDRESS +from math import prod class SuperBlock(object): @@ -333,6 +334,33 @@ def _indirect_info(self, nrows): nindirect = nobjects - ndirect_max return ndirect, nindirect +def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): + """ Return the data for a variable which is + made up of variable length string data """ + fh.seek(data_offset) + count = prod(shape) + _, _, character_set = dtype + value = np.empty(count,dtype=object) + offset = 0 + buf = fh.read(16*count) + for i in range(count): + vlen_size, = struct.unpack_from(' Date: Tue, 21 Jan 2025 19:59:46 +0000 Subject: [PATCH 095/105] closer to a solution for #29. These tests pass, but we need to deal with multidimensional arrays as well. --- pyfive/dataobjects.py | 11 +++++++++-- pyfive/misc_low_level.py | 7 +++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 2639ab5..a327163 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -335,8 +335,15 @@ def fillvalue(self): size = 0 if size: - payload = self.msg_data[offset:offset+size] - fillvalue = np.frombuffer(payload, self.dtype, count=1)[0] + if isinstance(self.dtype, tuple): + try: + assert self.dtype[0] == 'VLEN_STRING' + except: + raise ValueError('Unrecognised fill type') + fillvalue = self._attr_value(self.dtype, self.msg_data, 1, offset)[0] + else: + payload = self.msg_data[offset:offset+size] + fillvalue = np.frombuffer(payload, self.dtype, count=1)[0] else: fillvalue = 0 return fillvalue diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index e7993e7..2d11d0c 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -12,6 +12,7 @@ from .core import InvalidHDF5File from .core import UNDEFINED_ADDRESS from math import prod +import numpy as np class SuperBlock(object): @@ -335,8 +336,9 @@ def _indirect_info(self, nrows): return ndirect, nindirect def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): - """ Return the data for a variable which is - made up of variable length string data """ + """ Return the data for a variable which is made up of variable length string data """ + # we need to import this from DatasetID, and that's imported from Dataobjects hence + # hiding it here in misc_low_level. 
fh.seek(data_offset) count = prod(shape) _, _, character_set = dtype @@ -347,6 +349,7 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' Date: Wed, 22 Jan 2025 07:35:49 +0000 Subject: [PATCH 096/105] Partially working vlen string support, issues with global heap usage and dtypes remain --- pyfive/dataobjects.py | 1 + pyfive/misc_low_level.py | 2 +- tests/test_vlen_str.py | 105 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 2 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index a327163..d1bb5f5 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -285,6 +285,7 @@ def _vlen_size_and_data(self, buf, offset): # stored in the data object storage. gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4) gheap_address = gheap_id['collection_address'] + print('Collection address in _vlen', gheap_address) if gheap_address not in self._global_heaps: # load the global heap and cache the instance gheap = GlobalHeap(self.fh, gheap_address) diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 2d11d0c..58fd50c 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -349,7 +349,7 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' Failing {k}') + print('Original data', v) + print('h5py', hdata) + print('pyfive', pdata) + raise From e7c465e3b096a0e9cd170fee9a54a978fc1be6b8 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 22 Jan 2025 09:08:49 +0000 Subject: [PATCH 097/105] Passing all vlen tests for #29, though we are ignoreing the dtype of the dataset itself, which needs to be a new issue. Also the caching stuff needs to be a new issue. --- pyfive/dataobjects.py | 2 +- pyfive/h5d.py | 3 ++- pyfive/misc_low_level.py | 17 ++++++++++------- tests/test_vlen_str.py | 26 +++++++++++++------------- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index d1bb5f5..55af76c 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -285,7 +285,7 @@ def _vlen_size_and_data(self, buf, offset): # stored in the data object storage. 
gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4) gheap_address = gheap_id['collection_address'] - print('Collection address in _vlen', gheap_address) + #print('Collection address in _vlen', gheap_address) if gheap_address not in self._global_heaps: # load the global heap and cache the instance gheap = GlobalHeap(self.fh, gheap_address) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 5fcf1bd..b1ad147 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -297,7 +297,8 @@ def _get_contiguous_data(self, args): return np.array([Reference(addr) for addr in ref_addresses])[args] elif dtype_class == 'VLEN_STRING': fh = self._fh - return get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + return array.reshape(self.shape, order=self._order) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 58fd50c..216ad62 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -342,6 +342,8 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): fh.seek(data_offset) count = prod(shape) _, _, character_set = dtype + if int(character_set) not in [0, 1]: + raise ValueError(f'Unexpected string type, cannot decode character set {character_set}') value = np.empty(count,dtype=object) offset = 0 buf = fh.read(16*count) @@ -349,19 +351,20 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): vlen_size, = struct.unpack_from(' {our_view}') - with open(our_view,'r') as f: - for line in f.readlines(): - print(line) + #os.system(f'h5dump {our_file} > {our_view}') + #with open(our_view,'r') as f: + # for line in f.readlines(): + # print(line) with pyfive.File(our_file) as hfile: @@ -163,22 +163,22 @@ def test_vlen_string_nc2(tmp_path): ds1 = hfile['months'][:] assert np.array_equal(m_array, ds1.astype(str)) -def test_pathalogical_strings(tmp_path): +def test_pathological_strings(tmp_path): tfile = tmp_path/'test_strings.nc' validation=make_pathological_nc(tfile) + warnings.warn('Validation of variable length strings assumes h5py is wrong') with pyfive.File(tfile) as pfile: with h5py.File(tfile) as hfile: for k,v in validation.items(): hdata = hfile[k][...] - #decode_function = np.vectorize(lambda x: x.decode('utf-8')) - #hdata = decode_function(hdata) pdata = pfile[k][...] 
try: assert np.array_equal(v, pdata),f'Failed original test for {k}' - assert np.array_equal(hdata, pdata), f'Failed comparison test for {d}' + assert np.array_equal(hdata.astype(str), pdata.astype(str)), f'Failed comparison test for {k}' + print(f'--> Passing {k} ({hdata.dtype},{pdata.dtype})') except: - print(f'---> Failing {k}') + print(f'---> Failing {k} ({hdata.dtype},{pdata.dtype})') print('Original data', v) - print('h5py', hdata) - print('pyfive', pdata) + print('h5py', hfile[k].dtype, hdata) + print('pyfive',pfile[k].dtype, pdata) raise From e40c7d75d9472e5efa07e5866febaf17d1badb9f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 22 Jan 2025 14:53:55 +0000 Subject: [PATCH 098/105] Remaining tests for vlen and iterchunks, support for vlen dtypes (closes #34) --- pyfive/h5d.py | 71 +++++++++++++++++++++++++------------------- pyfive/high_level.py | 7 +---- tests/test_h5d.py | 18 ++++++----- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index dd5381d..af53583 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -84,9 +84,9 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4): if isinstance(dataobject.dtype,tuple): # this may not behave the same as h5py, do we care? #FIXME - self.dtype = dataobject.dtype + self._dtype = dataobject.dtype else: - self.dtype = np.dtype(dataobject.dtype) + self._dtype = np.dtype(dataobject.dtype) self._meta = DatasetMeta(dataobject) @@ -158,13 +158,13 @@ def get_data(self, args): case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self.dtype)[args] + return np.zeros(self.shape, dtype=self._dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: - return np.zeros(self.shape, dtype=self.dtype)[args] - if isinstance(self.dtype, tuple): + return np.zeros(self.shape, dtype=self._dtype)[args] + if isinstance(self._dtype, tuple): # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] else: @@ -196,19 +196,20 @@ def convert_slice(aslice): array = ZarrArrayStub(self.shape, self.chunks) if args: - # convert to getitem type args - converted = [] - for s in args: - if isinstance(s, slice) and (s.stop - s.start) == 1: - converted.append(s.start) - else: - converted.append(s) - args = tuple(converted) - indexer = OrthogonalIndexer(*args, array) + # We have implemented what the docstring says it does below, + # but that's not what h5py actually does, and what is it + # actually does is useless, so we haven't implemented that + raise NotImplementedError("h5py does something silly, and our implementation does not") + indexer = OrthogonalIndexer(args[0], array) else: indexer = OrthogonalIndexer(args, array) - for _, _, out_selection in indexer: - yield convert_selection(out_selection) + for chunk_coords, chunk_selection, out_selection in indexer: + if args: + yield convert_selection(chunk_selection) + else: + yield convert_selection(out_selection) + + ##### The following property is made available to support ActiveStorage ##### and to help those who may want to generate kerchunk indices and @@ -243,7 +244,7 @@ def get_chunk_info_from_chunk_coord(self, chunk_coords): is returned for the contiguous data as if it were one chunk. 
""" if not self._index: - dummy = StoreInfo(None, None, self.data_offset, self.dtype.itemsize*np.prod(self.shape)) + dummy = StoreInfo(None, None, self.data_offset, self._dtype.itemsize*np.prod(self.shape)) return dummy else: coord_index = tuple(map(mul, chunk_coords, self.chunks)) @@ -291,7 +292,7 @@ def _build_index(self, dataobject): def _get_contiguous_data(self, args): - if not isinstance(self.dtype, tuple): + if not isinstance(self._dtype, tuple): if not self.posix: # Not posix return self._get_direct_from_contiguous(args) @@ -304,7 +305,7 @@ def _get_contiguous_data(self, args): fh = self._fh view = np.memmap( fh, - dtype=self.dtype, + dtype=self._dtype, mode='c', offset=self.data_offset, shape=self.shape, @@ -319,9 +320,9 @@ def _get_contiguous_data(self, args): except UnsupportedOperation: return self._get_direct_from_contiguous(args) else: - dtype_class = self.dtype[0] + dtype_class = self._dtype[0] if dtype_class == 'REFERENCE': - size = self.dtype[1] + size = self._dtype[1] if size != 8: raise NotImplementedError('Unsupported Reference type - size {size}') @@ -336,7 +337,7 @@ def _get_contiguous_data(self, args): return result elif dtype_class == 'VLEN_STRING': fh = self._fh - array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self.dtype) + array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) return array.reshape(self.shape, order=self._order) else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -353,7 +354,7 @@ def _get_direct_from_contiguous(self, args=None): """ def __get_pseudo_shape(): """ Determine an appropriate chunk and stride for a given pseudo chunk size """ - element_size = self.dtype.itemsize + element_size = self._dtype.itemsize chunk_shape = np.copy(self.shape) while True: chunk_size = np.prod(chunk_shape) * element_size @@ -381,17 +382,17 @@ def coord_to_offset(self,chunk_coords): array = ZarrArrayStub(self.shape, chunk_shape) indexer = OrthogonalIndexer(args, array) out_shape = indexer.shape - out = np.empty(out_shape, dtype=self.dtype, order=self._order) + out = np.empty(out_shape, dtype=self._dtype, order=self._order) chunk_size = np.prod(chunk_shape) for chunk_coords, chunk_selection, out_selection in indexer: index = self.data_offset + offset_finder.coord_to_offset(chunk_coords) fh.seek(index) chunk_buffer = fh.read(stride) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = np.frombuffer(chunk_buffer, dtype=self._dtype).copy() if len(chunk_data) < chunk_size: # last chunk over end of file - padded_chunk_data = np.zeros(chunk_size, dtype=self.dtype) + padded_chunk_data = np.zeros(chunk_size, dtype=self._dtype) padded_chunk_data[:len(chunk_data)] = chunk_data chunk_data = padded_chunk_data out[out_selection] = chunk_data.reshape(chunk_shape, order=self._order)[chunk_selection] @@ -402,7 +403,7 @@ def coord_to_offset(self,chunk_coords): return out else: - itemsize = np.dtype(self.dtype).itemsize + itemsize = np.dtype(self._dtype).itemsize num_elements = np.prod(self.shape, dtype=int) num_bytes = num_elements*itemsize @@ -410,7 +411,7 @@ def coord_to_offset(self,chunk_coords): # read the lot) fh.seek(self.data_offset) chunk_buffer = fh.read(num_bytes) - chunk_data = np.frombuffer(chunk_buffer, dtype=self.dtype).copy() + chunk_data = np.frombuffer(chunk_buffer, dtype=self._dtype).copy() chunk_data = chunk_data.reshape(self.shape, order=self._order) chunk_data = chunk_data[args] if self.posix: @@ -436,9 +437,9 @@ def 
_get_selection_via_chunks(self, args): the dataset array and in doing so, only load the relevant chunks. """ # need a local dtype as we may override it for a reference read. - dtype = self.dtype + dtype = self._dtype - if isinstance(self.dtype, tuple): + if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) dtype_class = dtype[0] @@ -465,7 +466,7 @@ def _get_selection_via_chunks(self, args): filter_mask, chunk_buffer = self.read_direct_chunk(chunk_coords) if self.filter_pipeline is not None: # we are only using the class method here, future filter pipelines may need their own function - chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self.dtype.itemsize) + chunk_buffer = BTreeV1RawDataChunks._filter_chunk(chunk_buffer, filter_mask, self.filter_pipeline, self._dtype.itemsize) chunk_data = np.frombuffer(chunk_buffer, dtype=dtype).copy() out[out_selection] = chunk_data.reshape(self.chunks, order=self._order)[chunk_selection] @@ -505,6 +506,14 @@ def _fh(self): return fh + @property + def dtype(self): + if isinstance(self._dtype,tuple): + if self._dtype[0] == 'VLEN_STRING': + return object + + return self._dtype + class DatasetMeta: diff --git a/pyfive/high_level.py b/pyfive/high_level.py index d1350dd..1fa6859 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -349,12 +349,7 @@ def ndim(self): @property def dtype(self): """ dtype attribute. """ - try: - return np.dtype(self.id.dtype) - except NotImplementedError as e: - raise NotImplementedError(f'{e} (for {self.name})') - except TypeError as e: - return self.id.dtype + return self.id.dtype @property def value(self): diff --git a/tests/test_h5d.py b/tests/test_h5d.py index 24bbb6e..4fdb7fb 100644 --- a/tests/test_h5d.py +++ b/tests/test_h5d.py @@ -38,8 +38,8 @@ def get_slices(var, using_py5): """ Return suitlable test slice from var """ rank = len(var.shape) assert rank == 2 - slice1 = slice(2,3) - slice2 = slice(2,3) + slice1 = slice(8,15) + slice2 = slice(8,15) return (slice1, slice2) def test_h5d_chunking_details(): @@ -71,11 +71,13 @@ def test_iter_chunks_sel(): var = f[variable_name] slices = get_slices(var, False) h5chunks = list(var.iter_chunks(slices)) - print(h5chunks) + #print(h5chunks,var.shape, var.chunks) - with pyfive.File(mypath/filename) as g: - var = g[variable_name] - slices = get_slices(var, True) - p5chunks = list(var.iter_chunks(slices)) + with pytest.raises(NotImplementedError): + with pyfive.File(mypath/filename) as g: + var = g[variable_name] + slices = get_slices(var, True) + p5chunks = list(var.iter_chunks(slices)) + #print(p5chunks,var.shape, var.chunks) - assert h5chunks == p5chunks \ No newline at end of file + assert h5chunks == p5chunks \ No newline at end of file From 599db7b993b23d006317ce665e22e8ff676982d5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 23 Jan 2025 09:57:51 +0000 Subject: [PATCH 099/105] dev --- pyfive/h5d.py | 113 ++++++++++++++++++++++++++++++++------- pyfive/misc_low_level.py | 5 +- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index af53583..30c2442 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -303,7 +303,7 @@ def _get_contiguous_data(self, args): # means that we will end up only copying the # sub-array into in memory. 
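# The copy-on-write memory map referred to above can be shown in isolation;
# 'example.bin' and the shape/dtype here are placeholders for illustration,
# not anything pyfive itself creates:
import numpy as np
np.arange(12, dtype='<f8').reshape(3, 4).tofile('example.bin')
view = np.memmap('example.bin', dtype='<f8', mode='c', offset=0,
                 shape=(3, 4), order='C')
subset = np.asarray(view[1:, :2])   # only the pages holding these rows are read
print(subset)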
fh = self._fh - view = np.memmap( + view = np.memmap( fh, dtype=self._dtype, mode='c', @@ -338,7 +338,7 @@ def _get_contiguous_data(self, args): elif dtype_class == 'VLEN_STRING': fh = self._fh array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) - return array.reshape(self.shape, order=self._order) + return array.reshape(self.shape, order=self._order)[args] else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -438,7 +438,8 @@ def _get_selection_via_chunks(self, args): """ # need a local dtype as we may override it for a reference read. dtype = self._dtype - + print ('DTYPE', dtype) + print ('args =', args) if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) @@ -448,8 +449,11 @@ def _get_selection_via_chunks(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type') dtype = ' Date: Thu, 23 Jan 2025 17:22:09 +0000 Subject: [PATCH 100/105] dev --- pyfive/h5d.py | 158 ++++++++++++++++----------------------- pyfive/high_level.py | 1 + pyfive/indexing.py | 2 +- pyfive/misc_low_level.py | 73 +++++++++++++++--- tests/test_vlen_str.py | 73 ++++++++++++++++-- 5 files changed, 192 insertions(+), 115 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 30c2442..6a5b018 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -4,7 +4,7 @@ from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub from pyfive.btree import BTreeV1RawDataChunks from pyfive.core import Reference, UNDEFINED_ADDRESS -from pyfive.misc_low_level import get_vlen_string_data +from pyfive.misc_low_level import get_vlen_string_data_contiguous, get_vlen_string_data_from_chunk from io import UnsupportedOperation import struct @@ -152,20 +152,30 @@ def read_direct_chunk(self, chunk_position, **kwargs): def get_data(self, args): """ Called by the dataset getitem method """ + dtype = self._dtype match self.layout_class: case 0: #compact storage raise NotImplementedError("Compact Storage") case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: # no storage is backing array, return all zeros - return np.zeros(self.shape, dtype=self._dtype)[args] + if isinstance(dtype, tuple): + return np.zeros(self.shape, dtype='U')[args] + else: + print(99, self.__dict__) + return np.full(self.shape, self.fillvalue, dtype=dtype)[args] +# return np.zeros(self.shape, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: - return np.zeros(self.shape, dtype=self._dtype)[args] - if isinstance(self._dtype, tuple): - # references need to read all the chunks for now + if isinstance(dtype, tuple): + return np.zeros(self.shape, dtype='U')[args] + else: + print(991) + return np.zeros(self.shape, dtype=dtype)[args] + if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": + # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] else: # this is lazily reading only the chunks we need @@ -337,7 +347,16 @@ def _get_contiguous_data(self, args): return result elif dtype_class == 'VLEN_STRING': fh = self._fh - array = get_vlen_string_data(fh, self.data_offset, self._global_heaps, self.shape, self._dtype) + array = get_vlen_string_data_contiguous( + fh, + self.data_offset, + self._global_heaps, + self.shape, + self._dtype + ) + if self.posix: + fh.close() + return array.reshape(self.shape, order=self._order)[args] else: raise NotImplementedError(f'datatype not implemented - {dtype_class}') @@ -432,14 +451,13 
@@ def _get_raw_chunk(self, storeinfo): return out def _get_selection_via_chunks(self, args): - """ - Use the zarr orthogonal indexer to extract data for a specfic selection within - the dataset array and in doing so, only load the relevant chunks. + """Use the zarr orthogonal indexer to extract data for a specfic + selection within the dataset array and in doing so, only load + the relevant chunks. + """ # need a local dtype as we may override it for a reference read. dtype = self._dtype - print ('DTYPE', dtype) - print ('args =', args) if isinstance(self._dtype, tuple): # this is a reference and we're returning that true_dtype = tuple(dtype) @@ -449,110 +467,62 @@ def _get_selection_via_chunks(self, args): if size != 8: raise NotImplementedError('Unsupported Reference type') dtype = '' % info def __getitem__(self, args): + print (88888, self.fillvalue) data = self.id.get_data(args) if self._astype is None: return data diff --git a/pyfive/indexing.py b/pyfive/indexing.py index e1d9509..b8e4c62 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -757,7 +757,7 @@ def __init__(self, selection, array): f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexer = SliceDiImndexer(slice_, dim_len, dim_chunk_size) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 5cd8f8a..19419e9 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -132,8 +132,7 @@ class GlobalHeap(object): """ def __init__(self, fh, offset): - - print ('fh=', fh, offset) + fh.seek(offset) header = _unpack_struct_from_file(GLOBAL_HEAP_HEADER, fh) assert header['signature'] == b'GCOL' @@ -336,7 +335,9 @@ def _indirect_info(self, nrows): nindirect = nobjects - ndirect_max return ndirect, nindirect -def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): +def get_vlen_string_data_contiguous( + fh, data_offset, global_heaps, shape, dtype +): """ Return the data for a variable which is made up of variable length string data """ # we need to import this from DatasetID, and that's imported from Dataobjects hence # hiding it here in misc_low_level. 
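# For orientation: each variable-length string element in the dataset is a
# 16-byte descriptor (4-byte length, then a global heap ID made of an 8-byte
# collection address and a 4-byte object index); the string bytes themselves
# live in the global heap object it points at.  A sketch of unpacking one
# descriptor, with made-up values and the heap lookup itself omitted:
import struct
descriptor = struct.pack('<IQI', 5, 2048, 3)      # length, heap address, object index
vlen_size, heap_address, object_index = struct.unpack_from('<IQI', descriptor, 0)
assert (vlen_size, heap_address, object_index) == (5, 2048, 3)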
@@ -349,10 +350,8 @@ def get_vlen_string_data(fh, data_offset, global_heaps, shape, dtype): offset = 0 buf = fh.read(16*count) for i in range(count): - vlen_size, = struct.unpack_from(' Date: Thu, 23 Jan 2025 17:51:56 +0000 Subject: [PATCH 101/105] dev --- pyfive/h5d.py | 14 ++++++-------- pyfive/high_level.py | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 6a5b018..00fdd5b 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -150,7 +150,7 @@ def read_direct_chunk(self, chunk_position, **kwargs): storeinfo = self._index[chunk_position] return storeinfo.filter_mask, self._get_raw_chunk(storeinfo) - def get_data(self, args): + def get_data(self, args, fillvalue): """ Called by the dataset getitem method """ dtype = self._dtype match self.layout_class: @@ -158,13 +158,12 @@ def get_data(self, args): raise NotImplementedError("Compact Storage") case 1: # contiguous storage if self.data_offset == UNDEFINED_ADDRESS: - # no storage is backing array, return all zeros + # no storage is backing array, return an array of + # fill values if isinstance(dtype, tuple): - return np.zeros(self.shape, dtype='U')[args] - else: - print(99, self.__dict__) - return np.full(self.shape, self.fillvalue, dtype=dtype)[args] -# return np.zeros(self.shape, dtype=dtype)[args] + dtype = np.array(fillvalue).dtype + + return np.full(self.shape, fillvalue, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage @@ -172,7 +171,6 @@ def get_data(self, args): if isinstance(dtype, tuple): return np.zeros(self.shape, dtype='U')[args] else: - print(991) return np.zeros(self.shape, dtype=dtype)[args] if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": # references need to read all the chunks for now diff --git a/pyfive/high_level.py b/pyfive/high_level.py index ffdb19c..32c0e5a 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -8,7 +8,7 @@ import numpy as np -from pyfive.core import Reference +from pyfive.core import Reference, UNDEFINED_ADDRESS from pyfive.dataobjects import DataObjects, DatasetID from pyfive.misc_low_level import SuperBlock @@ -298,8 +298,7 @@ def __repr__(self): return '' % info def __getitem__(self, args): - print (88888, self.fillvalue) - data = self.id.get_data(args) + data = self.id.get_data(args, self.fillvalue) if self._astype is None: return data return data.astype(self._astype) From bd161475b1d128c1190071b7fadc733fa042c55f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 24 Jan 2025 16:27:36 +0000 Subject: [PATCH 102/105] vlen related fixes --- pyfive/h5d.py | 25 +++++++++++++++++-------- pyfive/high_level.py | 2 +- pyfive/misc_low_level.py | 2 +- tests/test_fillvalue.py | 38 ++++++++++++++++++++++++++++++++++++++ tests/test_vlen_str.py | 12 ++++++------ 5 files changed, 63 insertions(+), 16 deletions(-) diff --git a/pyfive/h5d.py b/pyfive/h5d.py index 00fdd5b..f731699 100644 --- a/pyfive/h5d.py +++ b/pyfive/h5d.py @@ -163,15 +163,21 @@ def get_data(self, args, fillvalue): if isinstance(dtype, tuple): dtype = np.array(fillvalue).dtype + # Note: We can improve this so only an array of + # the shape implied by 'args' is + # created. One for the future. 
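# The unallocated-storage branch ending on the next line just synthesises the
# result from the fill value; in isolation the idiom is as below (the shape,
# fill value and selection are invented for illustration):
import numpy as np
shape, fillvalue, args = (4, 5), -999.0, (slice(1, 3), slice(None))
dtype = np.array(fillvalue).dtype            # mirrors the tuple-dtype fallback above
data = np.full(shape, fillvalue, dtype=dtype)[args]
assert data.shape == (2, 5) and (data == fillvalue).all()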
return np.full(self.shape, fillvalue, dtype=dtype)[args] else: return self._get_contiguous_data(args) case 2: # chunked storage if not self._index: + # no storage is backing array, return an array of + # fill values if isinstance(dtype, tuple): - return np.zeros(self.shape, dtype='U')[args] - else: - return np.zeros(self.shape, dtype=dtype)[args] + dtype = np.array(fillvalue).dtype + + return np.full(self.shape, fillvalue, dtype=dtype)[args] + if isinstance(dtype, tuple) and dtype[0] == "REFERENCE": # references need to read all the chunks for now return self._get_selection_via_chunks(())[args] @@ -462,9 +468,11 @@ def _get_selection_via_chunks(self, args): dtype_class = dtype[0] if dtype_class == 'REFERENCE': size = dtype[1] + dtype = ' Date: Tue, 28 Jan 2025 09:35:55 +0000 Subject: [PATCH 103/105] Update pyfive/indexing.py --- pyfive/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfive/indexing.py b/pyfive/indexing.py index b8e4c62..e1d9509 100644 --- a/pyfive/indexing.py +++ b/pyfive/indexing.py @@ -757,7 +757,7 @@ def __init__(self, selection, array): f"expected integer or slice, got {type(dim_sel)!r}" ) - dim_indexer = SliceDiImndexer(slice_, dim_len, dim_chunk_size) + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) dim_indexers.append(dim_indexer) if start >= dim_len or start < 0: From eed7e99ce031db20f89339be2115187de9e1a581 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 30 Jan 2025 15:03:05 +0000 Subject: [PATCH 104/105] install only in test mode --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5e99483..623695f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest - python -m pip install -e . + python -m pip install .[test] # install in test mode if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | From 6255fc00eb658c65847b7bb1b88cf967e67f7f12 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Thu, 30 Jan 2025 15:15:56 +0000 Subject: [PATCH 105/105] actual correct name for testing regime --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 623695f..3e91bd9 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest - python -m pip install .[test] # install in test mode + python -m pip install .[testing] # install in test mode if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: |