@@ -23,17 +23,11 @@ class DatasetID:
     instance, it is completely independent of the parent file, and it can be used
     efficiently in distributed threads without thread contention to the b-tree etc.
     """
-    def __init__(self, dataobject, pseudo_chunking_size_MB=4):
+    def __init__(self, dataobject):
         """
         Instantiated with the pyfive dataset dataobject, we copy and cache everything
         we want so that the only file operations are now data accesses.
 
-        if pseudo_chunking_size_MB is set to a value greater than zero, and
-        if the storage is not local posix (and hence np.mmap is not available), then
-        when accessing contiguous variables, we attempt to find a suitable
-        chunk shape to approximate that volume and read the contiguous variable
-        as if it were chunked. This is to facilitate lazy loading of partial data
-        from contiguous storage.
         """
 
         self._order = dataobject.order
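
For callers, the signature change above is the whole visible surface of this commit: the pseudo-chunking knob disappears. A minimal before/after sketch (DatasetID and dataobject as in the diff; the keyword value 8 is just an illustration):

    # before this commit: callers could tune the pseudo chunk budget
    dsid = DatasetID(dataobject, pseudo_chunking_size_MB=8)

    # after this commit: no knob; passing the keyword raises TypeError
    dsid = DatasetID(dataobject)
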
@@ -42,8 +36,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
         try:
             dataobject.fh.fileno()
             self._filename = dataobject.fh.name
-            self.avoid_mmap = False
-            self.pseudo_chunking_size = 0
+            self.avoid_mmap = False
         except (AttributeError, OSError):
             try:
                 # maybe this is an S3File instance?
@@ -53,7 +46,6 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
                 # failing that, maybe a memory file, return as None
                 self._filename = getattr(self._fh, 'full_name', 'None')
                 self.avoid_mmap = True
-                self.pseudo_chunking_size = pseudo_chunking_size_MB * 1024 * 1024
         self.filter_pipeline = dataobject.filter_pipeline
         self.shape = dataobject.shape
         self.rank = len(self.shape)
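
The try/except ladder above is the capability probe that survives the commit: only a handle backed by a real POSIX file supports fileno(), so only then is np.memmap an option. A standalone sketch of that logic, with backed_by_posix_file as a hypothetical name rather than pyfive API:

    import io

    def backed_by_posix_file(fh):
        # real OS-level files expose a file descriptor; S3File and
        # in-memory handles raise AttributeError or OSError instead
        try:
            fh.fileno()
            return True
        except (AttributeError, OSError):
            return False

    assert not backed_by_posix_file(io.BytesIO(b"held in memory"))
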
@@ -126,9 +118,6 @@ def read_direct_chunk(self, chunk_position, **kwargs):
     def get_data(self, args):
         """ Called by the dataset getitem method """
 
-
-
-
         match self.layout_class:
             case 0:  # compact storage
                 raise NotImplementedError("Compact Storage")
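
For orientation, the match statement dispatches on the HDF5 data layout class carried in the dataset's data layout message; the numeric codes below follow the HDF5 file format specification (the dict itself is illustrative, not pyfive code):

    LAYOUT_CLASSES = {
        0: "compact",     # raw data lives inside the object header
        1: "contiguous",  # a single flat block starting at data_offset
        2: "chunked",     # b-tree indexed chunks, possibly filtered
    }
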
@@ -262,31 +251,14 @@ def _get_contiguous_data(self, args):
 
     def _get_direct_from_contiguous(self, args=None):
         """
-        If pseudo_chunking_size is set, we attempt to read the contiguous data in chunks;
-        otherwise we have to read the entire array. This is a fallback situation if we
-        can't use a memory map, which would otherwise be lazy. This will normally be when
-        we don't have a true Posix file. We should never end up here with compressed
-        data.
+        This is a fallback situation if we can't use a memory map, which would otherwise be lazy.
+        At the moment it is very unlazy.
         """
-        def __getstride():
-            """ Determine an appropriate chunk and stride for a given pseudo chunk size """
-            stride = 1
-            chunk_shape = np.ones(self.rank, dtype=int)
-            for i in range(self.rank):
-                stride *= self.shape[i]
-                chunk_shape = box[:i] = self.shape[:i]
-                if stride * self.dtype.itemsize > self.pseudo_chunking_size:
-                    stride //= self.shape[i]
-                    chunk_shape = box[:i] = self.shape[:i-1]
-            return chunk_shape, stride
 
         itemsize = np.dtype(self.dtype).itemsize
         # need to impose type in case self.shape is () in which case numpy would return a float
         num_elements = np.prod(self.shape, dtype=int)
         num_bytes = num_elements * itemsize
-
-        if self.pseudo_chunking_size:
-            stride = __getstride()
 
         # we need it all, let's get it all (i.e. this really does read the lot)
         self._fh.seek(self.data_offset)
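
The deleted __getstride was likely dead on arrival: box is never defined, and its two-element return value is bound to the single name stride at the call site, so the pseudo-chunked read path could not have worked as written. A corrected sketch of what the old docstring described, choosing a pseudo chunk that spans whole trailing dimensions (so each chunk is one contiguous byte range in C order) within a byte budget; pseudo_chunk_shape is an illustrative name, not pyfive API:

    def pseudo_chunk_shape(shape, itemsize, budget_bytes):
        # grow the chunk from the fastest-varying (last) axis towards
        # the first, taking whole axes while they fit the byte budget
        chunk = [1] * len(shape)
        span = itemsize
        for i in range(len(shape) - 1, -1, -1):
            if span * shape[i] <= budget_bytes:
                chunk[i] = shape[i]
                span *= shape[i]
            else:
                chunk[i] = max(1, budget_bytes // span)  # partial axis, then stop
                break
        return tuple(chunk)

    # a (365, 1000, 1000) float32 array against the old 4 MiB default
    print(pseudo_chunk_shape((365, 1000, 1000), 4, 4 * 1024 * 1024))
    # -> (1, 1000, 1000), i.e. roughly 4 MB per pseudo chunk

Given those bugs, deleting the path and reading the whole variable, as this commit does, is the conservative choice.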