Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop PPDataProxy accessing the file when no data is needed. #3659

Merged
merged 12 commits into from
Feb 19, 2020
32 changes: 24 additions & 8 deletions lib/iris/fileformats/pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,14 +644,30 @@ def ndim(self):
return len(self.shape)

def __getitem__(self, keys):
    """
    Return the part of the field data selected by ``keys``.

    ``keys`` is whatever ``proxy[...]`` passes in: either a single index
    (integer, slice, Ellipsis, ...) or a tuple of them.

    The result is an array of dtype ``self.dtype``.

    When any key is an "empty" slice (integer start equal to stop) the
    result cannot contain any data points, so it is produced without
    opening or reading the file.

    """
    def is_emptyslice(key):
        # True for slices such as 0:0 or 1:1, which select nothing.
        return (isinstance(key, slice)
                and isinstance(key.start, int)
                and key.start == key.stop)

    # A lone index arrives un-wrapped (e.g. ``proxy[0:0]`` passes a bare
    # slice), so wrap it before scanning the individual keys.
    key_tuple = keys if isinstance(keys, tuple) else (keys,)

    if any(is_emptyslice(key) for key in key_tuple):
        # Fake the result for an 'empty' slice : do not open + read the
        # file !!  "dask.array.from_array" fetches a no-data slice to
        # 'snapshot' the array metadata.
        # Broadcasting a single zero to the full shape allocates no data,
        # and indexing that view below lets numpy work out the result
        # shape, including for Ellipsis and newaxis keys.
        data = np.broadcast_to(np.zeros((), dtype=self.dtype), self.shape)
    else:
        with open(self.path, 'rb') as pp_file:
            pp_file.seek(self.offset, os.SEEK_SET)
            data_bytes = pp_file.read(self.data_len)
        data = _data_bytes_to_shaped_array(data_bytes,
                                           self.lbpack,
                                           self.boundary_packing,
                                           self.shape, self.src_dtype,
                                           self.mdi)
    data = data[keys]
    return np.asanyarray(data, dtype=self.dtype)

Expand Down
24 changes: 23 additions & 1 deletion lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
# importing anything else.
import iris.tests as tests

import numpy as np

from iris.fileformats.pp import PPDataProxy, SplittableInt
from iris.tests import mock

Expand All @@ -35,7 +37,7 @@ def test_lbpack_SplittableInt(self):
self.assertEqual(proxy.lbpack, lbpack)
self.assertIs(proxy.lbpack, lbpack)

def test_lnpack_raw(self):
def test_lbpack_raw(self):
lbpack = 4321
proxy = PPDataProxy(None, None, None, None,
None, lbpack, None, None)
Expand All @@ -47,6 +49,26 @@ def test_lnpack_raw(self):
self.assertEqual(proxy.lbpack.n3, lbpack // 100 % 10)
self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)

def test__getitem__emptyslice(self):
    # Check that indexing with an "empty" slice will *not* open and read
    # the file.
    # This is necessary because, since Dask 2.0, the "from_array" function
    # takes a zero-length slice of its array argument, to capture array
    # metadata.
    test_shape = (3, 4, 2, 5, 6, 3, 7)
    test_dtype = np.dtype(np.float32)
    proxy = PPDataProxy(shape=test_shape, src_dtype=test_dtype,
                        path=None, offset=None, data_len=None, lbpack=None,
                        boundary_packing=None, mdi=None)

    # Patch 'open' in the module where PPDataProxy looks it up : patching
    # the name in this test module would not intercept the proxy's call,
    # so the "not called" check below could never fail.
    with mock.patch('iris.fileformats.pp.open') as mock_fileopen:
        # Test indexing with "empty" slices, "normal" slices and integers.
        result = proxy[1:3, 2, 0:0, :, 1:1, :100]

    self.assertFalse(mock_fileopen.called)
    self.assertIsInstance(result, np.ndarray)
    self.assertEqual(result.dtype, test_dtype)
    self.assertEqual(result.shape, (2, 0, 5, 0, 3, 7))


# When this file is executed directly as a script, hand over to the
# main() entry point of the iris.tests module imported above as `tests`.
if __name__ == '__main__':
tests.main()