Stop PPDataProxy accessing the file when no data is needed. #3659

Merged: 12 commits, Feb 19, 2020
@@ -0,0 +1,5 @@
* Fixed a problem which caused file loads to fetch *all* field data
whenever UM files (PP or Fieldsfiles) were loaded.
With large source files, initial file loads were slow, with large memory usage
before any cube data was even fetched, and large enough files could cause a crash.
The problem occurs only with Dask versions >= 2.0.
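
For context, here is a minimal sketch (not part of this PR) of why the problem appears: under Dask >= 2.0, `dask.array.from_array` indexes the wrapped object with a zero-length slice in every dimension just to capture array metadata. `LoggingArray` below is a hypothetical stand-in for `PPDataProxy` that simply records what it is asked for; the shapes and chunks are arbitrary.

```python
import dask.array as da
import numpy as np


class LoggingArray:
    """Array-like wrapper that records every __getitem__ key it receives."""

    def __init__(self, data):
        self.data = data
        self.requested_keys = []

    @property
    def shape(self):
        return self.data.shape

    @property
    def dtype(self):
        return self.data.dtype

    @property
    def ndim(self):
        return self.data.ndim

    def __getitem__(self, keys):
        # A real PPDataProxy would open and read the source file here.
        self.requested_keys.append(keys)
        return self.data[keys]


wrapped = LoggingArray(np.arange(12, dtype=np.float32).reshape(3, 4))
lazy = da.from_array(wrapped, chunks=(3, 4))

# Under Dask >= 2.0, from_array itself indexes the wrapped object with a
# zero-length slice per dimension, e.g. (slice(0, 0), slice(0, 0)), before
# any compute() is requested -- which previously triggered a full field read.
print(wrapped.requested_keys)
```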
34 changes: 25 additions & 9 deletions lib/iris/fileformats/pp.py
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2010 - 2019, Met Office
# (C) British Crown Copyright 2010 - 2020, Met Office
#
# This file is part of Iris.
#
@@ -644,14 +644,30 @@ def ndim(self):
return len(self.shape)

def __getitem__(self, keys):
with open(self.path, 'rb') as pp_file:
pp_file.seek(self.offset, os.SEEK_SET)
data_bytes = pp_file.read(self.data_len)
data = _data_bytes_to_shaped_array(data_bytes,
self.lbpack,
self.boundary_packing,
self.shape, self.src_dtype,
self.mdi)
def is_emptyslice(key):
return (isinstance(key, slice) and
isinstance(key.start, int) and
key.start == key.stop)

if any(is_emptyslice(key) for key in keys):
# Fake a result for an 'empty' slice: do not open or read the file.
# This is needed because "dask.array.from_array" fetches a no-data
# slice just to 'snapshot' the array metadata.
target_shape = list(self.shape)
for i_dim, key in enumerate(keys):
if is_emptyslice(key):
target_shape[i_dim] = 0
data = np.zeros((1,), dtype=self.dtype)
data = np.broadcast_to(data, target_shape)
else:
with open(self.path, 'rb') as pp_file:
pp_file.seek(self.offset, os.SEEK_SET)
data_bytes = pp_file.read(self.data_len)
data = _data_bytes_to_shaped_array(data_bytes,
self.lbpack,
self.boundary_packing,
self.shape, self.src_dtype,
self.mdi)
data = data.__getitem__(keys)
return np.asanyarray(data, dtype=self.dtype)

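For reference, a standalone sketch of the shortcut used above (plain NumPy; `empty_result` is a hypothetical helper and the shapes are chosen only for illustration): broadcasting a one-element array to a shape containing a zero-length dimension yields a correctly-shaped, zero-size result without allocating or reading any real data.

```python
import numpy as np


def empty_result(full_shape, keys, dtype):
    # Any dimension indexed with an integer start == stop slice is given
    # length 0; other dimensions keep their full length and are then
    # sliced normally afterwards.
    def is_emptyslice(key):
        return (isinstance(key, slice) and
                isinstance(key.start, int) and
                key.start == key.stop)

    target_shape = list(full_shape)
    for i_dim, key in enumerate(keys):
        if is_emptyslice(key):
            target_shape[i_dim] = 0
    # broadcast_to returns a zero-size view: nothing is read or copied.
    data = np.broadcast_to(np.zeros((1,), dtype=dtype), target_shape)
    return data[keys]


print(empty_result((3, 4, 2), (slice(1, 3), slice(0, 0), slice(None)),
                   np.float32).shape)
# -> (2, 0, 2)
```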
27 changes: 25 additions & 2 deletions lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2014 - 2019, Met Office
# (C) British Crown Copyright 2014 - 2020, Met Office
#
# This file is part of Iris.
#
@@ -23,6 +23,8 @@
# importing anything else.
import iris.tests as tests

import numpy as np

from iris.fileformats.pp import PPDataProxy, SplittableInt
from iris.tests import mock

@@ -35,7 +37,7 @@ def test_lbpack_SplittableInt(self):
self.assertEqual(proxy.lbpack, lbpack)
self.assertIs(proxy.lbpack, lbpack)

def test_lnpack_raw(self):
def test_lbpack_raw(self):
lbpack = 4321
proxy = PPDataProxy(None, None, None, None,
None, lbpack, None, None)
@@ -47,6 +49,27 @@ def test_lnpack_raw(self):
self.assertEqual(proxy.lbpack.n3, lbpack // 100 % 10)
self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)

def test__getitem__emptyslice(self):
# Check that fetching an "empty" slice does *not* read from the file.
# This is necessary because, since Dask 2.0, the "from_array" function
# takes a zero-length slice of its array argument,
# to capture array metadata.
test_shape = (3, 4, 2, 5, 6, 3, 7)
test_dtype = np.dtype(np.float32)
proxy = PPDataProxy(shape=test_shape, src_dtype=test_dtype,
path=None, offset=None, data_len=None, lbpack=None,
boundary_packing=None, mdi=None)

# Patch the built-in 'open' as looked up from within iris.fileformats.pp,
# so that any attempt to open the file is intercepted by the mock.
builtin_open_func_name = 'iris.fileformats.pp.open'
with self.patch(builtin_open_func_name) as mock_fileopen:
# Test indexing with "empty" slices, "normal" slices and integers.
result = proxy[1:3, 2, 0:0, :, 1:1, :100]

self.assertEqual(mock_fileopen.called, False)
self.assertIsInstance(result, np.ndarray)
self.assertEqual(result.dtype, test_dtype)
self.assertEqual(result.shape, (2, 0, 5, 0, 3, 7))


if __name__ == '__main__':
tests.main()
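
A rough sketch of how the change shows up at user level (the path `big_file.pp` is hypothetical, and no particular timings are implied); the point is simply that the initial load no longer touches any field data:

```python
import iris

# With this fix, the initial load only scans the field headers; no field
# data is read, however large the source file is.
cubes = iris.load('big_file.pp')
cube = cubes[0]
print(cube.has_lazy_data())   # True -- the data is still deferred

# Field data is read from the file only when it is actually requested.
data = cube.data
```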