
Stop PPDataProxy accessing the file when no data is needed. #3659

Merged: 12 commits, Feb 19, 2020
@@ -0,0 +1,5 @@
+* Fixed a problem which was causing file loads to fetch *all* field data
+  whenever UM files (PP or Fieldsfiles) were loaded.
+  With large source files, initial file loads are slow, with large memory usage
+  before any cube data is even fetched. Large enough files will cause a crash.
+  The problem occurs only with Dask versions >= 2.0.
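For background (an aside, not part of the patch): with Dask >= 2.0, `dask.array.from_array` probes the wrapped object with an all-empty slice such as `[0:0, 0:0]` to capture its metadata, and for a PP data proxy that probe used to go through the normal read path. A minimal sketch of the triggering behaviour, using a hypothetical stand-in proxy class (illustrative only, not Iris code):

```python
import dask.array as da
import numpy as np


class RecordingProxy:
    """Stand-in for a lazy data proxy: records every __getitem__ request."""
    def __init__(self, shape, dtype):
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.ndim = len(shape)
        self.requests = []

    def __getitem__(self, keys):
        # Record the request, then fake a read by slicing an array of zeros.
        self.requests.append(keys)
        return np.zeros(self.shape, dtype=self.dtype)[keys]


proxy = RecordingProxy((3, 4), np.float32)
lazy = da.from_array(proxy, chunks=(3, 4))

# With Dask >= 2.0 this typically records a zero-length request such as
# (slice(0, 0, None), slice(0, 0, None)), made purely to snapshot the metadata.
print(proxy.requests)
```

Before this fix, that single metadata probe triggered a full read of the field data, which is why merely loading a large PP file could be slow or exhaust memory.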
36 changes: 27 additions & 9 deletions lib/iris/fileformats/pp.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2010 - 2019, Met Office
+# (C) British Crown Copyright 2010 - 2020, Met Office
 #
 # This file is part of Iris.
 #
@@ -644,14 +644,32 @@ def ndim(self):
         return len(self.shape)
 
     def __getitem__(self, keys):
-        with open(self.path, 'rb') as pp_file:
-            pp_file.seek(self.offset, os.SEEK_SET)
-            data_bytes = pp_file.read(self.data_len)
-            data = _data_bytes_to_shaped_array(data_bytes,
-                                               self.lbpack,
-                                               self.boundary_packing,
-                                               self.shape, self.src_dtype,
-                                               self.mdi)
+        # Convert a single key to a 1-tuple, for empty-slice testing.
+        if isinstance(keys, tuple):
+            keys_tuple = keys
+        else:
+            keys_tuple = (keys,)
+
+        if any(key == slice(0, 0) for key in keys_tuple):
+            # An 'empty' slice has no data : do not open + read the file !!
+            # In these cases, return a 'fake' data array instead.
+            # Needed because, for Dask >= 2.0, "dask.array.from_array" does a
+            # fetch like [0:0, 0:0, ...], to 'snapshot' the array metadata.
+            target_shape = list(self.shape)
+            for i_dim, key in enumerate(keys_tuple):
+                if key == slice(0, 0):
+                    target_shape[i_dim] = 0
+            data = np.zeros((1,), dtype=self.dtype)
+            data = np.broadcast_to(data, target_shape)
+        else:
+            with open(self.path, 'rb') as pp_file:
+                pp_file.seek(self.offset, os.SEEK_SET)
+                data_bytes = pp_file.read(self.data_len)
+                data = _data_bytes_to_shaped_array(data_bytes,
+                                                   self.lbpack,
+                                                   self.boundary_packing,
+                                                   self.shape, self.src_dtype,
+                                                   self.mdi)
         data = data.__getitem__(keys)
         return np.asanyarray(data, dtype=self.dtype)
 
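As a side note on the empty-slice branch above: broadcasting a single zero to a shape that contains zeros yields a zero-size array of the correct dtype and dimensionality without reading anything from disk. A small stand-alone NumPy illustration (not Iris code; the names and shapes are just examples):

```python
import numpy as np

full_shape = (3, 4)            # shape of the full, never-read field
dtype = np.dtype(np.float32)   # dtype the proxy reports
empty_dims = (0, 1)            # dimensions addressed by a 0:0 slice

# Zero out the dimensions hit by an empty slice; keep the others unchanged.
target_shape = [0 if i_dim in empty_dims else length
                for i_dim, length in enumerate(full_shape)]

# A size-1 array broadcasts to any shape, including one with zero-length axes,
# so this produces an empty (0, 0) float32 array with no file access at all.
placeholder = np.broadcast_to(np.zeros((1,), dtype=dtype), target_shape)
print(placeholder.shape, placeholder.size)   # -> (0, 0) 0
```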
125 changes: 123 additions & 2 deletions lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2014 - 2019, Met Office
+# (C) British Crown Copyright 2014 - 2020, Met Office
 #
 # This file is part of Iris.
 #
@@ -19,10 +19,14 @@
 from __future__ import (absolute_import, division, print_function)
 from six.moves import (filter, input, map, range, zip)  # noqa
 
+import six
+
 # Import iris.tests first so that some things can be initialised before
 # importing anything else.
 import iris.tests as tests
 
+import numpy as np
+
 from iris.fileformats.pp import PPDataProxy, SplittableInt
 from iris.tests import mock
 
@@ -35,7 +39,7 @@ def test_lbpack_SplittableInt(self):
         self.assertEqual(proxy.lbpack, lbpack)
         self.assertIs(proxy.lbpack, lbpack)
 
-    def test_lnpack_raw(self):
+    def test_lbpack_raw(self):
         lbpack = 4321
         proxy = PPDataProxy(None, None, None, None,
                             None, lbpack, None, None)
@@ -48,5 +52,122 @@ def test_lnpack_raw(self):
         self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)
 
 
+class SliceTranslator():
+    """
+    Class to translate an array-indexing expression into a tuple of keys.
+
+    An instance just returns the argument of its __getitem__ call.
+
+    """
+    def __getitem__(self, keys):
+        return keys
+
+
+# A multidimensional-indexable object that returns its index keys, so we can
+# use multidimensional-indexing notation to specify a slicing expression.
+Slices = SliceTranslator()
+
+
+class Test__getitem__slicing(tests.IrisTest):
+    def _check_slicing(self, test_shape, indices, result_shape,
+                       data_was_fetched=True):
+        # Check behaviour of the getitem call with specific slicings.
+        # Especially: check cases where a fetch does *not* read from the file.
+        # This is necessary because, since Dask 2.0, the "from_array" function
+        # takes a zero-length slice of its array argument, to capture array
+        # metadata, and in those cases we want to avoid file access.
+        test_dtype = np.dtype(np.float32)
+        proxy = PPDataProxy(shape=test_shape, src_dtype=test_dtype,
+                            path=None, offset=None, data_len=None,
+                            lbpack=0,  # Note: a 'real' value is needed.
+                            boundary_packing=None, mdi=None)
+
+        # Mock out the file-open call, to see if the file would be read.
+        if six.PY2:
+            builtin_open_func_name = '__builtin__.open'
+        else:
+            builtin_open_func_name = 'builtins.open'
+        mock_fileopen = self.patch(builtin_open_func_name)
+
+        # Also mock out the 'databytes_to_shaped_array' call, to fake minimal
+        # operation in the cases where file-open *does* get called.
+        fake_data = np.zeros(test_shape, dtype=test_dtype)
+        self.patch('iris.fileformats.pp._data_bytes_to_shaped_array',
+                   mock.MagicMock(return_value=fake_data))
+
+        # Test the requested indexing operation.
+        result = proxy.__getitem__(indices)
+
+        # Check the behaviour and results were as expected.
+        self.assertEqual(mock_fileopen.called, data_was_fetched)
+        self.assertIsInstance(result, np.ndarray)
+        self.assertEqual(result.dtype, test_dtype)
+        self.assertEqual(result.shape, result_shape)
+
+    def test_slicing_1d_normal(self):
+        # A 'normal' 1d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[1:10],
+            result_shape=(2,),
+            data_was_fetched=True)
+
+    def test_slicing_1d_empty(self):
+        # A 1d testcase with an empty slicing.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[0:0],
+            result_shape=(0,),
+            data_was_fetched=False)
+
+    def test_slicing_1d_unrecognised_empty(self):
+        # When empty slice is not "0:0" : for now, we *don't* detect these.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[1:1],
+            result_shape=(0,),
+            data_was_fetched=True)
+
+    def test_slicing_2d_normal(self):
+        # A 2d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[2, :3],
+            result_shape=(3,),
+            data_was_fetched=True)
+
+    def test_slicing_2d_allempty(self):
+        # A 2d testcase with an empty slice.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0, 0:0],
+            result_shape=(0, 0),
+            data_was_fetched=False)
+
+    def test_slicing_2d_empty_dim0(self):
+        # A 2d testcase with an empty slice.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0],
+            result_shape=(0, 4),
+            data_was_fetched=False)
+
+    def test_slicing_2d_empty_dim1(self):
+        # A 2d testcase with an empty slice, and an integer index.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[1, 0:0],
+            result_shape=(0,),
+            data_was_fetched=False)
+
+    def test_slicing_complex(self):
+        # Multiple dimensions with multiple empty slices.
+        self._check_slicing(
+            test_shape=(3, 4, 2, 5, 6, 3, 7),
+            indices=Slices[1:3, 2, 0:0, :, 1:1, :100],
+            result_shape=(2, 0, 5, 0, 3, 7),
+            data_was_fetched=False)
+
+
 if __name__ == '__main__':
     tests.main()
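A closing note on the `Slices` helper used in the tests above: because its `__getitem__` simply returns its argument, ordinary indexing syntax can be captured as a plain tuple of keys and handed to another object's `__getitem__` later. A rough stand-alone illustration of the same idea (not part of the patch):

```python
import numpy as np


class SliceTranslator:
    """Return the raw key(s) passed to __getitem__, instead of indexing anything."""
    def __getitem__(self, keys):
        return keys


Slices = SliceTranslator()

# Indexing notation is captured as a tuple of slices / integers ...
print(Slices[1:3, 2, 0:0])   # (slice(1, 3, None), 2, slice(0, 0, None))

# ... which can then be applied to any indexable object afterwards.
data = np.arange(12).reshape(3, 4)
keys = Slices[0:0, :]
print(data[keys].shape)      # (0, 4)
```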