Skip to content

Commit

Permalink
ENH trusting np.ufuncs and np.dtypes (#336)
Browse files Browse the repository at this point in the history
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
  • Loading branch information
3 people authored May 15, 2023
1 parent e67fcad commit c68141e
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 55 deletions.
2 changes: 2 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ v0.7
`Adrin Jalali`_.
- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by
`Benjamin Bossan`_.
- All public ``numpy`` ufuncs (Universal Functions) and dtypes are trusted by default
by :func:`.io.load`. :pr:`336` by :user:`Omar Arab Oghli <omar-araboghli>`.

v0.6
----
Expand Down
5 changes: 3 additions & 2 deletions docs/persistence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ using :func:`skops.io.get_untrusted_types`:
from skops.io import get_untrusted_types
unknown_types = get_untrusted_types(file="my-model.skops")
print(unknown_types)
['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
['sklearn.metrics._scorer._passthrough_scorer',
'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']
Note that everything in the above list is safe to load. We already have many
Expand All @@ -108,7 +108,8 @@ At the moment, ``skops`` cannot persist arbitrary Python code. This means if
you have custom functions (say, a custom function to be used with
:class:`sklearn.preprocessing.FunctionTransformer`), it will not work. However,
most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
objects having references to functions such as ``numpy.sqrt``.
objects having references to functions or universal functions (ufuncs)
such as ``numpy.sqrt``.

Compression
~~~~~~~~~~~
Expand Down
10 changes: 8 additions & 2 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from ._audit import Node, get_tree
from ._protocol import PROTOCOL
from ._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
Expand Down Expand Up @@ -210,7 +212,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, default=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
self.children = {}

def _construct(self):
Expand Down Expand Up @@ -293,7 +297,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, PRIMITIVE_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
# We use a bare Node type here since a Node only checks the type in the
# dict using __class__ and __module__ keys.
self.children = {}
Expand Down
9 changes: 8 additions & 1 deletion skops/io/_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._audit import Node, get_tree
from ._general import function_get_state
from ._protocol import PROTOCOL
from ._trusted_types import NUMPY_DTYPE_TYPE_NAMES
from ._utils import LoadContext, SaveContext, get_module, get_state, gettype
from .exceptions import UnsupportedTypeException

Expand Down Expand Up @@ -52,6 +53,10 @@ def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:


class NdArrayNode(Node):
# TODO: NdArrayNode is responsible not only for np.ndarray objects but also
# for np.generic scalars, which is why it overlaps confusingly with DTypeNode.
# See PR #336.

def __init__(
self,
state: dict[str, Any],
Expand All @@ -60,7 +65,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
self.type = state["type"]
self.trusted = self._get_trusted(trusted, [np.ndarray])
self.trusted = self._get_trusted(
trusted, [np.ndarray] + NUMPY_DTYPE_TYPE_NAMES # type: ignore
)
if self.type == "numpy":
self.children = {
"content": io.BytesIO(load_context.src.read(state["file"]))
Expand Down
22 changes: 12 additions & 10 deletions skops/io/_trusted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import scipy
from sklearn.utils import all_estimators

from ._utils import get_type_name
from ._utils import get_public_type_names, get_type_name

PRIMITIVES_TYPES = [int, float, str, bool]

Expand All @@ -14,13 +14,15 @@
if get_type_name(estimator_class).startswith("sklearn.")
]

SCIPY_UFUNC_TYPE_NAMES = sorted(
set(
[
get_type_name(getattr(scipy.special, attr))
for attr in dir(scipy.special)
if isinstance(getattr(scipy.special, attr), np.ufunc)
and get_type_name(getattr(scipy.special, attr)).startswith("scipy")
]
)
SCIPY_UFUNC_TYPE_NAMES = get_public_type_names(module=scipy.special, oftype=np.ufunc)

NUMPY_UFUNC_TYPE_NAMES = get_public_type_names(module=np, oftype=np.ufunc)

# Sorted, de-duplicated type names of every scalar type listed in
# ``np.sctypes`` whose type name starts with "numpy". The prefix check
# presumably drops non-numpy entries (e.g. Python builtins) that
# ``np.sctypes`` may contain -- TODO confirm.
# NOTE(review): ``np.sctypes`` appears to be unavailable in NumPy >= 2.0;
# confirm the supported numpy version range before relying on it.
NUMPY_DTYPE_TYPE_NAMES = sorted(
{
type_name
for dtypes in np.sctypes.values()
for dtype in dtypes # type: ignore
if (type_name := get_type_name(dtype)).startswith("numpy")
}
)
34 changes: 34 additions & 0 deletions skops/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from dataclasses import dataclass, field
from functools import singledispatch
from types import ModuleType
from typing import Any, Type
from zipfile import ZipFile

Expand Down Expand Up @@ -200,3 +201,36 @@ def get_type_paths(types: Any) -> list[str]:
types = [types]

return [get_type_name(t) if not isinstance(t, str) else t for t in types]


def get_public_type_names(module: ModuleType, oftype: Type) -> list[str]:
    """Get the type names of all public objects of a given type in a module.

    Only objects whose type name lives under the *root* package of ``module``
    are returned, e.g. ``numpy`` for ``numpy`` and ``scipy`` for
    ``scipy.special``.

    Public objects are those that can be read via ``dir(...)``.

    Parameters
    ----------
    module: ModuleType
        Module under which the public objects are defined.

    oftype: Type
        The type of the objects.

    Returns
    -------
    type_names_list: list of str
        The sorted list of type names, all as strings,
        e.g. ``["numpy.core._multiarray_umath.absolute"]``.
    """
    # The root package is the first dotted component of the module name.
    # ``rpartition`` would give "" for a top-level module such as ``numpy``
    # (making the prefix check match everything) and "a.b" for "a.b.c".
    root_name = module.__name__.partition(".")[0]
    # Require a dot after the root so that e.g. "numpy_foo.bar" is not
    # mistaken for something living under "numpy".
    root_prefix = root_name + "."

    type_names = set()
    for attr in dir(module):
        obj = getattr(module, attr)
        if not isinstance(obj, oftype):
            continue
        type_name = get_type_name(obj)
        if type_name == root_name or type_name.startswith(root_prefix):
            type_names.add(type_name)

    return sorted(type_names)
22 changes: 0 additions & 22 deletions skops/io/tests/test_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@
from contextlib import suppress
from zipfile import ZipFile

import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from skops.io import dumps, get_untrusted_types
from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr
Expand Down Expand Up @@ -152,25 +149,6 @@ def __init__(self):
assert not hasattr(temp, "b")


def test_complex_pipeline_untrusted_set():
# fmt: off
clf = Pipeline([
("features", FeatureUnion([
("scaler", StandardScaler()),
("sqrt", FunctionTransformer(
func=np.sqrt,
inverse_func=np.square,
)),
])),
("clf", LogisticRegression(random_state=0, solver="liblinear")),
])
# fmt: on

untrusted = get_untrusted_types(data=dumps(clf))
type_names = [x.split(".")[-1] for x in untrusted]
assert type_names == ["sqrt", "square"]


def test_format_object_node():
estimator = LogisticRegression(random_state=0, solver="liblinear")
state = get_state(estimator, SaveContext(None))
Expand Down
4 changes: 0 additions & 4 deletions skops/io/tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def trusted(self):
"lightgbm.sklearn.LGBMClassifier",
"lightgbm.sklearn.LGBMRegressor",
"lightgbm.sklearn.LGBMRanker",
"numpy.int32",
"numpy.int64",
"sklearn.preprocessing._label.LabelEncoder",
]

Expand Down Expand Up @@ -329,8 +327,6 @@ def trusted(self):
# TODO: adjust once more types are trusted by default
return [
"builtins.bytes",
"numpy.float32",
"numpy.float64",
"catboost.core.CatBoostClassifier",
"catboost.core.CatBoostRegressor",
"catboost.core.CatBoostRanker",
Expand Down
32 changes: 28 additions & 4 deletions skops/io/tests/test_persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,13 @@
from skops.io import dump, dumps, get_untrusted_types, load, loads
from skops.io._audit import NODE_TYPE_MAPPING, get_tree
from skops.io._sklearn import UNSUPPORTED_TYPES
from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES
from skops.io._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
)
from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype
from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal
Expand Down Expand Up @@ -225,11 +231,17 @@ def _tested_estimators(type_filter=None):


def _tested_ufuncs():
for full_name in SCIPY_UFUNC_TYPE_NAMES:
for full_name in SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES:
module_name, _, ufunc_name = full_name.rpartition(".")
yield gettype(module_name=module_name, cls_or_func=ufunc_name)


def _tested_types():
    """Yield the object ``gettype`` returns for each primitive and numpy dtype name."""
    candidate_names = PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
    for qualified_name in candidate_names:
        # Split "pkg.mod.Name" into the module path and the trailing name.
        module_path, _sep, short_name = qualified_name.rpartition(".")
        yield gettype(module_name=module_path, cls_or_func=short_name)


def _unsupported_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
if Estimator not in UNSUPPORTED_TYPES:
Expand Down Expand Up @@ -359,15 +371,27 @@ def test_can_persist_fitted(estimator):

assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in SCIPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_DTYPE_TYPE_NAMES for type_ in untrusted_types)
assert_method_outputs_equal(estimator, loaded, X)


@pytest.mark.parametrize("ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES)
@pytest.mark.parametrize(
"ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
def test_can_trust_ufuncs(ufunc):
dumped = dumps(ufunc)
untrusted_types = get_untrusted_types(data=dumped)
assert len(untrusted_types) == 0
# TODO: extend with numpy ufuncs


@pytest.mark.parametrize(
    "type_", _tested_types(), ids=PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
def test_can_trust_types(type_):
    # Dumping a primitive type or numpy dtype must not produce any entry in
    # the list of untrusted types.
    serialized = dumps(type_)
    reported = get_untrusted_types(data=serialized)
    assert len(reported) == 0


@pytest.mark.parametrize(
Expand Down
13 changes: 3 additions & 10 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,26 +101,19 @@ def sink(nodes_iter, *args, **kwargs):
nodes_self_unsafe = [node for node in nodes if not node.is_self_safe]
nodes_unsafe = [node for node in nodes if not node.is_safe]

# there are currently 2 unsafe nodes, a numpy int and the custom
# functions. The former might be considered safe in the future, in which
# case this test needs to be changed.
assert len(nodes_self_unsafe) == 2
assert nodes_self_unsafe[0].val == "numpy.int64"
assert nodes_self_unsafe[1].val == "test_visualize.unsafe_function"
assert len(nodes_self_unsafe) == 1
assert nodes_self_unsafe[0].val == "test_visualize.unsafe_function"

# it's not easy to test the number of indirectly unsafe nodes, because
# it will depend on the nesting structure; we can only be sure that it's
# more than 2, and one of them should be the FunctionTransformer
assert len(nodes_unsafe) > 2
assert any("FunctionTransformer" in node.val for node in nodes_unsafe)

@pytest.mark.parametrize(
"trusted", [True, ["numpy.int64", "test_visualize.unsafe_function"]]
)
@pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]])
def test_all_nodes_trusted(self, pipeline, trusted, capsys):
# The pipeline contains untrusted type(s), but if we pass trusted=True,
# it is not considered untrusted anymore
# TODO: remove numpy.int64 from trusted once it's trusted by default
file = sio.dumps(pipeline)
sio.visualize(file, show="untrusted", trusted=trusted)
expected = "root: sklearn.pipeline.Pipeline"
Expand Down

0 comments on commit c68141e

Please sign in to comment.