Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH trusting np.ufuncs and np.dtypes #336

Merged
merged 13 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ skops Changelog

v0.7
----

- All public ``numpy`` ufuncs (Universal Functions) and dtypes are trusted by default
by :func:`.io.load`. :pr:`336` by :user:`Omar Arab Oghli <omar-araboghli>`.

v0.6
----
Expand Down
5 changes: 3 additions & 2 deletions docs/persistence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ using :func:`skops.io.get_untrusted_types`:
from skops.io import get_untrusted_types
unknown_types = get_untrusted_types(file="my-model.skops")
print(unknown_types)
['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
['sklearn.metrics._scorer._passthrough_scorer',
'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']

Note that everything in the above list is safe to load. We already have many
Expand All @@ -108,7 +108,8 @@ At the moment, ``skops`` cannot persist arbitrary Python code. This means if
you have custom functions (say, a custom function to be used with
:class:`sklearn.preprocessing.FunctionTransformer`), it will not work. However,
most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
objects having references to functions such as ``numpy.sqrt``.
objects having references to functions or universal functions (ufuncs)
such as ``numpy.sqrt``.

Command Line Interface
######################
Expand Down
10 changes: 8 additions & 2 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from ._audit import Node, get_tree
from ._protocol import PROTOCOL
from ._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
Expand Down Expand Up @@ -206,7 +208,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, default=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
self.children = {}

def _construct(self):
Expand Down Expand Up @@ -289,7 +293,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, PRIMITIVE_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
# We use a bare Node type here since a Node only checks the type in the
# dict using __class__ and __module__ keys.
self.children = {}
Expand Down
7 changes: 6 additions & 1 deletion skops/io/_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._audit import Node, get_tree
from ._general import function_get_state
from ._protocol import PROTOCOL
from ._trusted_types import NUMPY_DTYPE_TYPE_NAMES
from ._utils import LoadContext, SaveContext, get_module, get_state, gettype
from .exceptions import UnsupportedTypeException

Expand Down Expand Up @@ -52,6 +53,8 @@ def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:


class NdArrayNode(Node):
# TODO: NdArrayNode and DtypeNode names lead to confusion, see PR-336
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of sending users on a chase, let's just clear the confusion right away: That this is not only responsible for np arrays, but also for np generics.


def __init__(
self,
state: dict[str, Any],
Expand All @@ -60,7 +63,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
self.type = state["type"]
self.trusted = self._get_trusted(trusted, [np.ndarray])
self.trusted = self._get_trusted(
trusted, [np.ndarray] + NUMPY_DTYPE_TYPE_NAMES # type: ignore
)
if self.type == "numpy":
self.children = {
"content": io.BytesIO(load_context.src.read(state["file"]))
Expand Down
22 changes: 12 additions & 10 deletions skops/io/_trusted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import scipy
from sklearn.utils import all_estimators

from ._utils import get_type_name
from ._utils import get_public_type_names, get_type_name

PRIMITIVES_TYPES = [int, float, str, bool]

Expand All @@ -14,13 +14,15 @@
if get_type_name(estimator_class).startswith("sklearn.")
]

SCIPY_UFUNC_TYPE_NAMES = sorted(
set(
[
get_type_name(getattr(scipy.special, attr))
for attr in dir(scipy.special)
if isinstance(getattr(scipy.special, attr), np.ufunc)
and get_type_name(getattr(scipy.special, attr)).startswith("scipy")
]
)
SCIPY_UFUNC_TYPE_NAMES = get_public_type_names(module=scipy.special, oftype=np.ufunc)

NUMPY_UFUNC_TYPE_NAMES = get_public_type_names(module=np, oftype=np.ufunc)

NUMPY_DTYPE_TYPE_NAMES = sorted(
{
type_name
for dtypes in np.sctypes.values()
for dtype in dtypes # type: ignore
if (type_name := get_type_name(dtype)).startswith("numpy")
}
)
34 changes: 34 additions & 0 deletions skops/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from dataclasses import dataclass, field
from functools import singledispatch
from types import ModuleType
from typing import Any, Type
from zipfile import ZipFile

Expand Down Expand Up @@ -200,3 +201,36 @@ def get_type_paths(types: Any) -> list[str]:
types = [types]

return [get_type_name(t) if not isinstance(t, str) else t for t in types]


def get_public_type_names(module: ModuleType, oftype: Type) -> list[str]:
"""
Helper function that gets the type names of all
public objects of the given ``_type`` from the given ``module``,
which start with the root module name.

Public objects are those that can be read via ``dir(...)``.

Parameters
----------
module: ModuleType
Module under which the public objects are defined.
oftype: Type
The type of the objects.

Returns
----------
type_names_list: list of str
The sorted list of type names, all as strings,
e.g. ``["numpy.core._multiarray_umath.absolute"]``.
"""
module_name, _, _ = module.__name__.rpartition(".")

return sorted(
{
type_name
for attr in dir(module)
if (issubclass((obj := getattr(module, attr)).__class__, oftype))
and ((type_name := get_type_name(obj)).startswith(module_name))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For both lines, are the outer parens necessary? It seems to me that they could be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wasn't easy to use walrus operator in such "complex" statement 😄 Thanks for the catch - removed them!

}
)
22 changes: 0 additions & 22 deletions skops/io/tests/test_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@
from contextlib import suppress
from zipfile import ZipFile

import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from skops.io import dumps, get_untrusted_types
from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr
Expand Down Expand Up @@ -152,25 +149,6 @@ def __init__(self):
assert not hasattr(temp, "b")


def test_complex_pipeline_untrusted_set():
# fmt: off
clf = Pipeline([
("features", FeatureUnion([
("scaler", StandardScaler()),
("sqrt", FunctionTransformer(
func=np.sqrt,
inverse_func=np.square,
)),
])),
("clf", LogisticRegression(random_state=0, solver="liblinear")),
])
# fmt: on

untrusted = get_untrusted_types(data=dumps(clf))
type_names = [x.split(".")[-1] for x in untrusted]
assert type_names == ["sqrt", "square"]


def test_format_object_node():
estimator = LogisticRegression(random_state=0, solver="liblinear")
state = get_state(estimator, SaveContext(None))
Expand Down
32 changes: 28 additions & 4 deletions skops/io/tests/test_persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@
from skops.io import dump, dumps, get_untrusted_types, load, loads
from skops.io._audit import NODE_TYPE_MAPPING, get_tree
from skops.io._sklearn import UNSUPPORTED_TYPES
from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES
from skops.io._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
)
from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype
from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal
Expand Down Expand Up @@ -224,11 +230,17 @@ def _tested_estimators(type_filter=None):


def _tested_ufuncs():
for full_name in SCIPY_UFUNC_TYPE_NAMES:
for full_name in SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES:
module_name, _, ufunc_name = full_name.rpartition(".")
yield gettype(module_name=module_name, cls_or_func=ufunc_name)


def _tested_types():
for full_name in PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES:
module_name, _, type_name = full_name.rpartition(".")
yield gettype(module_name=module_name, cls_or_func=type_name)


def _unsupported_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
if Estimator not in UNSUPPORTED_TYPES:
Expand Down Expand Up @@ -358,15 +370,27 @@ def test_can_persist_fitted(estimator):

assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in SCIPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_DTYPE_TYPE_NAMES for type_ in untrusted_types)
assert_method_outputs_equal(estimator, loaded, X)


@pytest.mark.parametrize("ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES)
@pytest.mark.parametrize(
"ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
def test_can_trust_ufuncs(ufunc):
dumped = dumps(ufunc)
untrusted_types = get_untrusted_types(data=dumped)
assert len(untrusted_types) == 0
# TODO: extend with numpy ufuncs


@pytest.mark.parametrize(
"type_", _tested_types(), ids=PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
def test_can_trust_types(type_):
dumped = dumps(type_)
untrusted_types = get_untrusted_types(data=dumped)
assert len(untrusted_types) == 0


@pytest.mark.parametrize(
Expand Down
8 changes: 2 additions & 6 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,8 @@ def sink(nodes_iter, *args, **kwargs):
nodes_self_unsafe = [node for node in nodes if not node.is_self_safe]
nodes_unsafe = [node for node in nodes if not node.is_safe]

# there are currently 2 unsafe nodes, a numpy int and the custom
# functions. The former might be considered safe in the future, in which
# case this test needs to be changed.
assert len(nodes_self_unsafe) == 2
assert nodes_self_unsafe[0].val == "numpy.int64"
assert nodes_self_unsafe[1].val == "test_visualize.unsafe_function"
assert len(nodes_self_unsafe) == 1
assert nodes_self_unsafe[0].val == "test_visualize.unsafe_function"

# it's not easy to test the number of indirectly unsafe nodes, because
# it will depend on the nesting structure; we can only be sure that it's
Expand Down