From 45862450b4e00eaadd1bbc816c5fc63ecf2740e4 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 3 Jul 2018 18:40:43 -0400
Subject: [PATCH 01/20] ENH: add integer-na support via an ExtensionArray

closes #20700
closes #20747
---
 doc/source/whatsnew/v0.24.0.txt               |  57 ++
 pandas/core/arrays/__init__.py                |   3 +
 pandas/core/arrays/base.py                    |  12 +-
 pandas/core/arrays/categorical.py             |   4 +-
 pandas/core/arrays/integer.py                 | 563 ++++
 pandas/core/dtypes/cast.py                    |   3 +-
 pandas/core/dtypes/concat.py                  |   1 +
 pandas/core/indexes/base.py                   |  24 +-
 pandas/core/missing.py                        |   3 +-
 pandas/core/ops.py                            |  78 ++-
 pandas/core/series.py                         |   2 +-
 pandas/tests/extension/base/__init__.py       |   2 +-
 pandas/tests/extension/base/getitem.py        |   6 +-
 pandas/tests/extension/base/ops.py            |  20 +-
 pandas/tests/extension/base/reshaping.py      |  24 +-
 pandas/tests/extension/decimal/array.py       |  11 +-
 .../tests/extension/decimal/test_decimal.py   |   8 +-
 pandas/tests/extension/integer/__init__.py    |   0
 .../tests/extension/integer/test_integer.py   | 604 ++++++
 pandas/tests/extension/json/array.py          |  13 +-
 pandas/tests/extension/json/test_json.py      |  15 +-
 21 files changed, 1386 insertions(+), 67 deletions(-)
 create mode 100644 pandas/core/arrays/integer.py
 create mode 100644 pandas/tests/extension/integer/__init__.py
 create mode 100644 pandas/tests/extension/integer/test_integer.py

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index ed4022d422b4d..0c9b0ce98ede0 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -8,6 +8,7 @@ v0.24.0 (Month XX, 2018)
 New features
 ~~~~~~~~~~~~

+- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling appending to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)

 .. _whatsnew_0240.enhancements.extension_array_operators:

@@ -26,6 +27,61 @@ See the :ref:`ExtensionArray Operator Support
 ` documentation section
 for details on both ways of adding operator support.

+.. _whatsnew_0240.enhancements.intna:
+
+Integer NA Support
+^^^^^^^^^^^^^^^^^^
+
+Pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of ``ExtensionTypes``. Here is an example of the usage.
+
+We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
+marker of ``np.nan`` will infer an integer dtype. The display of the ``Series`` will also use ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`)
+
+.. ipython:: python
+
+   s = pd.Series([1, 2, np.nan], dtype='Int64')
+   s
+
+
+Operations on these dtypes will propagate ``NaN`` as in other pandas operations.
+
+.. ipython:: python
+
+   # arithmetic
+   s + 1
+
+   # comparison
+   s == 1
+
+   # indexing
+   s.iloc[1:3]
+
+   # operate with other dtypes
+   s + s.iloc[1:3]
+
+   # coerce when needed
+   s + 0.01
+
+These dtypes can operate as part of ``DataFrames``.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')})
+   df
+   df.dtypes
+
+
+These dtypes can be merged, reshaped, and cast.
+
+.. ipython:: python
+
+   pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
+   df['A'].astype(float)
+
+.. warning::
+
+   The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
+
 .. 
_whatsnew_0240.enhancements.read_html: ``read_html`` Enhancements @@ -182,6 +238,7 @@ Previous Behavior: ExtensionType Changes ^^^^^^^^^^^^^^^^^^^^^ +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1b8a43d4293a5..6e8dafd125bfc 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,6 +1,9 @@ from .base import (ExtensionArray, # noqa + ExtensionOpsMixin, ExtensionScalarOpsMixin) from .categorical import Categorical # noqa from .datetimes import DatetimeArrayMixin # noqa from .period import PeriodArrayMixin # noqa from .timedelta import TimedeltaArrayMixin # noqa +from .integer import ( # noqa + IntegerArray, to_integer_array) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fe4e461b0bd4f..c0697dd29e4d0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -12,8 +12,8 @@ from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv from pandas.compat import set_function_name, PY3 -from pandas.core.dtypes.common import is_list_like from pandas.core import ops +from pandas.core.dtypes.common import is_list_like _not_implemented_message = "{} does not implement {}." @@ -88,7 +88,7 @@ class ExtensionArray(object): # Constructors # ------------------------------------------------------------------------ @classmethod - def _from_sequence(cls, scalars, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars. Parameters @@ -96,6 +96,8 @@ def _from_sequence(cls, scalars, copy=False): scalars : Sequence Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. + dtype : Dtype, optional + consruct for this particular dtype copy : boolean, default False if True, copy the underlying data Returns @@ -378,7 +380,7 @@ def fillna(self, value=None, method=None, limit=None): func = pad_1d if method == 'pad' else backfill_1d new_values = func(self.astype(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values) + new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value new_values = self.copy() @@ -407,7 +409,7 @@ def unique(self): from pandas import unique uniques = unique(self.astype(object)) - return self._from_sequence(uniques) + return self._from_sequence(uniques, dtype=self.dtype) def _values_for_factorize(self): # type: () -> Tuple[ndarray, Any] @@ -559,7 +561,7 @@ def take(self, indices, allow_fill=False, fill_value=None): result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) - return self._from_sequence(result) + return self._from_sequence(result, dtype=self.dtype) """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. 
When passed `fill_value=None`, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 83fd0ab499283..ceaa41b98ae7e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -487,8 +487,8 @@ def _constructor(self): return Categorical @classmethod - def _from_sequence(cls, scalars): - return Categorical(scalars) + def _from_sequence(cls, scalars, dtype=None, copy=False): + return Categorical(scalars, dtype=dtype) def copy(self): """ Copy constructor. """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py new file mode 100644 index 0000000000000..a7c0518bb4ed9 --- /dev/null +++ b/pandas/core/arrays/integer.py @@ -0,0 +1,563 @@ +import sys +import warnings +import copy +import numpy as np + +from pandas.compat import u +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass +from pandas.util._decorators import cache_readonly +from pandas.compat import set_function_name +from pandas.api.types import (is_integer, is_scalar, is_float, + is_float_dtype, is_integer_dtype, + is_object_dtype, + is_list_like, + infer_dtype) +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import registry +from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.cast import maybe_downcast_to_dtype + +from pandas.io.formats.printing import ( + format_object_summary, format_object_attrs, default_pprint) + + +class IntegerDtype(ExtensionDtype): + type = None + na_value = np.nan + + @cache_readonly + def is_signed_integer(self): + return self.kind == 'i' + + @cache_readonly + def is_unsigned_integer(self): + return self.kind == 'u' + + @cache_readonly + def numpy_dtype(self): + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self): + return self.numpy_dtype.kind + + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype + + Returns + ------- + type + """ + return IntegerArray + + @classmethod + def construct_from_string(cls, string): + """ + Construction from a string, raise a TypeError if not + possible + """ + if string == cls.name: + return cls() + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +def to_integer_array(values): + """ + Parameters + ---------- + values : 1D list-like + + Returns + ------- + infer and return an integer array + + Raises + ------ + TypeError if incompatible types + """ + values = np.array(values, copy=False) + try: + dtype = _dtypes[str(values.dtype)] + except KeyError: + if is_float_dtype(values): + return IntegerArray(values) + + raise TypeError("Incompatible dtype for {}".format(values.dtype)) + return IntegerArray(values, dtype=dtype, copy=False) + + +def coerce_to_array(values, dtype, mask=None, copy=False): + """ + Coerce the input values array to numpy arrays with a mask + + Parameters + ---------- + values : 1D list-like + dtype : integer dtype + mask : boolean 1D array, optional + copy : boolean, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + + if isinstance(values, IntegerArray): + values, mask = values.data, values.mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + values = np.array(values, copy=copy) + if is_object_dtype(values): + inferred_type = infer_dtype(values) + if inferred_type not in ['floating', 'integer', + 'mixed-integer', 
'mixed-integer-float']:
+            raise TypeError("{} cannot be converted to an IntegerDtype".format(
+                values.dtype))
+
+    elif not (is_integer_dtype(values) or is_float_dtype(values)):
+        raise TypeError("{} cannot be converted to an IntegerDtype".format(
+            values.dtype))
+
+    if mask is None:
+        mask = isna(values)
+    else:
+        assert len(mask) == len(values)
+
+    if not values.ndim == 1:
+        raise TypeError("values must be a 1D list-like")
+    if not mask.ndim == 1:
+        raise TypeError("mask must be a 1D list-like")
+
+    # avoid float->int numpy conversion issues
+    if is_object_dtype(values):
+        mask |= isna(values)
+
+    # infer dtype if needed
+    if dtype is None:
+        if is_integer_dtype(values):
+            dtype = values.dtype
+        else:
+            dtype = np.dtype('int64')
+    else:
+        dtype = dtype.type
+
+    # we copy as we need to coerce here
+    if mask.any():
+        values = values.copy()
+        values[mask] = 1
+
+        values = values.astype(dtype)
+    else:
+        values = values.astype(dtype, copy=False)
+
+    return values, mask
+
+
+class IntegerArray(ExtensionArray, ExtensionOpsMixin):
+    """
+    We represent an IntegerArray with 2 numpy arrays
+    - data: contains a numpy integer array of the appropriate dtype
+    - mask: a boolean array holding a mask on the data, True is missing
+    """
+
+    @cache_readonly
+    def dtype(self):
+        return _dtypes[str(self.data.dtype)]
+
+    def __init__(self, values, mask=None, dtype=None, copy=False):
+        self.data, self.mask = coerce_to_array(
+            values, dtype=dtype, mask=mask, copy=copy)
+
+    @classmethod
+    def _from_sequence(cls, scalars, mask=None, dtype=None, copy=False):
+        return cls(scalars, mask=mask, dtype=dtype, copy=copy)
+
+    @classmethod
+    def _from_factorized(cls, values, original):
+        return cls(values, dtype=original.dtype)
+
+    def __getitem__(self, item):
+        if is_integer(item):
+            if self.mask[item]:
+                return self.dtype.na_value
+            return self.data[item]
+        return type(self)(self.data[item],
+                          mask=self.mask[item],
+                          dtype=self.dtype)
+
+    def _coerce_to_ndarray(self):
+        """ coerce to an ndarray, preserving my scalar types """
+
+        # TODO(jreback) make this better
+        data = self.data.astype(object)
+        data[self.mask] = self._na_value
+        return data
+
+    def __array__(self, dtype=None):
+        """
+        the array interface, return my values
+        We return an object array here to preserve our scalar values
+        """
+        return self._coerce_to_ndarray()
+
+    def __iter__(self):
+        """Iterate over elements of the array.
+
+        """
+        # This needs to be implemented so that pandas recognizes extension
+        # arrays as list-like. The default implementation makes successive
+        # calls to ``__getitem__``, which may be slower than necessary.
+        for i in range(len(self)):
+            if self.mask[i]:
+                yield self.dtype.na_value
+            else:
+                yield self.data[i]
+
+    def _formatting_values(self):
+        # type: () -> np.ndarray
+        return self._coerce_to_ndarray()
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        from pandas.api.extensions import take
+
+        # we always fill with 1 internally
+        # to avoid upcasting
+        data_fill_value = 1 if isna(fill_value) else fill_value
+        result = take(self.data, indexer, fill_value=data_fill_value,
+                      allow_fill=allow_fill)
+
+        mask = take(self.mask, indexer, fill_value=True,
+                    allow_fill=allow_fill)
+
+        # if we are filling
+        # we only fill where the indexer is null
+        # not existing missing values
+        # TODO(jreback) what if we have a non-na float as a fill value?
+ if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask=mask, dtype=self.dtype) + + def copy(self, deep=False): + data, mask = self.data, self.mask + if deep: + data = copy.deepcopy(data) + mask = copy.deepcopy(mask) + else: + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, dtype=self.dtype, copy=False) + + def __setitem__(self, key, value): + _is_scalar = is_scalar(value) + if _is_scalar: + value = [value] + value, mask = coerce_to_array(value, dtype=self.dtype) + + if _is_scalar: + value = value[0] + mask = mask[0] + + self.data[key] = value + self.mask[key] = mask + + def __len__(self): + return len(self.data) + + def __repr__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + klass = self.__class__.__name__ + data = format_object_summary(self, default_pprint, False) + attrs = format_object_attrs(self) + space = " " + + prepr = (u(",%s") % + space).join(u("%s=%s") % (k, v) for k, v in attrs) + + res = u("%s(%s%s)") % (klass, data, prepr) + + return res + + @property + def nbytes(self): + return self.data.nbytes + self.mask.nbytes + + def isna(self): + return self.mask + + @property + def _na_value(self): + return np.nan + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + mask = np.concatenate([x.mask for x in to_concat]) + return cls(data, mask=mask, dtype=to_concat[0].dtype) + + def astype(self, dtype, copy=True): + """Cast to a NumPy array with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + + Raises + ------ + TypeError + if incompatible type with an IntegerDtype, equivalent of same_kind + casting + """ + + # if we are astyping to an existing IntegerDtype we can fastpath + if isinstance(dtype, IntegerDtype): + result = self.data.astype(dtype.numpy_dtype, + casting='same_kind', copy=False) + return type(self)(result, mask=self.mask, + dtype=dtype, copy=False) + + # coerce + data = self._coerce_to_ndarray() + return data.astype(dtype=dtype, copy=False) + + @property + def _ndarray_values(self): + # type: () -> np.ndarray + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return self.data + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each category. + + Every category will have an entry, even those with a count of 0. + + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + + """ + + from pandas import Index, Series + + # compute counts on the data with no nans + data = self.data[~self.mask] + value_counts = Index(data).value_counts() + array = value_counts.values + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.astype(object) + + # if we want nans, count the mask + if not dropna: + + # TODO(extension) + # appending to an Index *always* infers + # w/o passing the dtype + array = np.append(array, [self.mask.sum()]) + index = Index(np.concatenate( + [index.values, + np.array([np.nan], dtype=object)]), dtype=object) + + return Series(array, index=index) + + def _values_for_argsort(self): + # type: () -> ndarray + """Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self.data.copy() + data[self.mask] = data.min() - 1 + return data + + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + + op_name = op.__name__ + mask = None + if isinstance(other, IntegerArray): + other, mask = other.data, other.mask + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 0 and len(self) != len(other): + raise ValueError('Lengths must match to compare') + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(record=True): + with np.errstate(all='ignore'): + result = op(self.data, other) + + # nans propagate + if mask is None: + mask = self.mask + else: + mask = self.mask | mask + + result[mask] = True if op_name == 'ne' else False + return result + + name = '__{name}__'.format(name=op.__name__) + return set_function_name(cmp_method, name, cls) + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + + # may need to fill infs + # and mask wraparound + if is_float_dtype(result): + mask |= (result == np.inf) | (result == -np.inf) + + # floor div can be a float or an integer dependending + # on the operands + if (op_name in ['rfloordiv', 'floordiv'] and + (is_float_dtype(other) or is_float(other))): + result[mask] = np.nan + return result + + # by definition a float result + elif op_name in ['rtruediv', 'truediv', 'rdiv', 'div']: + result[mask] = np.nan + return result + + elif is_float_dtype(result): + # if our float result, try to downcast if possible + # if remains float, then mask and return as float + nonans = result[notna(result)] + maybe = maybe_downcast_to_dtype(nonans, self.dtype.numpy_dtype) + if not is_integer_dtype(maybe): + result[mask] = np.nan + return result + + return type(self)(result, mask=mask, dtype=self.dtype, copy=False) + + @classmethod + def _create_arithmetic_method(cls, op): + def integer_arithmetic_method(self, other): + + op_name = op.__name__ + mask = None + if isinstance(other, (ABCSeries, ABCIndexClass)): + other = getattr(other, 'values', other) + + if isinstance(other, IntegerArray): + other, mask = other.data, other.mask + elif getattr(other, 'ndim', 0) > 1: + raise TypeError("can only perform ops with 1-d structures") + elif is_list_like(other): + other = np.asarray(other) + if not other.ndim: + other = other.item() + elif other.ndim == 1: + if not 
(is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError( + "can only perform ops with numeric values") + else: + if not (is_float(other) or is_integer(other)): + raise TypeError("can only perform ops with numeric values") + + # nans propagate + if mask is None: + mask = self.mask + else: + mask = self.mask | mask + + with np.errstate(all='ignore'): + result = op(self.data, other) + + # divmod returns a tuple + if op_name == 'divmod': + div, mod = result + return (self._maybe_mask_result(div, mask, other, 'floordiv'), + self._maybe_mask_result(mod, mask, other, 'mod')) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = '__{name}__'.format(name=op.__name__) + return set_function_name(integer_arithmetic_method, name, cls) + + +IntegerArray._add_arithmetic_ops() +IntegerArray._add_comparison_ops() + + +module = sys.modules[__name__] + + +# create the Dtype +_dtypes = {} +for dtype in ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64']: + + if dtype.startswith('u'): + name = "U{}".format(dtype[1:].capitalize()) + else: + name = dtype.capitalize() + classname = "{}Dtype".format(name) + attributes_dict = {'type': getattr(np, dtype), + 'name': name} + dtype_type = type(classname, (IntegerDtype, ), attributes_dict) + setattr(module, classname, dtype_type) + + # register + registry.register(dtype_type) + _dtypes[dtype] = dtype_type() diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0bc6ad8499934..c84ca88754e4f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -651,7 +651,8 @@ def astype_nansafe(arr, dtype, copy=True): # dispatch on extension dtype if needed if is_extension_array_dtype(dtype): - return dtype.array_type._from_sequence(arr, copy=copy) + return dtype.construct_array_type()._from_sequence( + arr, dtype=dtype, copy=copy) if not isinstance(dtype, np.dtype): dtype = pandas_dtype(dtype) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 285e386e25613..7abffb54e6a10 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -531,6 +531,7 @@ def _concat_index_asobject(to_concat, name=None): to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 78fa6f8217157..8f5dbb6c8cf57 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -44,6 +44,7 @@ is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, + is_extension_array_dtype, is_hashable, needs_i8_conversion, is_iterator, is_list_like, @@ -266,18 +267,32 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, name=name) # categorical - if is_categorical_dtype(data) or is_categorical_dtype(dtype): + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): from .category import CategoricalIndex return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - if is_interval_dtype(data) or is_interval_dtype(dtype): + elif is_interval_dtype(data) or is_interval_dtype(dtype): from .interval import IntervalIndex closed = kwargs.get('closed', None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + # extension dtype + elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): + data = np.asarray(data) + if not (dtype is None or is_object_dtype(dtype)): + + # coerce to the provided dtype + data = 
dtype.construct_array_type()( + data, dtype=dtype, copy=False) + + # coerce to the object dtype + data = data.astype(object) + return Index(data, dtype=object, copy=True, name=name, + **kwargs) + # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -1175,10 +1190,15 @@ def _to_embed(self, keep_tz=False, dtype=None): def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self + elif is_categorical_dtype(dtype): from .category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) + + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) + try: if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e9b9a734ec5f5..f5fb0070ffc4b 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -638,7 +638,8 @@ def fill_zeros(result, x, y, name, fill): # if we have a fill of inf, then sign it correctly # (GH 6178 and PR 9308) if np.isinf(fill): - signs = np.sign(y if name.startswith(('r', '__r')) else x) + signs = y if name.startswith(('r', '__r')) else x + signs = np.sign(signs.astype('float', copy=False)) negative_inf_mask = (signs.ravel() < 0) & mask np.putmask(result, negative_inf_mask, -fill) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 1ddf77cf71a11..70f64e1aef9cf 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -135,6 +135,13 @@ def rfloordiv(left, right): def rmod(left, right): + # check if right is a string as % is the string + # formatting operation; this is a TypeError + # otherwise perform the op + if isinstance(right, compat.string_types): + raise TypeError("{typ} cannot perform the operation mod".format( + typ=type(left).__name__)) + return right % left @@ -1018,7 +1025,7 @@ def _align_method_SERIES(left, right, align_asobject=False): return left, right -def _construct_result(left, result, index, name, dtype): +def _construct_result(left, result, index, name, dtype=None): """ If the raw op result has a non-None name (e.g. it is an Index object) and the name argument is None, then passing name to the constructor will @@ -1030,7 +1037,7 @@ def _construct_result(left, result, index, name, dtype): return out -def _construct_divmod_result(left, result, index, name, dtype): +def _construct_divmod_result(left, result, index, name, dtype=None): """divmod returns a tuple of like indexed series instead of a single series. """ constructor = left._constructor @@ -1048,16 +1055,33 @@ def dispatch_to_extension_op(op, left, right): # The op calls will raise TypeError if the op is not defined # on the ExtensionArray + # TODO(jreback) + # we need to listify to avoid ndarray, or non-same-type extension array + # dispatching + if is_extension_array_dtype(left): - res_values = op(left.values, right) + + new_left = left.values + if (isinstance(right, np.ndarray) or + (is_extension_array_dtype(right) and + type(left) != type(right))): + new_right = list(right) + else: + new_right = right + else: - # We know that left is not ExtensionArray and is Series and right is - # ExtensionArray. 
Want to force ExtensionArray op to get called - res_values = op(list(left.values), right.values) + new_left = list(left.values) + new_right = right + + res_values = op(new_left, new_right) res_name = get_op_result_name(left, right) - return left._constructor(res_values, index=left.index, - name=res_name) + + if op.__name__ == 'divmod': + return _construct_divmod_result( + left, res_values, left.index, res_name) + + return _construct_result(left, res_values, left.index, res_name) def _arith_method_SERIES(cls, op, special): @@ -1074,7 +1098,6 @@ def _arith_method_SERIES(cls, op, special): def na_op(x, y): import pandas.core.computation.expressions as expressions - try: result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) except TypeError: @@ -1095,6 +1118,20 @@ def na_op(x, y): return result def safe_na_op(lvalues, rvalues): + """ + return the result of evaluating na_op on the passed in values + + try coercion to object type if the native types are not compatible + + Parameters + ---------- + lvalues : array-like + rvalues : array-like + + Raises + ------ + TypeError: invalid operation + """ try: with np.errstate(all='ignore'): return na_op(lvalues, rvalues) @@ -1105,14 +1142,21 @@ def safe_na_op(lvalues, rvalues): raise def wrapper(left, right): - if isinstance(right, ABCDataFrame): return NotImplemented left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) - if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): + if is_categorical_dtype(left): + raise TypeError("{typ} cannot perform the operation " + "{op}".format(typ=type(left).__name__, op=str_rep)) + + elif (is_extension_array_dtype(left) or + is_extension_array_dtype(right)): + return dispatch_to_extension_op(op, left, right) + + elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) return construct_result(left, result, index=left.index, name=res_name, @@ -1124,15 +1168,6 @@ def wrapper(left, right): index=left.index, name=res_name, dtype=result.dtype) - elif is_categorical_dtype(left): - raise TypeError("{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep)) - - elif (is_extension_array_dtype(left) or - (is_extension_array_dtype(right) and - not is_categorical_dtype(right))): - return dispatch_to_extension_op(op, left, right) - lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): @@ -1204,6 +1239,9 @@ def _comp_method_SERIES(cls, op, special): masker = _gen_eval_kwargs(op_name).get('masker', False) def na_op(x, y): + # TODO: + # should have guarantess on what x, y can be type-wise + # Extension Dtypes are not called here # dispatch to the categorical if we have a categorical # in either operand diff --git a/pandas/core/series.py b/pandas/core/series.py index 0bdb9d9cc23a6..83580c86ccbfb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4094,7 +4094,7 @@ def _try_cast(arr, take_fast_path): elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type() - subarr = array_type(subarr, copy=copy) + subarr = array_type(subarr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: raise diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 640b894e2245f..b6b81bb941a59 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -47,7 +47,7 @@ class TestMyDtype(BaseDtypeTests): from 
.groupby import BaseGroupbyTests # noqa from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa -from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests # noqa +from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e9df49780f119..886a0f66b5f66 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -226,12 +226,14 @@ def test_reindex(self, data, na_value): n = len(data) result = s.reindex([-1, 0, n]) expected = pd.Series( - data._from_sequence([na_value, data[0], na_value]), + data._from_sequence([na_value, data[0], na_value], + dtype=s.dtype), index=[-1, 0, n]) self.assert_series_equal(result, expected) result = s.reindex([n, n + 1]) - expected = pd.Series(data._from_sequence([na_value, na_value]), + expected = pd.Series(data._from_sequence([na_value, na_value], + dtype=s.dtype), index=[n, n + 1]) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 659b9757ac1e3..f8d2f8314d2b1 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -3,10 +3,12 @@ import operator import pandas as pd +from pandas.core import ops from .base import BaseExtensionTests class BaseOpsUtil(BaseExtensionTests): + def get_op_from_name(self, op_name): short_opname = op_name.strip('_') try: @@ -32,6 +34,20 @@ def _check_op(self, s, op, other, exc=NotImplementedError): with pytest.raises(exc): op(s, other) + def _check_divmod_op(self, s, op, other, exc=NotImplementedError): + # divmod has multiple return values, so check separatly + if exc is None: + result_div, result_mod = op(s, other) + if op is divmod: + expected_div, expected_mod = s // other, s % other + else: + expected_div, expected_mod = other // s, other % s + self.assert_series_equal(result_div, expected_div) + self.assert_series_equal(result_mod, expected_mod) + else: + with pytest.raises(exc): + divmod(s, other) + class BaseArithmeticOpsTests(BaseOpsUtil): """Various Series and DataFrame arithmetic ops methods.""" @@ -50,8 +66,8 @@ def test_arith_array(self, data, all_arithmetic_operators): def test_divmod(self, data): s = pd.Series(data) - self._check_op(s, divmod, 1, exc=TypeError) - self._check_op(1, divmod, s, exc=TypeError) + self._check_divmod_op(s, divmod, 1, exc=TypeError) + self._check_divmod_op(1, ops.rdivmod, s, exc=TypeError) def test_error(self, data, all_arithmetic_operators): # invalid ops diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c83726c5278a5..0340289e0b674 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -82,7 +82,8 @@ def test_concat_columns(self, data, na_value): # non-aligned df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3]) expected = pd.DataFrame({ - 'A': data._from_sequence(list(data[:3]) + [na_value]), + 'A': data._from_sequence(list(data[:3]) + [na_value], + dtype=data.dtype), 'B': [np.nan, 1, 2, 3]}) result = pd.concat([df1, df2], axis=1) @@ -96,8 +97,10 @@ def test_align(self, data, na_value): r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = 
pd.Series(data._from_sequence(list(a) + [na_value])) - e2 = pd.Series(data._from_sequence([na_value] + list(b))) + e1 = pd.Series(data._from_sequence(list(a) + [na_value], + dtype=data.dtype)) + e2 = pd.Series(data._from_sequence([na_value] + list(b), + dtype=data.dtype)) self.assert_series_equal(r1, e1) self.assert_series_equal(r2, e2) @@ -109,8 +112,10 @@ def test_align_frame(self, data, na_value): ) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value])}) - e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b))}) + e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value], + dtype=data.dtype)}) + e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b), + dtype=data.dtype)}) self.assert_frame_equal(r1, e1) self.assert_frame_equal(r2, e2) @@ -120,7 +125,8 @@ def test_align_series_frame(self, data, na_value): df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) - e1 = pd.Series(data._from_sequence(list(data) + [na_value]), + e1 = pd.Series(data._from_sequence(list(data) + [na_value], + dtype=data.dtype), name=ser.name) self.assert_series_equal(r1, e1) @@ -153,7 +159,8 @@ def test_merge(self, data, na_value): res = pd.merge(df1, df2) exp = pd.DataFrame( {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], - 'ext': data._from_sequence([data[0], data[0], data[1]])}) + 'ext': data._from_sequence([data[0], data[0], data[1]], + dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) res = pd.merge(df1, df2, how='outer') @@ -161,5 +168,6 @@ def test_merge(self, data, na_value): {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4], 'key': [0, 0, 1, 2, 3], 'ext': data._from_sequence( - [data[0], data[0], data[1], data[2], na_value])}) + [data[0], data[0], data[1], data[2], na_value], + dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 33adebbbe5780..373da1af9ef24 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -1,6 +1,5 @@ import decimal import numbers -import random import sys import numpy as np @@ -38,7 +37,7 @@ def construct_from_string(cls, string): class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin): dtype = DecimalDtype() - def __init__(self, values, copy=False): + def __init__(self, values, dtype=None, copy=False): for val in values: if not isinstance(val, self.dtype.type): raise TypeError("All values must be of type " + @@ -54,8 +53,8 @@ def __init__(self, values, copy=False): # self._values = self.values = self.data @classmethod - def _from_sequence(cls, scalars, copy=False): - return cls(scalars) + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, copy=copy) @classmethod def _from_factorized(cls, values, original): @@ -117,7 +116,3 @@ def _concat_same_type(cls, to_concat): DecimalArray._add_arithmetic_ops() DecimalArray._add_comparison_ops() - - -def make_data(): - return [decimal.Decimal(random.random()) for _ in range(100)] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 8fd3d1a57f6c8..b851ad1d06d4d 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -1,5 +1,6 @@ import decimal +import random import numpy as np import pandas as pd import pandas.util.testing as tm @@ -7,7 
+8,12 @@ from pandas.tests.extension import base -from .array import DecimalDtype, DecimalArray, make_data +from .array import DecimalDtype, DecimalArray + + +@pytest.fixture +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] @pytest.fixture diff --git a/pandas/tests/extension/integer/__init__.py b/pandas/tests/extension/integer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py new file mode 100644 index 0000000000000..773581a826b8d --- /dev/null +++ b/pandas/tests/extension/integer/test_integer.py @@ -0,0 +1,604 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm +import pytest + +from pandas.tests.extension import base +from pandas.api.types import ( + is_integer, is_scalar, is_float, is_float_dtype) +from pandas.core.dtypes.generic import ABCIndexClass + +from pandas.core.arrays import ( + to_integer_array, IntegerArray) +from pandas.core.arrays.integer import ( + Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) + + +@pytest.fixture +def make_data(): + return (list(range(8)) + + [np.nan] + + list(range(10, 98)) + + [np.nan] + + [99, 100]) + + +@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return IntegerArray(make_data(), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return IntegerArray([np.nan, 1], dtype=dtype) + + +@pytest.fixture +def data_repeated(data): + def gen(count): + for _ in range(count): + yield data + yield gen + + +@pytest.fixture +def data_for_sorting(dtype): + return IntegerArray([1, 2, 0], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return IntegerArray([1, np.nan, 0], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are np.nan + return lambda x, y: np.isnan(x) and np.isnan(y) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(dtype): + b = 1 + a = 0 + c = 2 + na = np.nan + return IntegerArray([b, b, na, na, a, a, b, c], dtype=dtype) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == 'i' + else: + assert np.dtype(dtype.type).kind == 'u' + assert dtype.name is not None + + +class BaseInteger(object): + + def assert_index_equal(self, left, right, *args, **kwargs): + + left_na = left.isna() + right_na = right.isna() + + tm.assert_numpy_array_equal(left_na, right_na) + return tm.assert_index_equal(left[~left_na], + right[~right_na], + *args, **kwargs) + + def assert_series_equal(self, left, right, *args, **kwargs): + + left_na = left.isna() + right_na = right.isna() + + tm.assert_series_equal(left_na, right_na) + return tm.assert_series_equal(left[~left_na], + right[~right_na], + *args, **kwargs) + + def assert_frame_equal(self, left, right, *args, **kwargs): + # TODO(EA): select_dtypes + tm.assert_index_equal( + left.columns, right.columns, + exact=kwargs.get('check_column_type', 'equiv'), + check_names=kwargs.get('check_names', True), + check_exact=kwargs.get('check_exact', False), + check_categorical=kwargs.get('check_categorical', True), + obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + + integers = (left.dtypes == 'integer').index + + for col in integers: + 
self.assert_series_equal(left[col], right[col], + *args, **kwargs) + + left = left.drop(columns=integers) + right = right.drop(columns=integers) + tm.assert_frame_equal(left, right, *args, **kwargs) + + +class TestDtype(BaseInteger, base.BaseDtypeTests): + + @pytest.mark.skip(reason="using multiple dtypes") + def test_is_dtype_unboxes_dtype(self): + # we have multiple dtypes, so skip + pass + + def test_array_type_with_arg(self, data, dtype): + assert dtype.construct_array_type() is IntegerArray + + +class TestArithmeticOps(BaseInteger, base.BaseArithmeticOpsTests): + + def _check_divmod_op(self, s, op, other, exc=None): + super(TestArithmeticOps, self)._check_divmod_op(s, op, other, None) + + def _check_op(self, s, op_name, other, exc=None): + op = self.get_op_from_name(op_name) + result = op(s, other) + + # compute expected + mask = s.isna() + + # other array is an Integer + if isinstance(other, IntegerArray): + omask = getattr(other, 'mask', None) + mask = getattr(other, 'data', other) + if omask is not None: + mask |= omask + + # float result type or float op + if ((is_float_dtype(other) or is_float(other) or + op_name in ['__rtruediv__', '__truediv__', + '__rdiv__', '__div__'])): + rs = s.astype('float') + expected = op(rs, other) + self._check_op_float(result, expected, mask, s, op_name, other) + + # integer result type + else: + rs = pd.Series(s.values.data) + expected = op(rs, other) + self._check_op_integer(result, expected, mask, s, op_name, other) + + def _check_op_float(self, result, expected, mask, s, op_name, other): + # check comparisions that are resulting in float dtypes + + expected[mask] = np.nan + self.assert_series_equal(result, expected) + + def _check_op_integer(self, result, expected, mask, s, op_name, other): + # check comparisions that are resulting in integer dtypes + + # to compare properly, we convert the expected + # to float, mask to nans and convert infs + # if we have uints then we process as uints + # then conert to float + # and we ultimately want to create a IntArray + # for comparisons + + fill_value = 0 + + # mod/rmod turn floating 0 into NaN while + # integer works as expected (no nan) + if op_name in ['__mod__', '__rmod__']: + if is_scalar(other): + if other == 0: + expected[s.values == 0] = 0 + else: + expected = expected.fillna(0) + else: + expected[(s.values == 0) & + ((expected == 0) | expected.isna())] = 0 + + try: + expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + original = expected + expected = expected.astype(s.dtype) + + except ValueError: + + expected = expected.astype(float) + expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + original = expected + expected = expected.astype(s.dtype) + + expected[mask] = np.nan + + # assert that the expected astype is ok + # (skip for unsigned as they have wrap around) + if not s.dtype.is_unsigned_integer: + original = pd.Series(original) + + # we need to fill with 0's to emulate what an astype('int') does + # (truncation) for certain ops + if op_name in ['__rtruediv__', '__rdiv__']: + mask |= original.isna() + original = original.fillna(0).astype('int') + + original = original.astype('float') + original[mask] = np.nan + self.assert_series_equal(original, expected.astype('float')) + + # assert our expected result + self.assert_series_equal(result, expected) + + def test_arith_integer_array(self, data, all_arithmetic_operators): + # we operate with a rhs of an integer array + + op = all_arithmetic_operators + + s = pd.Series(data) + rhs = pd.Series([1] * len(data), 
dtype=data.dtype) + rhs.iloc[-1] = np.nan + + self._check_op(s, op, rhs) + + def test_arith_scalar(self, data, all_arithmetic_operators): + # scalar + op = all_arithmetic_operators + + s = pd.Series(data) + self._check_op(s, op, 1, exc=TypeError) + + def test_arith_array(self, data, all_arithmetic_operators): + # ndarray & other series + op = all_arithmetic_operators + + s = pd.Series(data) + other = np.ones(len(s), dtype=s.dtype.type) + self._check_op(s, op, other, exc=TypeError) + + def test_arith_coerce_scalar(self, data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + + other = 0.01 + self._check_op(s, op, other) + + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops('foo') + with pytest.raises(TypeError): + ops(pd.Timestamp('20180101')) + + # invalid array-likes + with pytest.raises(TypeError): + ops(pd.Series('foo', index=s.index)) + + if op != '__rpow__': + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + with pytest.raises(TypeError): + ops(pd.Series(pd.date_range('20180101', periods=len(s)))) + + # 2d + with pytest.raises(TypeError): + opa(pd.DataFrame({'A': s})) + with pytest.raises(TypeError): + opa(np.arange(len(s)).reshape(-1, len(s))) + + +class TestComparisonOps(BaseInteger, base.BaseComparisonOpsTests): + + def _compare_other(self, s, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = op(s, other) + expected = pd.Series(op(data.data, other)) + + # fill the nan locations + expected[data.mask] = True if op_name == '__ne__' else False + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data.data) + expected = op(expected, other) + + # fill the nan locations + expected[data.mask] = True if op_name == '__ne__' else False + + tm.assert_series_equal(result, expected) + + +class TestInterface(BaseInteger, base.BaseInterfaceTests): + + def test_repr_array(self, data): + result = repr(data) + + # not long + assert '...' not in result + + assert 'dtype=' in result + assert 'IntegerArray' in result + + def test_repr_array_long(self, data): + # some arrays may be able to assert a ... in the repr + with pd.option_context('display.max_seq_items', 1): + result = repr(data) + + assert '...' 
in result + assert 'length' in result + + +class TestConstructors(BaseInteger, base.BaseConstructorsTests): + + def test_from_dtype_from_float(self, data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(np.array(data).astype('float'), dtype=str(dtype)) + self.assert_series_equal(result, expected) + + # from int / list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + self.assert_series_equal(result, expected) + + # from int / array + expected = pd.Series(data).dropna().reset_index(drop=True) + dropped = np.array(data.dropna()).astype(np.dtype((dtype.type))) + result = pd.Series(dropped, dtype=str(dtype)) + self.assert_series_equal(result, expected) + + +class TestReshaping(BaseInteger, base.BaseReshapingTests): + + def test_concat_mixed_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/20762 + df1 = pd.DataFrame({'A': data[:3]}) + df2 = pd.DataFrame({"A": [1, 2, 3]}) + df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category') + df4 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])}) + dfs = [df1, df2, df3, df4] + + # dataframes + result = pd.concat(dfs) + expected = pd.concat([x.astype(object) for x in dfs]) + self.assert_frame_equal(result, expected) + + # series + result = pd.concat([x['A'] for x in dfs]) + expected = pd.concat([x['A'].astype(object) for x in dfs]) + self.assert_series_equal(result, expected) + + result = pd.concat([df1, df2]) + expected = pd.concat([df1.astype('object'), df2.astype('object')]) + self.assert_frame_equal(result, expected) + + # concat of an Integer and Int coerces to object dtype + # TODO(jreback) once integrated this would + # be a result of Integer + result = pd.concat([df1['A'], df2['A']]) + expected = pd.concat([df1['A'].astype('object'), + df2['A'].astype('object')]) + self.assert_series_equal(result, expected) + + +class TestGetitem(BaseInteger, base.BaseGetitemTests): + pass + + +class TestMissing(BaseInteger, base.BaseMissingTests): + pass + + +class TestMethods(BaseInteger, base.BaseMethodsTests): + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts( + dropna=dropna).sort_index() + expected.index = expected.index.astype(all_data.dtype) + + self.assert_series_equal(result, expected) + + def test_combine_add(self, data_repeated): + # GH 20825 + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + + # fundamentally this is not a great operation + # as overflow / underflow can easily happen here + # e.g. 
int8 + int8 + def scalar_add(a, b): + + # TODO; should really be a type specific NA + if pd.isna(a) or pd.isna(b): + return np.nan + if is_integer(a): + a = int(a) + elif is_integer(b): + b = int(b) + return a + b + + result = s1.combine(s2, scalar_add) + expected = pd.Series( + orig_data1._from_sequence([scalar_add(a, b) for (a, b) in + zip(orig_data1, + orig_data2)])) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series( + orig_data1._from_sequence([a + val for a in list(orig_data1)])) + self.assert_series_equal(result, expected) + + +class TestCasting(BaseInteger, base.BaseCastingTests): + + @pytest.mark.parametrize('dropna', [True, False]) + def test_construct_index(self, all_data, dropna): + # ensure that we do not coerce to Float64Index, rather + # keep as Index + + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Index(IntegerArray(other, + dtype=all_data.dtype)) + expected = pd.Index(other, dtype=object) + + self.assert_index_equal(result, expected) + + @pytest.mark.parametrize('dropna', [True, False]) + def test_astype_index(self, all_data, dropna): + # as an int/uint index to Index + + all_data = all_data[:10] + if dropna: + other = all_data[~all_data.isna()] + else: + other = all_data + + dtype = all_data.dtype + idx = pd.Index(np.array(other)) + assert isinstance(idx, ABCIndexClass) + + result = idx.astype(dtype) + expected = idx.astype(object).astype(dtype) + self.assert_index_equal(result, expected) + + +class TestGroupby(BaseInteger, base.BaseGroupbyTests): + + @pytest.mark.xfail(reason="groupby not working") + def test_groupby_extension_no_sort(self, data_for_grouping): + super(TestGroupby, self).test_groupby_extension_no_sort( + data_for_grouping) + + @pytest.mark.xfail(reason="groupby not working") + @pytest.mark.parametrize('as_index', [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + super(TestGroupby, self).test_groupby_extension_agg( + as_index, data_for_grouping) + + +def test_frame_repr(data_missing): + + df = pd.DataFrame({'A': data_missing}) + result = repr(df) + expected = ' A\n0 NaN\n1 1' + assert result == expected + + +def test_conversions(data_missing): + + # astype to object series + df = pd.DataFrame({'A': data_missing}) + result = df['A'].astype('object') + expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A') + tm.assert_series_equal(result, expected) + + # convert to object ndarray + # we assert that we are exactly equal + # including type conversions of scalars + result = df['A'].astype('object').values + expected = np.array([np.nan, 1], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + for r, e in zip(result, expected): + if pd.isnull(r): + assert pd.isnull(e) + elif is_integer(r): + # PY2 can be int or long + assert r == e + assert is_integer(e) + else: + assert r == e + assert type(r) == type(e) + + +@pytest.mark.parametrize( + 'values', + [ + ['foo', 'bar'], + 'foo', + 1, + 1.0, + pd.date_range('20130101', periods=2), + np.array(['foo'])]) +def test_to_integer_array_error(values): + # error in converting existing arrays to IntegerArrays + with pytest.raises(TypeError): + to_integer_array(values) + + +@pytest.mark.parametrize( + 'values, dtype', + [ + (np.array([1], dtype='int64'), Int64Dtype), + (np.array([1, np.nan]), Int64Dtype)]) +def test_to_integer_array(values, dtype): + # convert existing arrays to IntegerArrays + 
result = to_integer_array(values) + expected = IntegerArray(values, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_cross_type_arithmetic(): + + df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B': pd.Series([1, np.nan, 3], dtype='UInt8'), + 'C': [1, 2, 3]}) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype='Int64') + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, False]) + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype='Int64') + tm.assert_series_equal(result, expected) + + +# TODO(jreback) - these need testing / are broken + +# shift + +# set_index (destroys type) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 160bf259e1e32..34c397252a8bb 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -13,8 +13,6 @@ import collections import itertools import numbers -import random -import string import sys import numpy as np @@ -54,7 +52,7 @@ def construct_from_string(cls, string): class JSONArray(ExtensionArray): dtype = JSONDtype() - def __init__(self, values, copy=False): + def __init__(self, values, dtype=None, copy=False): for val in values: if not isinstance(val, self.dtype.type): raise TypeError("All values must be of type " + @@ -69,7 +67,7 @@ def __init__(self, values, copy=False): # self._values = self.values = self.data @classmethod - def _from_sequence(cls, scalars, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars) @classmethod @@ -180,10 +178,3 @@ def _values_for_argsort(self): # cast them to an (N, P) array, instead of an (N,) array of tuples. frozen = [()] + list(tuple(x.items()) for x in self) return np.array(frozen, dtype=object)[1:] - - -def make_data(): - # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer - return [collections.UserDict([ - (random.choice(string.ascii_letters), random.randint(0, 100)) - for _ in range(random.randint(0, 10))]) for _ in range(100)] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7eeaf7946663e..126607907a8da 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -1,5 +1,7 @@ import operator import collections +import random +import string import pytest @@ -8,11 +10,19 @@ from pandas.compat import PY2, PY36 from pandas.tests.extension import base -from .array import JSONArray, JSONDtype, make_data +from .array import JSONArray, JSONDtype pytestmark = pytest.mark.skipif(PY2, reason="Py2 doesn't have a UserDict") +@pytest.fixture +def make_data(): + # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer + return [collections.UserDict([ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10))]) for _ in range(100)] + + @pytest.fixture def dtype(): return JSONDtype() @@ -203,7 +213,8 @@ def test_combine_add(self, data_repeated): class TestCasting(BaseJSON, base.BaseCastingTests): - @pytest.mark.xfail + + @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") def test_astype_str(self): """This currently fails in NumPy on np.array(self, dtype=str) with From 4faa4c6a20db3c35d487b462f85c025a9a6f3ea0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 10:39:31 -0500 Subject: [PATCH 02/20] update for review comments --- pandas/core/arrays/integer.py | 9 +++++---- pandas/tests/extension/integer/test_integer.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index a7c0518bb4ed9..0e59828727e63 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -311,7 +311,7 @@ def _concat_same_type(cls, to_concat): return cls(data, mask=mask, dtype=to_concat[0].dtype) def astype(self, dtype, copy=True): - """Cast to a NumPy array with 'dtype'. + """Cast to a NumPy array or IntegerArray with 'dtype'. Parameters ---------- @@ -324,8 +324,8 @@ def astype(self, dtype, copy=True): Returns ------- - array : ndarray - NumPy ndarray with 'dtype' for its dtype. + array : ndarray or IntegerArray + NumPy ndarray or IntergerArray with 'dtype' for its dtype. Raises ------ @@ -502,7 +502,8 @@ def integer_arithmetic_method(self, other): if isinstance(other, IntegerArray): other, mask = other.data, other.mask elif getattr(other, 'ndim', 0) > 1: - raise TypeError("can only perform ops with 1-d structures") + raise NotImplementedError( + "can only perform ops with 1-d structures") elif is_list_like(other): other = np.asarray(other) if not other.ndim: diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 773581a826b8d..a8bfda82f0211 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -296,9 +296,9 @@ def test_error(self, data, all_arithmetic_operators): ops(pd.Series(pd.date_range('20180101', periods=len(s)))) # 2d - with pytest.raises(TypeError): + with pytest.raises(NotImplementedError): opa(pd.DataFrame({'A': s})) - with pytest.raises(TypeError): + with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) From 712b52d2fff17ccd1bb56bb37ffd0d86c09d8d98 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 14:12:57 -0500 Subject: [PATCH 03/20] update docs of IntegerDtype remove mask arg from _from_sqequence in IntegerArray --- pandas/core/arrays/integer.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0e59828727e63..c4db155447a5c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -22,7 +22,16 @@ format_object_summary, format_object_attrs, default_pprint) -class IntegerDtype(ExtensionDtype): +class _IntegerDtype(ExtensionDtype): + """ + An ExtensionDtype to hold a single size & kind of integer dtype. + + These specific implementations are subclasses of the non-public + _IntegerDtype. For example we have Int8Dtype to represnt signed int 8s. + + The attributes name & type are set when these subclasses are created. 
+ """ + name = None type = None na_value = np.nan @@ -177,8 +186,8 @@ def __init__(self, values, mask=None, dtype=None, copy=False): values, dtype=dtype, mask=mask, copy=copy) @classmethod - def _from_sequence(cls, scalars, mask=None, dtype=None, copy=False): - return cls(scalars, mask=mask, dtype=dtype, copy=copy) + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, dtype=dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): @@ -335,7 +344,7 @@ def astype(self, dtype, copy=True): """ # if we are astyping to an existing IntegerDtype we can fastpath - if isinstance(dtype, IntegerDtype): + if isinstance(dtype, _IntegerDtype): result = self.data.astype(dtype.numpy_dtype, casting='same_kind', copy=False) return type(self)(result, mask=self.mask, @@ -556,7 +565,7 @@ def integer_arithmetic_method(self, other): classname = "{}Dtype".format(name) attributes_dict = {'type': getattr(np, dtype), 'name': name} - dtype_type = type(classname, (IntegerDtype, ), attributes_dict) + dtype_type = type(classname, (_IntegerDtype, ), attributes_dict) setattr(module, classname, dtype_type) # register From 74f392a29baccceddc88ceedffb1f00dec6da56c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 15:28:03 -0500 Subject: [PATCH 04/20] review comments --- pandas/core/arrays/base.py | 3 ++- pandas/core/arrays/integer.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c0697dd29e4d0..139124e47ed45 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -97,7 +97,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. dtype : Dtype, optional - consruct for this particular dtype + construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. 
copy : boolean, default False if True, copy the underlying data Returns diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c4db155447a5c..aff17a4a86dd9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -182,6 +182,18 @@ def dtype(self): return _dtypes[str(self.data.dtype)] def __init__(self, values, mask=None, dtype=None, copy=False): + """ + Parameters + ---------- + values : 1D list-like / IntegerArray + mask : 1D list-like, optional + dtype : subclass of _IntegerDtype, optional + copy : bool, default False + + Returns + ------- + IntegerArray + """ self.data, self.mask = coerce_to_array( values, dtype=dtype, mask=mask, copy=copy) From 3889feb29647e0b9bac02b3f42bc79b9959d4ed0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 15:44:09 -0500 Subject: [PATCH 05/20] make data & mask private attributes --- pandas/core/arrays/integer.py | 74 +++++++++---------- .../tests/extension/integer/test_integer.py | 10 +-- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index aff17a4a86dd9..0bdadbd089e93 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -117,7 +117,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): """ if isinstance(values, IntegerArray): - values, mask = values.data, values.mask + values, mask = values._data, values._mask if copy: values = values.copy() mask = mask.copy() @@ -179,7 +179,7 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): @cache_readonly def dtype(self): - return _dtypes[str(self.data.dtype)] + return _dtypes[str(self._data.dtype)] def __init__(self, values, mask=None, dtype=None, copy=False): """ @@ -194,7 +194,7 @@ def __init__(self, values, mask=None, dtype=None, copy=False): ------- IntegerArray """ - self.data, self.mask = coerce_to_array( + self._data, self._mask = coerce_to_array( values, dtype=dtype, mask=mask, copy=copy) @classmethod @@ -207,19 +207,19 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if is_integer(item): - if self.mask[item]: + if self._mask[item]: return self.dtype.na_value - return self.data[item] - return type(self)(self.data[item], - mask=self.mask[item], + return self._data[item] + return type(self)(self._data[item], + mask=self._mask[item], dtype=self.dtype) def _coerce_to_ndarray(self): """ coerce to an ndarary, preserving my scalar types """ # TODO(jreback) make this better - data = self.data.astype(object) - data[self.mask] = self._na_value + data = self._data.astype(object) + data[self._mask] = self._na_value return data def __array__(self, dtype=None): @@ -237,10 +237,10 @@ def __iter__(self): # arrays as list-like. The default implementation makes successive # calls to ``__getitem__``, which may be slower than necessary. 
for i in range(len(self)): - if self.mask[i]: + if self._mask[i]: yield self.dtype.na_value else: - yield self.data[i] + yield self._data[i] def _formatting_values(self): # type: () -> np.ndarray @@ -252,10 +252,10 @@ def take(self, indexer, allow_fill=False, fill_value=None): # we always fill with 1 internally # to avoid upcasting data_fill_value = 1 if isna(fill_value) else fill_value - result = take(self.data, indexer, fill_value=data_fill_value, + result = take(self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill) - mask = take(self.mask, indexer, fill_value=True, + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) # if we are filling @@ -270,7 +270,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): return type(self)(result, mask=mask, dtype=self.dtype) def copy(self, deep=False): - data, mask = self.data, self.mask + data, mask = self._data, self._mask if deep: data = copy.deepcopy(data) mask = copy.deepcopy(mask) @@ -289,11 +289,11 @@ def __setitem__(self, key, value): value = value[0] mask = mask[0] - self.data[key] = value - self.mask[key] = mask + self._data[key] = value + self._mask[key] = mask def __len__(self): - return len(self.data) + return len(self._data) def __repr__(self): """ @@ -316,10 +316,10 @@ def __repr__(self): @property def nbytes(self): - return self.data.nbytes + self.mask.nbytes + return self._data.nbytes + self._mask.nbytes def isna(self): - return self.mask + return self._mask @property def _na_value(self): @@ -327,8 +327,8 @@ def _na_value(self): @classmethod def _concat_same_type(cls, to_concat): - data = np.concatenate([x.data for x in to_concat]) - mask = np.concatenate([x.mask for x in to_concat]) + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) return cls(data, mask=mask, dtype=to_concat[0].dtype) def astype(self, dtype, copy=True): @@ -357,9 +357,9 @@ def astype(self, dtype, copy=True): # if we are astyping to an existing IntegerDtype we can fastpath if isinstance(dtype, _IntegerDtype): - result = self.data.astype(dtype.numpy_dtype, - casting='same_kind', copy=False) - return type(self)(result, mask=self.mask, + result = self._data.astype(dtype.numpy_dtype, + casting='same_kind', copy=False) + return type(self)(result, mask=self._mask, dtype=dtype, copy=False) # coerce @@ -376,7 +376,7 @@ def _ndarray_values(self): The expectation is that this is cheap to compute, and is primarily used for interacting with our indexers. 
""" - return self.data + return self._data def value_counts(self, dropna=True): """ @@ -402,7 +402,7 @@ def value_counts(self, dropna=True): from pandas import Index, Series # compute counts on the data with no nans - data = self.data[~self.mask] + data = self._data[~self._mask] value_counts = Index(data).value_counts() array = value_counts.values @@ -417,7 +417,7 @@ def value_counts(self, dropna=True): # TODO(extension) # appending to an Index *always* infers # w/o passing the dtype - array = np.append(array, [self.mask.sum()]) + array = np.append(array, [self._mask.sum()]) index = Index(np.concatenate( [index.values, np.array([np.nan], dtype=object)]), dtype=object) @@ -438,8 +438,8 @@ def _values_for_argsort(self): -------- ExtensionArray.argsort """ - data = self.data.copy() - data[self.mask] = data.min() - 1 + data = self._data.copy() + data[self._mask] = data.min() - 1 return data @classmethod @@ -449,7 +449,7 @@ def cmp_method(self, other): op_name = op.__name__ mask = None if isinstance(other, IntegerArray): - other, mask = other.data, other.mask + other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): @@ -459,13 +459,13 @@ def cmp_method(self, other): # comparisons, this will raise in the future with warnings.catch_warnings(record=True): with np.errstate(all='ignore'): - result = op(self.data, other) + result = op(self._data, other) # nans propagate if mask is None: - mask = self.mask + mask = self._mask else: - mask = self.mask | mask + mask = self._mask | mask result[mask] = True if op_name == 'ne' else False return result @@ -521,7 +521,7 @@ def integer_arithmetic_method(self, other): other = getattr(other, 'values', other) if isinstance(other, IntegerArray): - other, mask = other.data, other.mask + other, mask = other._data, other._mask elif getattr(other, 'ndim', 0) > 1: raise NotImplementedError( "can only perform ops with 1-d structures") @@ -539,12 +539,12 @@ def integer_arithmetic_method(self, other): # nans propagate if mask is None: - mask = self.mask + mask = self._mask else: - mask = self.mask | mask + mask = self._mask | mask with np.errstate(all='ignore'): - result = op(self.data, other) + result = op(self._data, other) # divmod returns a tuple if op_name == 'divmod': diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index a8bfda82f0211..e98e559858e2b 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -171,7 +171,7 @@ def _check_op(self, s, op_name, other, exc=None): # integer result type else: - rs = pd.Series(s.values.data) + rs = pd.Series(s.values._data) expected = op(rs, other) self._check_op_integer(result, expected, mask, s, op_name, other) @@ -309,10 +309,10 @@ def _compare_other(self, s, data, op_name, other): # array result = op(s, other) - expected = pd.Series(op(data.data, other)) + expected = pd.Series(op(data._data, other)) # fill the nan locations - expected[data.mask] = True if op_name == '__ne__' else False + expected[data._mask] = True if op_name == '__ne__' else False tm.assert_series_equal(result, expected) @@ -320,11 +320,11 @@ def _compare_other(self, s, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data.data) + expected = pd.Series(data._data) expected = op(expected, other) # fill the nan locations - expected[data.mask] = True if op_name == '__ne__' else False + expected[data._mask] = True if 
op_name == '__ne__' else False tm.assert_series_equal(result, expected) From e5b86415d9ef2ab87de2838d71029e5bb0f4b995 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 16:22:56 -0500 Subject: [PATCH 06/20] add dtype to to_integer_array --- pandas/core/arrays/integer.py | 21 +++++++++++-------- .../tests/extension/integer/test_integer.py | 13 ++++++------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0bdadbd089e93..e1dd98cfb4754 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -74,11 +74,13 @@ def construct_from_string(cls, string): "'{}'".format(cls, string)) -def to_integer_array(values): +def to_integer_array(values, dtype=None): """ Parameters ---------- values : 1D list-like + dtype : dtype, optional + dtype to coerce Returns ------- @@ -88,14 +90,6 @@ def to_integer_array(values): ------ TypeError if incompatible types """ - values = np.array(values, copy=False) - try: - dtype = _dtypes[str(values.dtype)] - except KeyError: - if is_float_dtype(values): - return IntegerArray(values) - - raise TypeError("Incompatible dtype for {}".format(values.dtype)) return IntegerArray(values, dtype=dtype, copy=False) @@ -115,9 +109,18 @@ def coerce_to_array(values, dtype, mask=None, copy=False): ------- tuple of (values, mask) """ + if dtype is not None: + if not issubclass(type(dtype), _IntegerDtype): + try: + dtype = _dtypes[str(np.dtype(dtype))] + except KeyError: + raise ValueError("invalid dtype specified {}".format(dtype)) if isinstance(values, IntegerArray): values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + if copy: values = values.copy() mask = mask.copy() diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index e98e559858e2b..65a40ecfcf352 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -567,14 +567,15 @@ def test_to_integer_array_error(values): @pytest.mark.parametrize( - 'values, dtype', + 'values, to_dtype, result_dtype', [ - (np.array([1], dtype='int64'), Int64Dtype), - (np.array([1, np.nan]), Int64Dtype)]) -def test_to_integer_array(values, dtype): + (np.array([1], dtype='int64'), None, Int64Dtype), + (np.array([1, np.nan]), None, Int64Dtype), + (np.array([1, np.nan]), 'int8', Int8Dtype)]) +def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays - result = to_integer_array(values) - expected = IntegerArray(values, dtype=dtype) + result = to_integer_array(values, dtype=to_dtype) + expected = IntegerArray(values, dtype=result_dtype()) tm.assert_extension_array_equal(result, expected) From d073e57765efc0caf7bfcc3e65844ae44ad38ee5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 16:48:00 -0500 Subject: [PATCH 07/20] remove uneeded code & copies --- pandas/core/arrays/integer.py | 6 +----- pandas/core/indexes/base.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e1dd98cfb4754..4202411306111 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -148,10 +148,6 @@ def coerce_to_array(values, dtype, mask=None, copy=False): if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") - # avoid float->int numpy conversion issues - if is_object_dtype(values): - mask |= isna(values) - 
# infer dtype if needed if dtype is None: if is_integer_dtype(values): @@ -270,7 +266,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): result[fill_mask] = fill_value mask = mask ^ fill_mask - return type(self)(result, mask=mask, dtype=self.dtype) + return type(self)(result, mask=mask, dtype=self.dtype, copy=False) def copy(self, deep=False): data, mask = self._data, self._mask diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8f5dbb6c8cf57..11cc92773f793 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -290,7 +290,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # coerce to the object dtype data = data.astype(object) - return Index(data, dtype=object, copy=True, name=name, + return Index(data, dtype=object, copy=copy, name=name, **kwargs) # index-like From 2f0818181659e07ba4df61508c025a2e64251f64 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 17:31:15 -0500 Subject: [PATCH 08/20] handle numpy scalars & more tests --- pandas/core/arrays/integer.py | 24 ++++--------------- pandas/core/ops.py | 16 +++++++++---- .../tests/extension/integer/test_integer.py | 10 ++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4202411306111..51dafabc943e7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -16,7 +16,6 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.io.formats.printing import ( format_object_summary, format_object_attrs, default_pprint) @@ -487,27 +486,14 @@ def _maybe_mask_result(self, result, mask, other, op_name): if is_float_dtype(result): mask |= (result == np.inf) | (result == -np.inf) - # floor div can be a float or an integer dependending - # on the operands - if (op_name in ['rfloordiv', 'floordiv'] and - (is_float_dtype(other) or is_float(other))): + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if ((is_float_dtype(other) or is_float(other)) or + (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])): result[mask] = np.nan return result - # by definition a float result - elif op_name in ['rtruediv', 'truediv', 'rdiv', 'div']: - result[mask] = np.nan - return result - - elif is_float_dtype(result): - # if our float result, try to downcast if possible - # if remains float, then mask and return as float - nonans = result[notna(result)] - maybe = maybe_downcast_to_dtype(nonans, self.dtype.numpy_dtype) - if not is_integer_dtype(maybe): - result[mask] = np.nan - return result - return type(self)(result, mask=mask, dtype=self.dtype, copy=False) @classmethod diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 70f64e1aef9cf..b7422ede0faac 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd -from pandas._libs import algos as libalgos, ops as libops +from pandas._libs import lib, algos as libalgos, ops as libops from pandas import compat from pandas.util._decorators import Appender @@ -1062,10 +1062,16 @@ def dispatch_to_extension_op(op, left, right): if is_extension_array_dtype(left): new_left = left.values - if (isinstance(right, np.ndarray) or - (is_extension_array_dtype(right) and - type(left) != type(right))): - new_right = list(right) + if isinstance(right, np.ndarray): + + # 
handle numpy scalars, this is a PITA
+            # TODO(jreback)
+            new_right = lib.item_from_zerodim(right)
+            if is_scalar(new_right):
+                new_right = [new_right]
+            new_right = list(new_right)
+        elif is_extension_array_dtype(right) and type(left) != type(right):
+            new_right = list(right)
         else:
             new_right = right
 
diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py
index 65a40ecfcf352..6a6c1f27a51cd 100644
--- a/pandas/tests/extension/integer/test_integer.py
+++ b/pandas/tests/extension/integer/test_integer.py
@@ -271,6 +271,16 @@ def test_arith_coerce_scalar(self, data, all_arithmetic_operators):
         other = 0.01
         self._check_op(s, op, other)
 
+    @pytest.mark.parametrize("other", [1., 1.0, np.array(1.), np.array([1.])])
+    def test_arithmetic_conversion(self, all_arithmetic_operators, other):
+        # if we have a float operand we should have a float result
+        # even if it is equal to an integer
+        op = self.get_op_from_name(all_arithmetic_operators)
+
+        s = pd.Series([1, 2, 3], dtype='Int64')
+        result = op(s, other)
+        assert result.dtype is np.dtype('float')
+
     def test_error(self, data, all_arithmetic_operators):
 
         # invalid ops

From e6533ddc2cdbfe5b1741c7e5b3efd57605b488fd Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 8 Jul 2018 08:34:44 -0500
Subject: [PATCH 09/20] clean up / test astype

---
 pandas/core/dtypes/common.py                 | 19 +++++
 pandas/core/internals.py                     | 11 +++-
 pandas/core/series.py                        | 14 +----
 .../tests/extension/integer/test_integer.py  | 61 +++++++++++++++++++
 pandas/tests/series/test_constructors.py     |  9 ++-
 5 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index ef4f36dc6df33..f893c8cae0b90 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1714,6 +1714,11 @@ def is_extension_array_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
         arr_or_dtype = arr_or_dtype._values
 
+    try:
+        arr_or_dtype = pandas_dtype(arr_or_dtype)
+    except TypeError:
+        pass
+
     return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))
 
 
@@ -1976,6 +1981,11 @@ def pandas_dtype(dtype):
     Returns
     -------
     np.dtype or a pandas dtype
+
+    Raises
+    ------
+    TypeError if not a dtype
+
     """
 
     # registered extension types
@@ -1987,10 +1997,15 @@ def pandas_dtype(dtype):
     if isinstance(dtype, ExtensionDtype):
         return dtype
 
+    # try a numpy dtype
+    # raise a consistent TypeError if failed
     try:
         npdtype = np.dtype(dtype)
-    except (TypeError, ValueError):
+    except TypeError:
         raise
+    except ValueError:
+        raise TypeError("data type '{}' not understood".format(
+            type(dtype)))
 
     # Any invalid dtype (such as pd.Timestamp) should raise an error.
     # np.dtype(invalid_type).kind = 0 for such objects.
However, this will @@ -2000,6 +2015,6 @@ def pandas_dtype(dtype): if dtype in [object, np.object_, 'object', 'O']: return npdtype elif npdtype.kind == 'O': - raise TypeError('dtype {dtype} not understood'.format(dtype=dtype)) + raise TypeError("dtype '{}' not understood".format(dtype)) return npdtype diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 208d7b8bcf8a7..fb2cb64407302 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -39,7 +39,8 @@ is_re, is_re_compilable, is_scalar, - _get_dtype) + _get_dtype, + pandas_dtype) from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, maybe_upcast, @@ -631,9 +632,13 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self.make_block(Categorical(self.values, dtype=dtype)) + # convert dtypes if needed + try: + dtype = pandas_dtype(dtype) + except TypeError: + pass + # astype processing - if not is_extension_array_dtype(dtype): - dtype = np.dtype(dtype) if is_dtype_equal(self.dtype, dtype): if copy: return self.copy() diff --git a/pandas/core/series.py b/pandas/core/series.py index 83580c86ccbfb..d2823034b4202 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -234,13 +234,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, '`index` argument. `copy` must ' 'be False.') - elif is_extension_array_dtype(data) and dtype is not None: - if not data.dtype.is_dtype(dtype): - raise ValueError("Cannot specify a dtype '{}' with an " - "extension array of a different " - "dtype ('{}').".format(dtype, - data.dtype)) - + elif is_extension_array_dtype(data): + pass elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) @@ -4131,10 +4126,7 @@ def _try_cast(arr, take_fast_path): subarr = data if dtype is not None and not data.dtype.is_dtype(dtype): - msg = ("Cannot coerce extension array to dtype '{typ}'. 
" - "Do the coercion before passing to the constructor " - "instead.".format(typ=dtype)) - raise ValueError(msg) + subarr = data.astype(dtype) if copy: subarr = data.copy() diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 6a6c1f27a51cd..8a220e9019572 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -511,6 +511,67 @@ def test_astype_index(self, all_data, dropna): expected = idx.astype(object).astype(dtype) self.assert_index_equal(result, expected) + def test_astype(self, all_data): + all_data = all_data[:10] + + ints = all_data[~all_data.isna()] + mixed = all_data + dtype = Int8Dtype() + + # coerce to same type - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype) + expected = pd.Series(ints) + self.assert_series_equal(result, expected) + + # coerce to same other - ints + s = pd.Series(ints) + result = s.astype(dtype) + expected = pd.Series(ints, dtype=dtype) + self.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype.numpy_dtype) + expected = pd.Series(ints._data.astype( + all_data.dtype.numpy_dtype)) + tm.assert_series_equal(result, expected) + + # coerce to same type - mixed + s = pd.Series(mixed) + result = s.astype(all_data.dtype) + expected = pd.Series(mixed) + self.assert_series_equal(result, expected) + + # coerce to same other - mixed + s = pd.Series(mixed) + result = s.astype(dtype) + expected = pd.Series(mixed, dtype=dtype) + self.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - mixed + s = pd.Series(mixed) + with pytest.raises(ValueError): + s.astype(all_data.dtype.numpy_dtype) + + # coerce to object + s = pd.Series(mixed) + result = s.astype('object') + expected = pd.Series(np.asarray(mixed)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dtype', [Int8Dtype(), 'Int8']) + def test_astype_specific_casting(self, dtype): + s = pd.Series([1, 2, 3], dtype='Int64') + result = s.astype(dtype) + expected = pd.Series([1, 2, 3], dtype='Int8') + self.assert_series_equal(result, expected) + + s = pd.Series([1, 2, 3, None], dtype='Int64') + result = s.astype(dtype) + expected = pd.Series([1, 2, 3, None], dtype='Int8') + self.assert_series_equal(result, expected) + class TestGroupby(BaseInteger, base.BaseGroupbyTests): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index fe224436c52e6..e95e41bbdeefa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -226,10 +226,13 @@ def test_constructor_categorical(self): res = Series(cat) tm.assert_categorical_equal(res.values, cat) + # can cast to a new dtype + result = Series(pd.Categorical([1, 2, 3]), + dtype='int64') + expected = pd.Series([1, 2, 3], dtype='int64') + tm.assert_series_equal(result, expected) + # GH12574 - pytest.raises( - ValueError, lambda: Series(pd.Categorical([1, 2, 3]), - dtype='int64')) cat = Series(pd.Categorical([1, 2, 3]), dtype='category') assert is_categorical_dtype(cat) assert is_categorical_dtype(cat.dtype) From 35a87387db87ceb6a96621b428b856e2416c6a2d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 8 Jul 2018 10:24:58 -0500 Subject: [PATCH 10/20] fix up dtype comparison tests --- pandas/core/dtypes/common.py | 4 +--- .../tests/extension/decimal/test_decimal.py | 23 +++++++------------ pandas/tests/extension/test_common.py | 15 +++++++++--- 3 
files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f893c8cae0b90..d83d0224ce84b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -2001,9 +2001,7 @@ def pandas_dtype(dtype): # raise a consistent TypeError if failed try: npdtype = np.dtype(dtype) - except TypeError: - raise - except ValueError: + except Exception: raise TypeError("data type '{}' not understood".format( type(dtype))) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b851ad1d06d4d..d40e318b7319c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -178,35 +178,28 @@ def test_series_constructor_coerce_data_to_extension_dtype_raises(): pd.Series([0, 1, 2], dtype=DecimalDtype()) -def test_series_constructor_with_same_dtype_ok(): +def test_series_constructor_with_dtype(): arr = DecimalArray([decimal.Decimal('10.0')]) result = pd.Series(arr, dtype=DecimalDtype()) expected = pd.Series(arr) tm.assert_series_equal(result, expected) - -def test_series_constructor_coerce_extension_array_to_dtype_raises(): - arr = DecimalArray([decimal.Decimal('10.0')]) - xpr = r"Cannot specify a dtype 'int64' .* \('decimal'\)." - - with tm.assert_raises_regex(ValueError, xpr): - pd.Series(arr, dtype='int64') + result = pd.Series(arr, dtype='int64') + expected = pd.Series([10]) + tm.assert_series_equal(result, expected) -def test_dataframe_constructor_with_same_dtype_ok(): +def test_dataframe_constructor_with_dtype(): arr = DecimalArray([decimal.Decimal('10.0')]) result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) expected = pd.DataFrame({"A": arr}) tm.assert_frame_equal(result, expected) - -def test_dataframe_constructor_with_different_dtype_raises(): arr = DecimalArray([decimal.Decimal('10.0')]) - - xpr = "Cannot coerce extension array to dtype 'int64'. 
" - with tm.assert_raises_regex(ValueError, xpr): - pd.DataFrame({"A": arr}, dtype='int64') + result = pd.DataFrame({"A": arr}, dtype='int64') + expected = pd.DataFrame({"A": [10]}) + tm.assert_frame_equal(result, expected) class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 589134632c7e9..d71587136e489 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -22,7 +22,16 @@ def __array__(self, dtype): @property def dtype(self): - return self.data.dtype + return DummyDtype() + + def astype(self, dtype, copy=True): + # we don't support anything but a single dtype + if isinstance(dtype, DummyDtype): + if copy: + return type(self)(self.data) + return self + + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype(object): @@ -61,10 +70,10 @@ def test_astype_no_copy(): arr = DummyArray(np.array([1, 2, 3], dtype=np.int64)) result = arr.astype(arr.dtype, copy=False) - assert arr.data is result + assert arr is result result = arr.astype(arr.dtype) - assert arr.data is not result + assert arr is not result @pytest.mark.parametrize('dtype', [ From 68efb028db23e7fa7953d0d3352a5433765e1658 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 8 Jul 2018 10:36:59 -0500 Subject: [PATCH 11/20] fixup quotes in interval index error messages --- pandas/core/dtypes/common.py | 2 +- pandas/tests/indexes/interval/test_astype.py | 2 +- pandas/tests/indexes/interval/test_construction.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index d83d0224ce84b..244e1526f5d92 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -2003,7 +2003,7 @@ def pandas_dtype(dtype): npdtype = np.dtype(dtype) except Exception: raise TypeError("data type '{}' not understood".format( - type(dtype))) + dtype)) # Any invalid dtype (such as pd.Timestamp) should raise an error. # np.dtype(invalid_type).kind = 0 for such objects. 
However, this will diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index b3a4bfa878c3f..1e96ac730a0eb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -56,7 +56,7 @@ def test_astype_cannot_cast(self, index, dtype): index.astype(dtype) def test_astype_invalid_dtype(self, index): - msg = 'data type "fake_dtype" not understood' + msg = "data type 'fake_dtype' not understood" with tm.assert_raises_regex(TypeError, msg): index.astype('fake_dtype') diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 3745f79d7d65d..1bcfcee3724da 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -138,7 +138,7 @@ def test_generic_errors(self, constructor): constructor(dtype='int64', **filler) # invalid dtype - msg = 'data type "invalid" not understood' + msg = "data type 'invalid' not understood" with tm.assert_raises_regex(TypeError, msg): constructor(dtype='invalid', **filler) From c9e8f7db945955ca0c2793103bdbafec032bca4f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 8 Jul 2018 10:48:46 -0500 Subject: [PATCH 12/20] some optimization on dtype checking --- pandas/core/dtypes/common.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 244e1526f5d92..96cf2b8977b44 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1809,6 +1809,9 @@ def _get_dtype(arr_or_dtype): TypeError : The passed in object is None. """ + # TODO(extension) + # replace with pandas_dtype + if arr_or_dtype is None: raise TypeError("Cannot deduce dtype from null object") if isinstance(arr_or_dtype, np.dtype): @@ -1856,6 +1859,8 @@ def _get_dtype_type(arr_or_dtype): passed in array or dtype object. 
""" + # TODO(extension) + # replace with pandas_dtype if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type elif isinstance(arr_or_dtype, type): @@ -1988,13 +1993,19 @@ def pandas_dtype(dtype): """ + # short-circuit + if isinstance(dtype, np.ndarray): + return dtype.dtype + elif isinstance(dtype, np.dtype): + return dtype + # registered extension types result = registry.find(dtype) if result is not None: return result # un-registered extension types - if isinstance(dtype, ExtensionDtype): + elif isinstance(dtype, ExtensionDtype): return dtype # try a numpy dtype From ec2c63275f7345b24a995abdb4444040d694f2e0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 8 Jul 2018 12:09:26 -0500 Subject: [PATCH 13/20] don't force repr on invalid dtype --- pandas/core/dtypes/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 96cf2b8977b44..d5ba87b9fecc8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -2013,6 +2013,9 @@ def pandas_dtype(dtype): try: npdtype = np.dtype(dtype) except Exception: + # we don't want to force a repr of the non-string + if not isinstance(dtype, string_types): + raise TypeError("data type not understood") raise TypeError("data type '{}' not understood".format( dtype)) From 953de123371db86d1fa758f3eabd562f96c838f0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 9 Jul 2018 18:05:20 -0400 Subject: [PATCH 14/20] remove uneeded try/catch; review comments --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/internals.py | 5 +---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 139124e47ed45..6b213dcb870a9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -96,7 +96,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): scalars : Sequence Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. - dtype : Dtype, optional + dtype : dtype, optional construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. 
copy : boolean, default False diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 51dafabc943e7..a78e78681ddec 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -3,7 +3,7 @@ import copy import numpy as np -from pandas.compat import u +from pandas.compat import u, range from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.util._decorators import cache_readonly from pandas.compat import set_function_name diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fb2cb64407302..df0a0a06adf26 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -633,10 +633,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self.make_block(Categorical(self.values, dtype=dtype)) # convert dtypes if needed - try: - dtype = pandas_dtype(dtype) - except TypeError: - pass + dtype = pandas_dtype(dtype) # astype processing if is_dtype_equal(self.dtype, dtype): From e74d10b7c226be2f856e17231eaea06cac42a553 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 9 Jul 2018 19:55:52 -0400 Subject: [PATCH 15/20] only allow safe casting --- pandas/core/arrays/integer.py | 28 +++++++++++++++++-- .../tests/extension/integer/test_integer.py | 17 +++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index a78e78681ddec..2df0f5f2bc9d7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -92,6 +92,26 @@ def to_integer_array(values, dtype=None): return IntegerArray(values, dtype=dtype, copy=False) +def safe_cast(values, dtype, copy): + """ + Safely cast the values to the dtype if they + are equivalent, meaning floats must be equivalent to the + ints. 
+ + """ + + try: + return values.astype(dtype, casting='safe', copy=copy) + except TypeError: + + casted = values.astype(dtype, copy=copy) + if (casted == values).all(): + return casted + + raise TypeError("cannot safely cast non-equivalent {} to {}".format( + values.dtype, np.dtype(dtype))) + + def coerce_to_array(values, dtype, mask=None, copy=False): """ Coerce the input values array to numpy arrays with a mask @@ -156,14 +176,16 @@ def coerce_to_array(values, dtype, mask=None, copy=False): else: dtype = dtype.type + # if we are float, let's make sure that we can + # safely cast + # we copy as need to coerce here if mask.any(): values = values.copy() values[mask] = 1 - - values = values.astype(dtype) + values = safe_cast(values, dtype, copy=False) else: - values = values.astype(dtype, copy=False) + values = safe_cast(values, dtype, copy=False) return values, mask diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 8a220e9019572..58beb4ae75152 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -572,6 +572,23 @@ def test_astype_specific_casting(self, dtype): expected = pd.Series([1, 2, 3, None], dtype='Int8') self.assert_series_equal(result, expected) + def test_construct_cast_invalid(self, dtype): + + msg = "cannot safely" + arr = [1.2, 2.3, 3.7] + with tm.assert_raises_regex(TypeError, msg): + IntegerArray(arr, dtype=dtype) + + with tm.assert_raises_regex(TypeError, msg): + pd.Series(arr).astype(dtype) + + arr = [1.2, 2.3, 3.7, np.nan] + with tm.assert_raises_regex(TypeError, msg): + IntegerArray(arr, dtype=dtype) + + with tm.assert_raises_regex(TypeError, msg): + pd.Series(arr).astype(dtype) + class TestGroupby(BaseInteger, base.BaseGroupbyTests): From 23afee1fa32b6df43c919d26716d0c796ec7a535 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 Jul 2018 20:23:23 -0400 Subject: [PATCH 16/20] review comments --- doc/source/whatsnew/v0.24.0.txt | 15 +++++------ pandas/core/arrays/base.py | 4 +-- pandas/core/arrays/integer.py | 25 ++++++++++++------- pandas/tests/extension/decimal/array.py | 2 +- .../tests/extension/decimal/test_decimal.py | 1 - .../tests/extension/integer/test_integer.py | 1 - pandas/tests/extension/json/test_json.py | 1 - 7 files changed, 27 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0c9b0ce98ede0..067adc5415e91 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -29,12 +29,13 @@ ways of adding operator support. .. _whatsnew_0240.enhancements.intna: -Integer NA Support -^^^^^^^^^^^^^^^^^^ +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled thru the use of ``ExtensionTypes`` . Here is an example of the usage. +Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +Here is an example of the usage. -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying an list or array using the traditional missing value +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value marker of ``np.nan`` will infer to integer dtype. 
The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`)
 
 .. ipython:: python
@@ -57,12 +58,12 @@ Operations on these dtypes will propagate ``NaN`` as other pandas operations.
     s.iloc[1:3]
 
     # operate with other dtypes
-    s + s.iloc[1:3]
+    s + s.iloc[1:3].astype('Int8')
 
     # coerce when needed
     s + 0.01
 
-These dtypes can operate as part of ``DataFrames``.
+These dtypes can operate as part of a ``DataFrame``.
 
 .. ipython:: python
 
@@ -80,7 +81,7 @@ These dtypes can be merged & reshaped & casted.
 
 .. warning::
 
-   The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This maybe changed at a future date.
+   The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
 
 .. _whatsnew_0240.enhancements.read_html:
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 6b213dcb870a9..01ed085dd2b9f 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -97,10 +97,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             Each element will be an instance of the scalar type for this
             array, ``cls.dtype.type``.
         dtype : dtype, optional
-            construct for this particular dtype. This should be a Dtype
+            Construct for this particular dtype. This should be a Dtype
             compatible with the ExtensionArray.
         copy : boolean, default False
-            if True, copy the underlying data
+            If True, copy the underlying data.
         Returns
         -------
         ExtensionArray
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 2df0f5f2bc9d7..c126117060c3d 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -3,15 +3,18 @@
 import copy
 import numpy as np
 
-from pandas.compat import u, range
-from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
+from pandas._libs.lib import infer_dtype
 from pandas.util._decorators import cache_readonly
+from pandas.compat import u, range
 from pandas.compat import set_function_name
-from pandas.api.types import (is_integer, is_scalar, is_float,
-                              is_float_dtype, is_integer_dtype,
-                              is_object_dtype,
-                              is_list_like, infer_dtype)
+
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
+from pandas.core.dtypes.common import (
+    is_integer, is_scalar, is_float,
+    is_float_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+    is_list_like)
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.dtypes import registry
@@ -75,6 +78,8 @@ def construct_from_string(cls, string):
 
 def to_integer_array(values, dtype=None):
     """
+    Infer and return an integer array of the values.
+
     Parameters
     ----------
     values : 1D list-like
@@ -83,7 +88,7 @@ def to_integer_array(values, dtype=None):
 
     Returns
     -------
-    infer and return an integer array
+    IntegerArray
 
     Raises
    ------
@@ -235,7 +240,9 @@ def __getitem__(self, item):
                           dtype=self.dtype)
 
     def _coerce_to_ndarray(self):
-        """ coerce to an ndarary, preserving my scalar types """
+        """
+        coerce to an ndarray of object dtype
+        """
 
         # TODO(jreback) make this better
         data = self._data.astype(object)
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 373da1af9ef24..108b8874b3ac5 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -54,7 +54,7 @@ def __init__(self, values, dtype=None, copy=False):
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
-        return cls(scalars, copy=copy)
+        return cls(scalars)
 
     @classmethod
     def _from_factorized(cls, values, original):
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index d40e318b7319c..6e614274457fb 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -11,7 +11,6 @@
 from .array import DecimalDtype, DecimalArray
 
 
-@pytest.fixture
 def make_data():
     return [decimal.Decimal(random.random()) for _ in range(100)]
 
diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py
index 58beb4ae75152..77320a2de62c4 100644
--- a/pandas/tests/extension/integer/test_integer.py
+++ b/pandas/tests/extension/integer/test_integer.py
@@ -15,7 +15,6 @@
     UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype)
 
 
-@pytest.fixture
 def make_data():
     return (list(range(8)) +
             [np.nan] +
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 126607907a8da..520c303f1990b 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -15,7 +15,6 @@
 pytestmark = pytest.mark.skipif(PY2, reason="Py2 doesn't have a UserDict")
 
 
-@pytest.fixture
 def make_data():
     # TODO: Use a regular dict.
See _NDFrameIndexer._setitem_with_indexer return [collections.UserDict([ From 86362f6ae34bd764abbbda5a9903d2eda1e32544 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 11 Jul 2018 06:12:37 -0400 Subject: [PATCH 17/20] xfail reduce ops --- pandas/tests/extension/base/ops.py | 13 ++++++++++--- pandas/tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/integer/test_integer.py | 12 ++++++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index f8d2f8314d2b1..16b37564643bf 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -52,13 +52,20 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): class BaseArithmeticOpsTests(BaseOpsUtil): """Various Series and DataFrame arithmetic ops methods.""" - def test_arith_scalar(self, data, all_arithmetic_operators): - # scalar + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # series & scalar op_name = all_arithmetic_operators s = pd.Series(data) self.check_opname(s, op_name, s.iloc[0], exc=TypeError) - def test_arith_array(self, data, all_arithmetic_operators): + @pytest.mark.xfail(reason="_reduce needs implementation") + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op_name = all_arithmetic_operators + df = pd.DataFrame({'A': data}) + self.check_opname(df, op_name, data[0], exc=TypeError) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 6e614274457fb..20ff94fe33d0a 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -207,7 +207,7 @@ def check_opname(self, s, op_name, other, exc=None): super(TestArithmeticOps, self).check_opname(s, op_name, other, exc=None) - def test_arith_array(self, data, all_arithmetic_operators): + def test_arith_series_with_array(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 77320a2de62c4..451f7488bd38a 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -247,14 +247,22 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): self._check_op(s, op, rhs) - def test_arith_scalar(self, data, all_arithmetic_operators): + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) - def test_arith_array(self, data, all_arithmetic_operators): + @pytest.mark.xfail(run=False, reason="_reduce needs implementation") + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op = all_arithmetic_operators + + df = pd.DataFrame({'A': data}) + self._check_op(df, op, 1, exc=TypeError) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators From 1bdeb187dded618f177e737f8822ddc022b19466 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 11 Jul 2018 06:38:12 -0400 Subject: [PATCH 18/20] better type checking for extension types --- pandas/core/dtypes/common.py | 1 - 
pandas/core/ops.py | 2 +- pandas/tests/extension/base/dtype.py | 24 +++++++++++++++++++ pandas/tests/extension/base/ops.py | 2 +- .../extension/category/test_categorical.py | 5 ++-- 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index d5ba87b9fecc8..81ed515d96c95 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1992,7 +1992,6 @@ def pandas_dtype(dtype): TypeError if not a dtype """ - # short-circuit if isinstance(dtype, np.ndarray): return dtype.dtype diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b7422ede0faac..ec834f84315ff 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1356,7 +1356,7 @@ def wrapper(self, other, axis=None): elif (is_extension_array_dtype(self) or (is_extension_array_dtype(other) and - not is_categorical_dtype(other))): + not is_scalar(other))): return dispatch_to_extension_op(op, self, other) elif isinstance(other, ABCSeries): diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 52a12816c8722..7e582dc58194b 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -1,6 +1,7 @@ import pytest import numpy as np import pandas as pd +from pandas.util import testing as tm from .base import BaseExtensionTests @@ -54,3 +55,26 @@ def test_array_type(self, data, dtype): def test_array_type_with_arg(self, data, dtype): with pytest.raises(NotImplementedError): dtype.construct_array_type('foo') + + def test_check_dtype(self, data, dtype): + # check equivalency for using .dtypes + df = pd.DataFrame({'A': pd.Series(data, dtype=dtype), + 'B': data, + 'C': 'foo', 'D': 1}) + + # np.dtype('int64') == 'Int64' == 'int64' + # so can't distinguish + if dtype.name == 'Int64': + expected = pd.Series([True, True, False, True], + index=list('ABCD')) + else: + expected = pd.Series([True, True, False, False], + index=list('ABCD')) + + result = df.dtypes == dtype.name + tm.assert_series_equal(result, expected) + + expected = pd.Series([True, True, False, False], + index=list('ABCD')) + result = df.dtypes.apply(str) == dtype.name + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 16b37564643bf..f7bfdb8ec218a 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -58,7 +58,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): s = pd.Series(data) self.check_opname(s, op_name, s.iloc[0], exc=TypeError) - @pytest.mark.xfail(reason="_reduce needs implementation") + @pytest.mark.xfail(run=False, reason="_reduce needs implementation") def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op_name = all_arithmetic_operators diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 715e8bd40a2d0..76f6b03907ef8 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -189,11 +189,12 @@ class TestCasting(base.BaseCastingTests): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def test_arith_scalar(self, data, all_arithmetic_operators): + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators if op_name != '__rmod__': - super(TestArithmeticOps, self).test_arith_scalar(data, op_name) + super(TestArithmeticOps, 
self).test_arith_series_with_scalar( + data, op_name) else: pytest.skip('rmod never called when string is first argument') From 88858358898c3e013a4b5b03717cc25b6a896617 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 12 Jul 2018 06:01:53 -0400 Subject: [PATCH 19/20] use a better testing idiom --- pandas/tests/extension/base/dtype.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 7e582dc58194b..269017b961b35 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -1,7 +1,6 @@ import pytest import numpy as np import pandas as pd -from pandas.util import testing as tm from .base import BaseExtensionTests @@ -72,9 +71,9 @@ def test_check_dtype(self, data, dtype): index=list('ABCD')) result = df.dtypes == dtype.name - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) expected = pd.Series([True, True, False, False], index=list('ABCD')) result = df.dtypes.apply(str) == dtype.name - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) From 160678647fb0a01b57261e34da853a882d2e55dd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 16 Jul 2018 06:39:41 -0400 Subject: [PATCH 20/20] interval index compat --- pandas/core/arrays/interval.py | 4 ++-- pandas/tests/extension/base/dtype.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4ad53e16bc439..e78d6d4948023 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -191,8 +191,8 @@ def _simple_new(cls, left, right, closed=None, return result @classmethod - def _from_sequence(cls, scalars): - return cls(scalars) + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, dtype=dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 269017b961b35..2125458e8a0ba 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -55,7 +55,9 @@ def test_array_type_with_arg(self, data, dtype): with pytest.raises(NotImplementedError): dtype.construct_array_type('foo') - def test_check_dtype(self, data, dtype): + def test_check_dtype(self, data): + dtype = data.dtype + # check equivalency for using .dtypes df = pd.DataFrame({'A': pd.Series(data, dtype=dtype), 'B': data, @@ -70,10 +72,10 @@ def test_check_dtype(self, data, dtype): expected = pd.Series([True, True, False, False], index=list('ABCD')) - result = df.dtypes == dtype.name + result = df.dtypes == str(dtype) self.assert_series_equal(result, expected) expected = pd.Series([True, True, False, False], index=list('ABCD')) - result = df.dtypes.apply(str) == dtype.name + result = df.dtypes.apply(str) == str(dtype) self.assert_series_equal(result, expected)
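
For reference, the behaviour introduced by this patch series can be exercised as below. This is a minimal sketch assuming a pandas build with these patches applied; the ``Int64``/``Int8`` dtype strings, the NaN propagation, and the safe-casting error are taken from the whatsnew entry and the tests above, and everything else is illustrative only::

    import numpy as np
    import pandas as pd

    # construct a Series backed by the new masked IntegerArray;
    # np.nan marks the missing values
    s = pd.Series([1, 2, np.nan], dtype='Int64')

    # arithmetic keeps the extension dtype and propagates the mask (shown as NaN)
    s + 1

    # comparisons fill masked positions with False (True only for !=)
    s == 1

    # casting between integer widths goes through the safe-cast path of PATCH 15
    s.astype('Int8')

    # a lossy float -> integer cast raises instead of silently truncating
    try:
        pd.Series([1.2, 2.3, 3.7]).astype('Int64')
    except TypeError as err:
        print(err)  # "cannot safely cast non-equivalent ..."

    # a float operand (or true division) produces a plain float64 result
    (s + 0.01).dtype

The capitalized dtype strings (``Int64`` as opposed to ``int64``) are the names registered by the ``_IntegerDtype`` subclasses generated and registered at the bottom of ``pandas/core/arrays/integer.py``.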