-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: add in extension dtype registry #21185
Changes from all commits
4b132cf
90fccb0
80e15fd
1c3d023
4b05a1c
9cfbc07
72e1010
59b3510
07d1038
7dbd2f3
7a2cb6a
96d5d09
5119672
e22d4c7
7cc1a0a
930cec5
bd5dcd3
407d7b3
9084906
f560ea1
5fabd51
d2c91d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,6 +109,11 @@ class ExtensionDtype(_DtypeOpsMixin): | |
* name | ||
* construct_from_string | ||
|
||
Optionally one can override construct_array_type for construction | ||
with the name of this dtype via the Registry | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would give some additional explanation of what the Registry is, because it is currently not explained here. |
||
|
||
* construct_array_type | ||
|
||
The `na_value` class attribute can be used to set the default NA value | ||
for this type. :attr:`numpy.nan` is used by default. | ||
|
||
|
@@ -156,6 +161,16 @@ def name(self): | |
""" | ||
raise AbstractMethodError(self) | ||
|
||
@classmethod | ||
def construct_array_type(cls): | ||
"""Return the array type associated with this dtype | ||
|
||
Returns | ||
------- | ||
type | ||
""" | ||
raise NotImplementedError | ||
|
||
@classmethod | ||
def construct_from_string(cls, string): | ||
"""Attempt to construct this type from a string. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -648,6 +648,11 @@ def conv(r, dtype): | |
def astype_nansafe(arr, dtype, copy=True): | ||
""" return a view if copy is False, but | ||
need to be very careful as the result shape could change! """ | ||
|
||
# dispatch on extension dtype if needed | ||
if is_extension_array_dtype(dtype): | ||
return dtype.array_type._from_sequence(arr, copy=copy) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. should there be any validation about […]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this validation is a contract of the Array itself |
||
|
||
if not isinstance(dtype, np.dtype): | ||
dtype = pandas_dtype(dtype) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,65 @@ | |
from .base import ExtensionDtype, _DtypeOpsMixin | ||
|
||
|
||
class Registry(object): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Without looking at the uses yet, could we simplify this by just allowing string lookup? Ideally, […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the current code also supports finding the dtype for e.g. […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, and I suppose we want that to support […] |
||
""" | ||
Registry for dtype inference | ||
|
||
The registry allows one to map a string repr of an extension | ||
dtype to an extension dtype. | ||
|
||
Multiple extension types can be registered. | ||
These are tried in order. | ||
|
||
Examples | ||
-------- | ||
registry.register(MyExtensionDtype) | ||
""" | ||
dtypes = [] | ||
|
||
@classmethod | ||
def register(self, dtype): | ||
""" | ||
Parameters | ||
---------- | ||
dtype : ExtensionDtype | ||
""" | ||
if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)): | ||
raise ValueError("can only register pandas extension dtypes") | ||
|
||
self.dtypes.append(dtype) | ||
|
||
def find(self, dtype): | ||
""" | ||
Parameters | ||
---------- | ||
dtype : PandasExtensionDtype or string | ||
|
||
Returns | ||
------- | ||
return the first matching dtype, otherwise return None | ||
""" | ||
if not isinstance(dtype, compat.string_types): | ||
dtype_type = dtype | ||
if not isinstance(dtype, type): | ||
dtype_type = type(dtype) | ||
if issubclass(dtype_type, (PandasExtensionDtype, ExtensionDtype)): | ||
return dtype | ||
|
||
return None | ||
|
||
for dtype_type in self.dtypes: | ||
try: | ||
return dtype_type.construct_from_string(dtype) | ||
except TypeError: | ||
pass | ||
|
||
return None | ||
|
||
|
||
registry = Registry() | ||
|
||
|
||
class PandasExtensionDtype(_DtypeOpsMixin): | ||
""" | ||
A np.dtype duck-typed class, suitable for holding a custom dtype. | ||
|
@@ -265,6 +324,17 @@ def _hash_categories(categories, ordered=True): | |
else: | ||
return np.bitwise_xor.reduce(hashed) | ||
|
||
@classmethod | ||
def construct_array_type(cls): | ||
"""Return the array type associated with this dtype | ||
|
||
Returns | ||
------- | ||
type | ||
""" | ||
from pandas import Categorical | ||
return Categorical | ||
|
||
@classmethod | ||
def construct_from_string(cls, string): | ||
""" attempt to construct this type from a string, raise a TypeError if | ||
|
@@ -556,11 +626,16 @@ def _parse_dtype_strict(cls, freq): | |
@classmethod | ||
def construct_from_string(cls, string): | ||
""" | ||
attempt to construct this type from a string, raise a TypeError | ||
if its not possible | ||
Strict construction from a string, raise a TypeError if not | ||
possible | ||
""" | ||
from pandas.tseries.offsets import DateOffset | ||
if isinstance(string, (compat.string_types, DateOffset)): | ||
|
||
if (isinstance(string, compat.string_types) and | ||
(string.startswith('period[') or | ||
string.startswith('Period[')) or | ||
isinstance(string, DateOffset)): | ||
# do not parse string like U as period[U] | ||
# avoid a tuple being regarded as a freq | ||
try: | ||
return cls(freq=string) | ||
|
@@ -660,7 +735,7 @@ def __new__(cls, subtype=None): | |
try: | ||
subtype = pandas_dtype(subtype) | ||
except TypeError: | ||
raise ValueError("could not construct IntervalDtype") | ||
raise TypeError("could not construct IntervalDtype") | ||
|
||
if is_categorical_dtype(subtype) or is_string_dtype(subtype): | ||
# GH 19016 | ||
|
@@ -682,8 +757,11 @@ def construct_from_string(cls, string): | |
attempt to construct this type from a string, raise a TypeError | ||
if it's not possible | ||
""" | ||
if isinstance(string, compat.string_types): | ||
if (isinstance(string, compat.string_types) and | ||
(string.startswith('interval') or | ||
string.startswith('Interval'))): | ||
return cls(string) | ||
|
||
msg = "a string needs to be passed, got type {typ}" | ||
raise TypeError(msg.format(typ=type(string))) | ||
|
||
|
@@ -727,3 +805,10 @@ def is_dtype(cls, dtype): | |
else: | ||
return False | ||
return super(IntervalDtype, cls).is_dtype(dtype) | ||
|
||
|
||
# register the dtypes in search order | ||
registry.register(DatetimeTZDtype) | ||
registry.register(PeriodDtype) | ||
registry.register(IntervalDtype) | ||
registry.register(CategoricalDtype) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the leading space. Add a trailing `.`