From 4ffb277dffd0914cce90d8227194665bf448ccde Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Thu, 8 Apr 2021 14:30:44 -0700 Subject: [PATCH] various bugfix in test and data type inference code * remove expensive convert_dtype in float inference * bring back cardinality calculation on all float columns * nominal detection applies even for floats < 20 row now --- lux/executor/PandasExecutor.py | 16 ++++------------ tests/test_interestingness.py | 21 ++++++++++++--------- tests/test_pandas_coverage.py | 6 +++--- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 35115781..911bd932 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -422,13 +422,8 @@ def compute_data_type(self, ldf: LuxDataFrame): elif self._is_geographical_attribute(ldf[attr]): ldf._data_type[attr] = "geographical" elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): - # int columns gets coerced into floats if contain NaN - convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) - if ( - convertible2int - and ldf.cardinality[attr] != len(ldf) - and (len(ldf[attr].convert_dtypes().unique() < 20)) - ): + + if ldf.cardinality[attr] != len(ldf) and (ldf.cardinality[attr] < 20): ldf._data_type[attr] = "nominal" else: ldf._data_type[attr] = "quantitative" @@ -525,11 +520,8 @@ def compute_stats(self, ldf: LuxDataFrame): else: attribute_repr = attribute - if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any(): - ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) - ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute]) - else: - ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute + ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) + ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr]) if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or 
pd.api.types.is_integer_dtype( ldf.dtypes[attribute] diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py index 1dc4f535..703fa44d 100644 --- a/tests/test_interestingness.py +++ b/tests/test_interestingness.py @@ -273,25 +273,28 @@ def test_interestingness_deviation_nan(): import numpy as np dataset = [ - {"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0}, - {"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2}, - {"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3}, - {"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4}, - {"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5}, - {"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1}, + {"date": "2017-08-25", "category": "A", "value": 25.0}, + {"date": "2017-08-25", "category": "B", "value": 1.2}, + {"date": "2017-08-25", "category": "C", "value": 1.3}, + {"date": "2017-08-25", "category": "D", "value": 1.4}, + {"date": "2017-08-25", "category": "E", "value": 1.5}, + {"date": "2017-08-25", "category": "F", "value": 0.1}, {"date": np.nan, "category": "C", "value": 0.2}, {"date": np.nan, "category": "B", "value": 0.2}, {"date": np.nan, "category": "F", "value": 0.3}, {"date": np.nan, "category": "E", "value": 0.3}, {"date": np.nan, "category": "D", "value": 0.4}, {"date": np.nan, "category": "A", "value": 10.4}, - {"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5}, - {"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0}, - {"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1}, + {"date": "2017-07-25", "category": "A", "value": 15.5}, + {"date": "2017-07-25", "category": "F", "value": 1.0}, + {"date": "2017-07-25", "category": "B", "value": 0.1}, ] test = pd.DataFrame(dataset) from lux.vis.Vis import Vis + test["date"] = pd.to_datetime(test["date"], format="%Y-%m-%d") + test.set_data_type({"value": "quantitative"}) + vis = Vis(["date", "value", 
"category=A"], test) vis2 = Vis(["date", "value", "category=B"], test) from lux.interestingness.interestingness import interestingness diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index c1039591..fc8451c0 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -257,7 +257,7 @@ def test_transform(global_var): df["Year"] = pd.to_datetime(df["Year"], format="%Y") new_df = df.iloc[:, 1:].groupby("Origin").transform(sum) new_df._ipython_display_() - assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"] + assert list(new_df.recommendation.keys()) == ["Occurrence"] assert len(new_df.cardinality) == 7 @@ -409,7 +409,7 @@ def test_loc(global_var): assert len(new_df.cardinality) == 6 new_df = df.loc[0:10, "Displacement":"Horsepower"] new_df._ipython_display_() - assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"] + assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"] assert len(new_df.cardinality) == 2 import numpy as np @@ -438,7 +438,7 @@ def test_iloc(global_var): assert len(new_df.cardinality) == 6 new_df = df.iloc[0:11, 3:5] new_df._ipython_display_() - assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"] + assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"] assert len(new_df.cardinality) == 2 import numpy as np