Adding support for all dtypes, adds more tests and fixes small bugs (#102)

* update export tutorial to add explanation for standalone argument

* minor fixes and remove cell output in notebooks

* added contributing doc

* fix bugs and uncomment some tests

* remove raise warning

* remove unnecessary import

* split up rename test into two parts

* fix setting warning, fix data_type bugs and add relevant tests

* remove ordinal data type

* add test for small dataframe resetting index

* add loc and iloc tests

* fix attribute access directly to dataframe

* add small changes to code

* added test for qcut and cut

* add check if dtype is Interval

* added qcut test

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
westernguy2 and dorisjlee authored Oct 2, 2020
1 parent e7df939 commit f1cf523
Showing 8 changed files with 91 additions and 17 deletions.
2 changes: 1 addition & 1 deletion lux/action/filter.py
@@ -43,7 +43,7 @@ def filter(ldf):
#get unique values for all categorical values specified and creates corresponding filters
fltr = filters[0]

- if (ldf.data_type_lookup[fltr.attribute]=="ordinal" or ldf.data_type_lookup[fltr.attribute]=="nominal"):
+ if (ldf.data_type_lookup[fltr.attribute]=="nominal"):
recommendation = {"action":"Filter",
"description":f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value."}
unique_values = ldf.unique_values[fltr.attribute]
9 changes: 7 additions & 2 deletions lux/core/frame.py
@@ -103,9 +103,10 @@ def expire_metadata(self):
## Override Pandas ##
#####################
def __getattr__(self, name):
- super(LuxDataFrame, self).__getattr__(name)
+ ret_value = super(LuxDataFrame, self).__getattr__(name)
self.expire_metadata()
self.expire_recs()
+ return ret_value
def _set_axis(self, axis, labels):
super(LuxDataFrame, self)._set_axis(axis, labels)
self.expire_metadata()
@@ -114,6 +115,10 @@ def _update_inplace(self,*args,**kwargs):
super(LuxDataFrame, self)._update_inplace(*args,**kwargs)
self.expire_metadata()
self.expire_recs()
+ def _set_item(self, key, value):
+ super(LuxDataFrame, self)._set_item(key, value)
+ self.expire_metadata()
+ self.expire_recs()
@property
def default_display(self):
if (self._default_pandas_display):
@@ -353,7 +358,7 @@ def compute_SQL_data_type(self):
datatype = list(pd.read_sql(datatype_query, self.SQLconnection)['data_type'])[0]
sql_dtypes[attr] = datatype

- data_type = {"quantitative":[], "ordinal":[], "nominal":[], "temporal":[]}
+ data_type = {"quantitative":[], "nominal":[], "temporal":[]}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
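
A note on the __getattr__ change above, which is the subtle fix in this commit: the old override delegated to pandas but discarded the result, so attribute-style column access on a LuxDataFrame returned None. A toy sketch (illustrative only, not Lux's actual class) of why capturing and returning the value matters:

import pandas as pd

class TrackedFrame(pd.DataFrame):
    # _metadata registers the custom flag so pandas' __setattr__ accepts it
    _metadata = ["_stale"]

    def __getattr__(self, name):
        ret_value = super().__getattr__(name)  # pandas resolves df.<column> here
        self._stale = True                     # expire cached state on every access
        return ret_value                       # without this line, df.Horsepower is None

df = TrackedFrame({"Horsepower": [130, 165, 150]})
print(df.Horsepower.mean())  # 148.33... -- the Series passes through intact
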
2 changes: 1 addition & 1 deletion lux/executor/Executor.py
@@ -52,7 +52,7 @@ def compute_data_model(self):

def mapping(self, rmap):
group_map = {}
- for val in ["quantitative", "id", "ordinal", "nominal", "temporal"]:
+ for val in ["quantitative", "id", "nominal", "temporal"]:
group_map[val] = list(filter(lambda x: rmap[x] == val, rmap))
return group_map

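
For context, mapping inverts the per-attribute type lookup into per-type attribute lists; the change simply drops "ordinal" from the recognized types. A small illustration with made-up column names:

# rmap maps each attribute to its inferred data type (toy values)
rmap = {"Horsepower": "quantitative", "Origin": "nominal", "Year": "temporal"}
group_map = {}
for val in ["quantitative", "id", "nominal", "temporal"]:
    # list(...) evaluates eagerly, so each iteration binds val correctly
    group_map[val] = list(filter(lambda x: rmap[x] == val, rmap))
print(group_map)
# {'quantitative': ['Horsepower'], 'id': [], 'nominal': ['Origin'], 'temporal': ['Year']}
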
4 changes: 3 additions & 1 deletion lux/executor/PandasExecutor.py
@@ -316,6 +316,8 @@ def compute_data_type(self, ldf:LuxDataFrame):
ldf.data_type_lookup[attr] = "nominal"
elif is_datetime_series(ldf.dtypes[attr]): #check if attribute is any type of datetime dtype
ldf.data_type_lookup[attr] = "temporal"
+ else:
+ ldf.data_type_lookup[attr] = "nominal"
# for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
# if self.cardinality[attr]>50:
if (ldf.index.dtype !='int64' and ldf.index.name):
@@ -346,7 +348,7 @@ def compute_data_type(self, ldf:LuxDataFrame):
def compute_data_model(self, ldf:LuxDataFrame):
ldf.data_model = {
"measure": ldf.data_type["quantitative"],
"dimension": ldf.data_type["ordinal"] + ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"]
"dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"]
}
ldf.data_model_lookup = self.reverseMapping(ldf.data_model)

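
The new else branch guarantees that every column receives a data type even when none of the explicit checks match. One dtype that lands there is pandas' interval dtype, per the "add check if dtype is Interval" item in the commit message; a hedged sketch of that case:

import pandas as pd

# An interval column is neither numeric, plain object, nor datetime, so the
# earlier branches all miss it; the new fallback classifies it as "nominal".
s = pd.Series(pd.interval_range(start=0, periods=3))
print(s.dtype)                                        # interval[int64] (exact repr varies by pandas version)
print(pd.api.types.is_numeric_dtype(s.dtype))         # False
print(pd.api.types.is_datetime64_any_dtype(s.dtype))  # False -> falls through to "nominal"
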
2 changes: 1 addition & 1 deletion lux/vis/Clause.py
@@ -41,7 +41,7 @@ def __init__(self, description:typing.Union[str,list] ="",attribute: typing.Unio
Possible values: 'x','y','color', by default ""
data_type : str, optional
Data type for the specified attribute.
- Possible values: 'nominal', 'quantitative', 'ordinal','temporal', by default ""
+ Possible values: 'nominal', 'quantitative','temporal', by default ""
data_model : str, optional
Data model for the specified attribute
Possible values: 'dimension', 'measure', by default ""
3 changes: 2 additions & 1 deletion lux/vislib/altair/AltairRenderer.py
@@ -50,7 +50,8 @@ def create_vis(self,vis, standalone=True):
if pd.api.types.is_period_dtype(vis.data.dtypes[attr]) or isinstance(vis.data[attr].iloc[0], pd.Period):
dateColumn = vis.data[attr]
vis.data[attr] = pd.PeriodIndex(dateColumn.values).to_timestamp()

+ if pd.api.types.is_interval_dtype(vis.data.dtypes[attr]) or isinstance(vis.data[attr].iloc[0], pd.Interval):
+ vis.data[attr] = vis.data[attr].astype(str)
if (vis.mark =="histogram"):
chart = Histogram(vis)
elif (vis.mark =="bar"):
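
Both halves of the new condition matter: pd.cut and pd.qcut produce a categorical column whose values are pd.Interval objects, so the dtype test alone can miss them, while the value-level isinstance check still fires. Interval objects are not JSON-serializable for Altair, hence the cast to str. A minimal sketch on toy data:

import pandas as pd

s = pd.Series(pd.qcut(list(range(10)), q=2))    # categorical column of Intervals
print(pd.api.types.is_interval_dtype(s.dtype))  # False: the dtype is category
print(isinstance(s.iloc[0], pd.Interval))       # True: the isinstance check catches it
print(s.astype(str).iloc[0])                    # '(-0.001, 4.5]' -- now serializable
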
2 changes: 1 addition & 1 deletion tests/test_action.py
@@ -88,7 +88,7 @@ def test_crosstab():
'Result':['Pass','Pass','Fail','Pass','Fail','Pass','Pass','Fail','Fail','Pass','Pass','Fail']}

df = pd.DataFrame(d,columns=['Name','Exam','Subject','Result'])
- result = pd.crosstab([df["Exam"]],df["Result"])
+ result = pd.crosstab([df.Exam],df.Result)
result._repr_html_()
assert list(result.recommendation.keys() ) == ['Row Groups','Column Groups']

84 changes: 75 additions & 9 deletions tests/test_pandas_coverage.py
@@ -120,6 +120,26 @@ def test_groupby_agg():
assert list(new_df.recommendation.keys() ) == ['Column Groups']
assert len(new_df.cardinality) == 7

+ def test_qcut():
+ df = pd.read_csv("lux/data/car.csv")
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ df["Weight"] = pd.qcut(df["Weight"], q = 3)
+ df._repr_html_()
+
+ def test_cut():
+ df = pd.read_csv("lux/data/car.csv")
+ df["Weight"] = pd.cut(df["Weight"], bins = [0, 2500, 7500, 10000], labels = ["small", "medium", "large"])
+ df._repr_html_()
+ # def test_groupby_agg_very_small():
+
+ # url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ # df = pd.read_csv(url)
+ # df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ # new_df = df.groupby("Origin").agg(sum).reset_index()
+ # new_df._repr_html_()
+ # assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ # assert len(new_df.cardinality) == 7
+
# def test_groupby_multi_index():
# url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
# df = pd.read_csv(url)
@@ -214,15 +234,15 @@ def test_drop():
assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
assert len(new_df2.cardinality) == 6

- # def test_merge():
- # url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
- # df = pd.read_csv(url)
- # df["Year"] = pd.to_datetime(df["Year"], format='%Y')
- # new_df = df.drop([0, 1, 2], axis = "rows")
- # new_df2 = pd.merge(df, new_df, how = "left", indicator = True)
- # new_df2._repr_html_()
- # assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal'] # TODO once bug is fixed
- # assert len(new_df2.cardinality) == 7 # TODO once bug is fixed
+ def test_merge():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.drop([0, 1, 2], axis = "rows")
+ new_df2 = pd.merge(df, new_df, how = "left", indicator = True)
+ new_df2._repr_html_()
+ assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal'] # TODO once bug is fixed
+ assert len(new_df2.cardinality) == 11 # TODO once bug is fixed

def test_prefix():
df = pd.read_csv("lux/data/car.csv")
@@ -233,6 +253,52 @@ def test_prefix():
assert len(new_df.cardinality) == 9
assert new_df.cardinality["1_Name"] == 300

+ def test_loc():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.loc[:,"Displacement":"Origin"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
+ assert len(new_df.cardinality) == 6
+ new_df = df.loc[0:10,"Displacement":"Origin"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 6
+ new_df = df.loc[0:10,"Displacement":"Horsepower"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 2
+ import numpy as np
+ inter_df = df.groupby("Brand")[["Acceleration", "Weight", "Horsepower"]].agg(np.mean)
+ new_df = inter_df.loc["chevrolet":"fiat", "Acceleration":"Weight"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ assert len(new_df.cardinality) == 3
+
+ def test_iloc():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.iloc[:,3:9]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
+ assert len(new_df.cardinality) == 6
+ new_df = df.iloc[0:11,3:9]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 6
+ new_df = df.iloc[0:11,3:5]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 2
+ import numpy as np
+ inter_df = df.groupby("Brand")[["Acceleration", "Weight", "Horsepower"]].agg(np.mean)
+ new_df = inter_df.iloc[5:10, 0:2]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ assert len(new_df.cardinality) == 3
+
def check_metadata_equal(df1, df2):
# Checks to make sure metadata for df1 and df2 are equal.
for attr in df1._metadata:
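
The new tests all follow the same pattern: slice or transform the frame, call _repr_html_() (the hook Jupyter invokes to display a frame, which makes Lux recompute metadata and recommendations), then assert on the refreshed state. A hedged sketch of that flow, mirroring test_iloc above:

import pandas as pd
import lux  # noqa: F401 -- importing lux attaches its hooks to pd.DataFrame

df = pd.read_csv("lux/data/car.csv")
new_df = df.iloc[:, 3:9]                   # slicing should expire stale metadata
new_df._repr_html_()                       # forces recommendation recomputation
print(list(new_df.recommendation.keys()))  # ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
print(len(new_df.cardinality))             # 6
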
