Adding support for all dtypes, adds more tests and fixes small bugs (#102)

* update export tutorial to add explanation for standalone argument

* minor fixes and remove cell output in notebooks

* added contributing doc

* fix bugs and uncomment some tests

* remove raise warning

* remove unnecessary import

* split up rename test into two parts

* fix setting warning, fix data_type bugs and add relevant tests

* remove ordinal data type

* add test for small dataframe resetting index

* add loc and iloc tests

* fix attribute access directly to dataframe

* add small changes to code

* added test for qcut and cut

* add check if dtype is Interval

* added qcut test

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
westernguy2 and dorisjlee authored Oct 2, 2020
1 parent e7df939 commit f1cf523
Showing 8 changed files with 91 additions and 17 deletions.
2 changes: 1 addition & 1 deletion lux/action/filter.py
@@ -43,7 +43,7 @@ def filter(ldf):
#get unique values for all categorical values specified and creates corresponding filters
fltr = filters[0]

- if (ldf.data_type_lookup[fltr.attribute]=="ordinal" or ldf.data_type_lookup[fltr.attribute]=="nominal"):
+ if (ldf.data_type_lookup[fltr.attribute]=="nominal"):
recommendation = {"action":"Filter",
"description":f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value."}
unique_values = ldf.unique_values[fltr.attribute]
9 changes: 7 additions & 2 deletions lux/core/frame.py
@@ -103,9 +103,10 @@ def expire_metadata(self):
## Override Pandas ##
#####################
def __getattr__(self, name):
- super(LuxDataFrame, self).__getattr__(name)
+ ret_value = super(LuxDataFrame, self).__getattr__(name)
self.expire_metadata()
self.expire_recs()
+ return ret_value
def _set_axis(self, axis, labels):
super(LuxDataFrame, self)._set_axis(axis, labels)
self.expire_metadata()
@@ -114,6 +115,10 @@ def _update_inplace(self,*args,**kwargs):
super(LuxDataFrame, self)._update_inplace(*args,**kwargs)
self.expire_metadata()
self.expire_recs()
+ def _set_item(self, key, value):
+ super(LuxDataFrame, self)._set_item(key, value)
+ self.expire_metadata()
+ self.expire_recs()
@property
def default_display(self):
if (self._default_pandas_display):
@@ -353,7 +358,7 @@ def compute_SQL_data_type(self):
datatype = list(pd.read_sql(datatype_query, self.SQLconnection)['data_type'])[0]
sql_dtypes[attr] = datatype

- data_type = {"quantitative":[], "ordinal":[], "nominal":[], "temporal":[]}
+ data_type = {"quantitative":[], "nominal":[], "temporal":[]}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
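
A note on the __getattr__ change above, which is the subtle fix in this commit: the old override delegated to pandas but discarded the result, so attribute-style column access on a LuxDataFrame returned None. A toy sketch (illustrative only, not Lux's actual class) of why capturing and returning the value matters:

import pandas as pd

class TrackedFrame(pd.DataFrame):
    # _metadata registers the custom flag so pandas' __setattr__ accepts it
    _metadata = ["_stale"]

    def __getattr__(self, name):
        ret_value = super().__getattr__(name)  # pandas resolves df.<column> here
        self._stale = True                     # expire cached state on every access
        return ret_value                       # without this line, df.Horsepower is None

df = TrackedFrame({"Horsepower": [130, 165, 150]})
print(df.Horsepower.mean())  # 148.33... -- the Series passes through intact
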
2 changes: 1 addition & 1 deletion lux/executor/Executor.py
@@ -52,7 +52,7 @@ def compute_data_model(self):

def mapping(self, rmap):
group_map = {}
- for val in ["quantitative", "id", "ordinal", "nominal", "temporal"]:
+ for val in ["quantitative", "id", "nominal", "temporal"]:
group_map[val] = list(filter(lambda x: rmap[x] == val, rmap))
return group_map

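
For context, mapping inverts the per-attribute type lookup into per-type attribute lists; the change simply drops "ordinal" from the recognized types. A small illustration with made-up column names:

# rmap maps each attribute to its inferred data type (toy values)
rmap = {"Horsepower": "quantitative", "Origin": "nominal", "Year": "temporal"}
group_map = {}
for val in ["quantitative", "id", "nominal", "temporal"]:
    # list(...) evaluates eagerly, so each iteration binds val correctly
    group_map[val] = list(filter(lambda x: rmap[x] == val, rmap))
print(group_map)
# {'quantitative': ['Horsepower'], 'id': [], 'nominal': ['Origin'], 'temporal': ['Year']}
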
4 changes: 3 additions & 1 deletion lux/executor/PandasExecutor.py
@@ -316,6 +316,8 @@ def compute_data_type(self, ldf:LuxDataFrame):
ldf.data_type_lookup[attr] = "nominal"
elif is_datetime_series(ldf.dtypes[attr]): #check if attribute is any type of datetime dtype
ldf.data_type_lookup[attr] = "temporal"
+ else:
+ ldf.data_type_lookup[attr] = "nominal"
# for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
# if self.cardinality[attr]>50:
if (ldf.index.dtype !='int64' and ldf.index.name):
@@ -346,7 +348,7 @@ def compute_data_type(self, ldf:LuxDataFrame):
def compute_data_model(self, ldf:LuxDataFrame):
ldf.data_model = {
"measure": ldf.data_type["quantitative"],
"dimension": ldf.data_type["ordinal"] + ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"]
"dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"]
}
ldf.data_model_lookup = self.reverseMapping(ldf.data_model)

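
The new else branch guarantees that every column receives a data type even when none of the explicit checks match. One dtype that lands there is pandas' interval dtype, per the "add check if dtype is Interval" item in the commit message; a hedged sketch of that case:

import pandas as pd

# An interval column is neither numeric, plain object, nor datetime, so the
# earlier branches all miss it; the new fallback classifies it as "nominal".
s = pd.Series(pd.interval_range(start=0, periods=3))
print(s.dtype)                                        # interval[int64] (exact repr varies by pandas version)
print(pd.api.types.is_numeric_dtype(s.dtype))         # False
print(pd.api.types.is_datetime64_any_dtype(s.dtype))  # False -> falls through to "nominal"
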
2 changes: 1 addition & 1 deletion lux/vis/Clause.py
@@ -41,7 +41,7 @@ def __init__(self, description:typing.Union[str,list] ="",attribute: typing.Unio
Possible values: 'x','y','color', by default ""
data_type : str, optional
Data type for the specified attribute.
- Possible values: 'nominal', 'quantitative', 'ordinal','temporal', by default ""
+ Possible values: 'nominal', 'quantitative','temporal', by default ""
data_model : str, optional
Data model for the specified attribute
Possible values: 'dimension', 'measure', by default ""
3 changes: 2 additions & 1 deletion lux/vislib/altair/AltairRenderer.py
@@ -50,7 +50,8 @@ def create_vis(self,vis, standalone=True):
if pd.api.types.is_period_dtype(vis.data.dtypes[attr]) or isinstance(vis.data[attr].iloc[0], pd.Period):
dateColumn = vis.data[attr]
vis.data[attr] = pd.PeriodIndex(dateColumn.values).to_timestamp()

+ if pd.api.types.is_interval_dtype(vis.data.dtypes[attr]) or isinstance(vis.data[attr].iloc[0], pd.Interval):
+ vis.data[attr] = vis.data[attr].astype(str)
if (vis.mark =="histogram"):
chart = Histogram(vis)
elif (vis.mark =="bar"):
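
Both halves of the new condition matter: pd.cut and pd.qcut produce a categorical column whose values are pd.Interval objects, so the dtype test alone can miss them, while the value-level isinstance check still fires. Interval objects are not JSON-serializable for Altair, hence the cast to str. A minimal sketch on toy data:

import pandas as pd

s = pd.Series(pd.qcut(list(range(10)), q=2))    # categorical column of Intervals
print(pd.api.types.is_interval_dtype(s.dtype))  # False: the dtype is category
print(isinstance(s.iloc[0], pd.Interval))       # True: the isinstance check catches it
print(s.astype(str).iloc[0])                    # '(-0.001, 4.5]' -- now serializable
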
2 changes: 1 addition & 1 deletion tests/test_action.py
@@ -88,7 +88,7 @@ def test_crosstab():
'Result':['Pass','Pass','Fail','Pass','Fail','Pass','Pass','Fail','Fail','Pass','Pass','Fail']}

df = pd.DataFrame(d,columns=['Name','Exam','Subject','Result'])
- result = pd.crosstab([df["Exam"]],df["Result"])
+ result = pd.crosstab([df.Exam],df.Result)
result._repr_html_()
assert list(result.recommendation.keys() ) == ['Row Groups','Column Groups']

84 changes: 75 additions & 9 deletions tests/test_pandas_coverage.py
@@ -120,6 +120,26 @@ def test_groupby_agg():
assert list(new_df.recommendation.keys() ) == ['Column Groups']
assert len(new_df.cardinality) == 7

+ def test_qcut():
+ df = pd.read_csv("lux/data/car.csv")
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ df["Weight"] = pd.qcut(df["Weight"], q = 3)
+ df._repr_html_()
+
+ def test_cut():
+ df = pd.read_csv("lux/data/car.csv")
+ df["Weight"] = pd.cut(df["Weight"], bins = [0, 2500, 7500, 10000], labels = ["small", "medium", "large"])
+ df._repr_html_()
+ # def test_groupby_agg_very_small():
+
+ # url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ # df = pd.read_csv(url)
+ # df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ # new_df = df.groupby("Origin").agg(sum).reset_index()
+ # new_df._repr_html_()
+ # assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ # assert len(new_df.cardinality) == 7
+
# def test_groupby_multi_index():
# url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
# df = pd.read_csv(url)
@@ -214,15 +234,15 @@ def test_drop():
assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
assert len(new_df2.cardinality) == 6

- # def test_merge():
- # url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
- # df = pd.read_csv(url)
- # df["Year"] = pd.to_datetime(df["Year"], format='%Y')
- # new_df = df.drop([0, 1, 2], axis = "rows")
- # new_df2 = pd.merge(df, new_df, how = "left", indicator = True)
- # new_df2._repr_html_()
- # assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal'] # TODO once bug is fixed
- # assert len(new_df2.cardinality) == 7 # TODO once bug is fixed
+ def test_merge():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.drop([0, 1, 2], axis = "rows")
+ new_df2 = pd.merge(df, new_df, how = "left", indicator = True)
+ new_df2._repr_html_()
+ assert list(new_df2.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal'] # TODO once bug is fixed
+ assert len(new_df2.cardinality) == 11 # TODO once bug is fixed

def test_prefix():
df = pd.read_csv("lux/data/car.csv")
@@ -233,6 +253,52 @@ def test_prefix():
assert len(new_df.cardinality) == 9
assert new_df.cardinality["1_Name"] == 300

+ def test_loc():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.loc[:,"Displacement":"Origin"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
+ assert len(new_df.cardinality) == 6
+ new_df = df.loc[0:10,"Displacement":"Origin"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 6
+ new_df = df.loc[0:10,"Displacement":"Horsepower"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 2
+ import numpy as np
+ inter_df = df.groupby("Brand")[["Acceleration", "Weight", "Horsepower"]].agg(np.mean)
+ new_df = inter_df.loc["chevrolet":"fiat", "Acceleration":"Weight"]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ assert len(new_df.cardinality) == 3
+
+ def test_iloc():
+ url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+ df = pd.read_csv(url)
+ df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+ new_df = df.iloc[:,3:9]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
+ assert len(new_df.cardinality) == 6
+ new_df = df.iloc[0:11,3:9]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 6
+ new_df = df.iloc[0:11,3:5]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Correlation', 'Distribution']
+ assert len(new_df.cardinality) == 2
+ import numpy as np
+ inter_df = df.groupby("Brand")[["Acceleration", "Weight", "Horsepower"]].agg(np.mean)
+ new_df = inter_df.iloc[5:10, 0:2]
+ new_df._repr_html_()
+ assert list(new_df.recommendation.keys() ) == ['Column Groups']
+ assert len(new_df.cardinality) == 3
+
def check_metadata_equal(df1, df2):
# Checks to make sure metadata for df1 and df2 are equal.
for attr in df1._metadata:
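
The new tests all follow the same pattern: slice or transform the frame, call _repr_html_() (the hook Jupyter invokes to display a frame, which makes Lux recompute metadata and recommendations), then assert on the refreshed state. A hedged sketch of that flow, mirroring test_iloc above:

import pandas as pd
import lux  # noqa: F401 -- importing lux attaches its hooks to pd.DataFrame

df = pd.read_csv("lux/data/car.csv")
new_df = df.iloc[:, 3:9]                   # slicing should expire stale metadata
new_df._repr_html_()                       # forces recommendation recomputation
print(list(new_df.recommendation.keys()))  # ['Correlation', 'Distribution', 'Occurrence', 'Temporal']
print(len(new_df.cardinality))             # 6
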
