Skip to content

Commit e86c71d

Browse files
authored
Merge pull request #220 from kayjan/optimization-timings
Optimization timings
2 parents 5da91ca + 992ea0e commit e86c71d

14 files changed

+810
-384
lines changed

CHANGELOG.md

+31-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [0.17.0] - 2024-04-04
10+
### Added
11+
- Misc: Group tests for benchmark timings to compare the timings by multiplier more effectively.
12+
### Changed
13+
- Tree Constructor: `add_dict_to_tree_by_name` and `add_dataframe_to_tree_by_name` modify the tree in-place instead
14+
of returning a new tree, and no longer accept `join_type` as an argument, as pandas dataframe operations are phased out.
15+
If there are clashing attributes, only those that have values will be replaced.
16+
**This might not be backwards-compatible!**
17+
- Tree Constructor: `dataframe_to_tree` no longer relies on `add_dataframe_to_tree_by_path`, as doing so performed
18+
assertion checks twice. This leads to a 5% improvement in timings for a tree with 10000 nodes, averaged across 10 runs.
19+
- Misc: Abstract out assertion checks for empty dataframe and duplicate attribute.
20+
- Misc: Abstract out logic for checking null and filtering attributes.
21+
- Misc: Optimization in dictionary and dataframe operations.
22+
### Fixed
23+
- Tree Constructor: `dict_to_tree` no longer uses dataframe operations, leading to a 33% improvement in timings for
24+
a tree with 10000 nodes, averaged across 10 runs. The resulting data types of node attributes follow the dictionary exactly,
25+
unlike the previous dataframe operations, which may change the dtypes for certain columns.
26+
**This might not be backwards-compatible!**
27+
- Tree Constructor: `dataframe_to_tree_by_relation` fix root node detection logic, ignore existing name column,
28+
ignore non-attribute columns, ignore null attribute columns.
29+
- Tree Constructor: `add_dataframe_to_tree_by_path` ignore existing name column, ignore non-attribute columns,
30+
ignore null attribute columns.
31+
- Tree Constructor: `add_dataframe_to_tree_by_name` ignore existing name column, ignore non-attribute columns,
32+
ignore null attribute columns.
33+
- Tree Constructor: `dataframe_to_tree` ignore existing name column, ignore non-attribute columns,
34+
ignore null attribute columns.
35+
- DAG Constructor: `dataframe_to_dag` ignore existing name column, ignore non-attribute columns,
36+
ignore null attribute columns.
37+
938
## [0.16.4] - 2024-03-14
1039
### Fixed
1140
- [#216] Tree Exporter: Fix nan checker when printing trees.
@@ -511,7 +540,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
511540
- Utility Iterator: Tree traversal methods.
512541
- Workflow To Do App: Tree use case with to-do list implementation.
513542

514-
[Unreleased]: https://github.com/kayjan/bigtree/compare/0.16.4...HEAD
543+
[Unreleased]: https://github.com/kayjan/bigtree/compare/0.17.0...HEAD
544+
[0.17.0]: https://github.com/kayjan/bigtree/compare/0.16.4...0.17.0
515545
[0.16.4]: https://github.com/kayjan/bigtree/compare/0.16.3...0.16.4
516546
[0.16.3]: https://github.com/kayjan/bigtree/compare/0.16.2...0.16.3
517547
[0.16.2]: https://github.com/kayjan/bigtree/compare/0.16.1...0.16.2

assets/docs/tree_construct.png

-6.79 KB
Loading

bigtree/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.16.4"
1+
__version__ = "0.17.0"
22

33
from bigtree.binarytree.construct import list_to_binarytree
44
from bigtree.dag.construct import dataframe_to_dag, dict_to_dag, list_to_dag

bigtree/binarytree/construct.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
__all__ = ["list_to_binarytree"]
66

7+
from bigtree.utils.assertions import assert_length_not_empty
8+
79

810
def list_to_binarytree(
911
heapq_list: List[int], node_type: Type[BinaryNode] = BinaryNode
@@ -37,8 +39,7 @@ def list_to_binarytree(
3739
Returns:
3840
(BinaryNode)
3941
"""
40-
if not len(heapq_list):
41-
raise ValueError("Input list does not contain any data, check `heapq_list`")
42+
assert_length_not_empty(heapq_list, "Input list", "heapq_list")
4243

4344
root_node = node_type(heapq_list[0])
4445
node_list = [root_node]

bigtree/dag/construct.py

+23-35
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@
33
from typing import Any, Dict, List, Tuple, Type
44

55
from bigtree.node.dagnode import DAGNode
6+
from bigtree.utils.assertions import (
7+
assert_dataframe_no_duplicate_attribute,
8+
assert_dataframe_not_empty,
9+
assert_dictionary_not_empty,
10+
assert_length_not_empty,
11+
filter_attributes,
12+
isnull,
13+
)
614
from bigtree.utils.exceptions import optional_dependencies_pandas
715

816
try:
@@ -35,15 +43,15 @@ def list_to_dag(
3543
Returns:
3644
(DAGNode)
3745
"""
38-
if not len(relations):
39-
raise ValueError("Input list does not contain any data, check `relations`")
46+
assert_length_not_empty(relations, "Input list", "relations")
4047

4148
relation_data = pd.DataFrame(relations, columns=["parent", "child"])
4249
return dataframe_to_dag(
4350
relation_data, child_col="child", parent_col="parent", node_type=node_type
4451
)
4552

4653

54+
@optional_dependencies_pandas
4755
def dict_to_dag(
4856
relation_attrs: Dict[str, Any],
4957
parent_key: str = "parents",
@@ -75,8 +83,7 @@ def dict_to_dag(
7583
Returns:
7684
(DAGNode)
7785
"""
78-
if not len(relation_attrs):
79-
raise ValueError("Dictionary does not contain any data, check `relation_attrs`")
86+
assert_dictionary_not_empty(relation_attrs, "relation_attrs")
8087

8188
# Convert dictionary to dataframe
8289
data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()
@@ -110,6 +117,8 @@ def dataframe_to_dag(
110117
- If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
111118
columns are `attribute_cols`.
112119
120+
Only attributes in `attribute_cols` with non-null values will be added to the DAG.
121+
113122
Examples:
114123
>>> import pandas as pd
115124
>>> from bigtree import dataframe_to_dag, dag_iterator
@@ -141,12 +150,7 @@ def dataframe_to_dag(
141150
Returns:
142151
(DAGNode)
143152
"""
144-
data = data.copy()
145-
146-
if not len(data.columns):
147-
raise ValueError("Data does not contain any columns, check `data`")
148-
if not len(data):
149-
raise ValueError("Data does not contain any rows, check `data`")
153+
assert_dataframe_not_empty(data)
150154

151155
if not child_col:
152156
child_col = data.columns[0]
@@ -160,27 +164,12 @@ def dataframe_to_dag(
160164
attribute_cols = list(data.columns)
161165
attribute_cols.remove(child_col)
162166
attribute_cols.remove(parent_col)
163-
elif any([col not in data.columns for col in attribute_cols]):
164-
raise ValueError(
165-
f"One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}"
166-
)
167167

168-
data_check = data.copy()[[child_col, parent_col] + attribute_cols].drop_duplicates(
169-
subset=[child_col] + attribute_cols
170-
)
171-
_duplicate_check = (
172-
data_check[child_col]
173-
.value_counts()
174-
.to_frame("counts")
175-
.rename_axis(child_col)
176-
.reset_index()
168+
data = data[[child_col, parent_col] + attribute_cols].copy()
169+
170+
assert_dataframe_no_duplicate_attribute(
171+
data, "child name", child_col, attribute_cols
177172
)
178-
_duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
179-
if len(_duplicate_check):
180-
raise ValueError(
181-
f"There exists duplicate child name with different attributes\n"
182-
f"Check {_duplicate_check}"
183-
)
184173
if sum(data[child_col].isnull()):
185174
raise ValueError(f"Child name cannot be empty, check column: {child_col}")
186175

@@ -190,15 +179,14 @@ def dataframe_to_dag(
190179
for row in data.reset_index(drop=True).to_dict(orient="index").values():
191180
child_name = row[child_col]
192181
parent_name = row[parent_col]
193-
node_attrs = row.copy()
194-
del node_attrs[child_col]
195-
del node_attrs[parent_col]
196-
node_attrs = {k: v for k, v in node_attrs.items() if not pd.isnull(v)}
197-
child_node = node_dict.get(child_name, node_type(child_name))
182+
node_attrs = filter_attributes(
183+
row, omit_keys=["name", child_col, parent_col], omit_null_values=True
184+
)
185+
child_node = node_dict.get(child_name, node_type(child_name, **node_attrs))
198186
child_node.set_attrs(node_attrs)
199187
node_dict[child_name] = child_node
200188

201-
if not pd.isnull(parent_name):
189+
if not isnull(parent_name):
202190
parent_node = node_dict.get(parent_name, node_type(parent_name))
203191
node_dict[parent_name] = parent_node
204192
child_node.parents = [parent_node]

0 commit comments

Comments
 (0)