Skip to content

Commit 813ebca

Browse files
authored
Merge pull request #328 from kayjan/feature/tree-diff-attr
Make tree diff comparison faster
2 parents 624500f + b667b02 commit 813ebca

File tree

2 files changed

+65
-100
lines changed

2 files changed

+65
-100
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77
## [Unreleased]
8+
### Changed:
9+
- Tree Helper: Get tree diff logic to be faster to compare all attribute list and data at once (for attr diff).
10+
- Tree Helper: Get tree diff logic to be faster to add suffix at the end (for path diff).
811

912
## [0.22.2] - 2024-11-11
1013
### Added:

bigtree/tree/helper.py

+62-100
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from collections import deque
2-
from typing import Any, Deque, Dict, List, Set, Type, TypeVar, Union
1+
from typing import Any, Dict, List, Set, Type, TypeVar, Union
32

43
from bigtree.node import basenode, binarynode, node
54
from bigtree.tree import construct, export, search
@@ -454,33 +453,33 @@ def get_tree_diff(
454453
)
455454

456455
# Check tree structure difference
457-
data_both = data[[path_col, name_col, parent_col] + attr_list].merge(
456+
data_compare = data[[path_col, name_col, parent_col] + attr_list].merge(
458457
data_other[[path_col, name_col, parent_col] + attr_list],
459458
how="outer",
460459
on=[path_col, name_col, parent_col],
461460
indicator=indicator_col,
462461
suffixes=(old_suffix, new_suffix),
463462
)
464463
if aggregate:
465-
data_both_agg = data_both[
466-
(data_both[indicator_col] == "left_only")
467-
| (data_both[indicator_col] == "right_only")
464+
data_path_diff = data_compare[
465+
(data_compare[indicator_col] == "left_only")
466+
| (data_compare[indicator_col] == "right_only")
468467
].drop_duplicates(subset=[name_col, parent_col], keep=False)
469468
if only_diff:
470469
# If only_diff and aggregate, remove children under (moved from)
471-
data_both = data_both.sort_values(indicator_col, ascending=False)
472-
data_both = data_both[
473-
~data_both.duplicated(subset=[name_col, parent_col])
470+
data_compare = data_compare.sort_values(indicator_col, ascending=False)
471+
data_compare = data_compare[
472+
~data_compare.duplicated(subset=[name_col, parent_col])
474473
] # keep right_only
475474
else:
476-
data_both_agg = data_both
475+
data_path_diff = data_compare
477476

478477
# Handle tree structure difference
479478
paths_removed = list(
480-
data_both_agg[data_both_agg[indicator_col] == "left_only"][path_col]
479+
data_path_diff[data_path_diff[indicator_col] == "left_only"][path_col]
481480
)[::-1]
482481
paths_added = list(
483-
data_both_agg[data_both_agg[indicator_col] == "right_only"][path_col]
482+
data_path_diff[data_path_diff[indicator_col] == "right_only"][path_col]
484483
)[::-1]
485484

486485
moved_from_ind: List[bool] = [True for _ in range(len(paths_removed))]
@@ -491,102 +490,65 @@ def get_tree_diff(
491490
moved_from_ind = [name in names_added for name in names_removed]
492491
moved_to_ind = [name in names_removed for name in names_added]
493492

494-
def add_suffix_to_path(
495-
_data: pd.DataFrame, _condition: pd.Series, _original_name: str, _suffix: str
496-
) -> None:
497-
"""Add suffix to path string, in-place
498-
499-
Args:
500-
_data (pd.DataFrame): original data with path column
501-
_condition (pd.Series): whether to add suffix, contains True/False values
502-
_original_name (str): path prefix to add suffix to
503-
_suffix (str): suffix to add to path column
504-
505-
Returns:
506-
(pd.DataFrame)
507-
"""
508-
_data.iloc[_condition.values, _data.columns.get_loc(path_col)] = _data.iloc[
509-
_condition.values, _data.columns.get_loc(path_col)
510-
].str.replace(_original_name, f"{_original_name} ({_suffix})", regex=True)
511-
512-
def add_suffix_to_data(
513-
_data: pd.DataFrame,
514-
paths_diff: List[str],
515-
move_ind: List[bool],
516-
suffix_general: str,
517-
suffix_move: str,
518-
suffix_not_moved: str,
519-
) -> None:
520-
"""Add suffix to data, in-place
521-
522-
Args:
523-
_data (pd.DataFrame): original data with path column
524-
paths_diff (List[str]): list of paths that were modified (e.g., added/removed)
525-
move_ind (List[bool]): move indicator to indicate path was moved instead of added/removed
526-
suffix_general (str): path suffix for general case
527-
suffix_move (str): path suffix if path was moved
528-
suffix_not_moved (str): path suffix if path is not moved (e.g., added/removed)
529-
"""
530-
for _path_diff, _move_ind in zip(paths_diff, move_ind):
531-
if not detail:
532-
suffix = suffix_general
533-
else:
534-
suffix = suffix_move if _move_ind else suffix_not_moved
535-
condition_node_modified = data_both[path_col].str.endswith(
536-
_path_diff
537-
) | data_both[path_col].str.contains(_path_diff + tree_sep)
538-
add_suffix_to_path(data_both, condition_node_modified, _path_diff, suffix)
539-
540-
add_suffix_to_data(
541-
data_both, paths_removed, moved_from_ind, "-", "moved from", "removed"
542-
)
543-
add_suffix_to_data(data_both, paths_added, moved_to_ind, "+", "moved to", "added")
493+
path_removed_to_suffix = {
494+
path: "-" if not detail else ("moved from" if move_ind else "removed")
495+
for path, move_ind in zip(paths_removed, moved_from_ind)
496+
}
497+
path_added_to_suffix = {
498+
path: "+" if not detail else ("moved to" if move_ind else "added")
499+
for path, move_ind in zip(paths_added, moved_to_ind)
500+
}
544501

545502
# Check tree attribute difference
546-
path_to_attrs_diff: List[Dict[str, Dict[str, Any]]] = []
547-
paths_with_attr_diff: Deque[str] = deque([])
548-
for attr_change in attr_list:
549-
condition_diff = (
550-
(
551-
~data_both[f"{attr_change}{old_suffix}"].isnull()
552-
| ~data_both[f"{attr_change}{new_suffix}"].isnull()
503+
dict_attr_diff: Dict[str, Dict[str, Any]] = {}
504+
if attr_list:
505+
data_both = data_compare[data_compare[indicator_col] == "both"]
506+
condition_attr_diff = (
507+
"("
508+
+ ") | (".join(
509+
[
510+
f"""(data_both["{attr}{old_suffix}"] != data_both["{attr}{new_suffix}"]) """
511+
f"""& ~(data_both["{attr}{old_suffix}"].isnull() & data_both["{attr}{new_suffix}"].isnull())"""
512+
for attr in attr_list
513+
]
553514
)
554-
& (
555-
data_both[f"{attr_change}{old_suffix}"]
556-
!= data_both[f"{attr_change}{new_suffix}"]
557-
)
558-
& (data_both[indicator_col] == "both")
515+
+ ")"
559516
)
560-
data_diff = data_both[condition_diff]
561-
if len(data_diff):
562-
tuple_diff = zip(
563-
data_diff[f"{attr_change}{old_suffix}"],
564-
data_diff[f"{attr_change}{new_suffix}"],
565-
)
566-
dict_attr_diff = [{attr_change: v} for v in tuple_diff]
567-
dict_path_diff = dict(list(zip(data_diff[path_col], dict_attr_diff)))
568-
path_to_attrs_diff.append(dict_path_diff)
569-
paths_with_attr_diff.extend(list(data_diff[path_col]))
517+
data_attr_diff = data_both[eval(condition_attr_diff)]
518+
dict_attr_all = data_attr_diff.set_index(path_col).to_dict(orient="index")
519+
for path, node_attr in dict_attr_all.items():
520+
dict_attr_diff[path] = {
521+
attr: (
522+
node_attr[f"{attr}{old_suffix}"],
523+
node_attr[f"{attr}{new_suffix}"],
524+
)
525+
for attr in attr_list
526+
if node_attr[f"{attr}{old_suffix}"] != node_attr[f"{attr}{new_suffix}"]
527+
and node_attr[f"{attr}{old_suffix}"]
528+
and node_attr[f"{attr}{new_suffix}"]
529+
}
570530

571531
if only_diff:
572-
data_both = data_both[
573-
(data_both[indicator_col] != "both")
574-
| (data_both[path_col].isin(paths_with_attr_diff))
532+
data_compare = data_compare[
533+
(data_compare[indicator_col] != "both")
534+
| (data_compare[path_col].isin(dict_attr_diff.keys()))
575535
]
576-
data_both = data_both[[path_col]].sort_values(path_col)
577-
if len(data_both):
536+
data_compare = data_compare[[path_col]].sort_values(path_col)
537+
if len(data_compare):
578538
tree_diff = construct.dataframe_to_tree(
579-
data_both, node_type=tree.__class__, sep=tree.sep
539+
data_compare, node_type=tree.__class__, sep=tree.sep
580540
)
541+
for path in sorted(path_removed_to_suffix, reverse=True):
542+
_node = search.find_full_path(tree_diff, path)
543+
_node.name += f""" ({path_removed_to_suffix[path]})"""
544+
for path in sorted(path_added_to_suffix, reverse=True):
545+
_node = search.find_full_path(tree_diff, path)
546+
_node.name += f""" ({path_added_to_suffix[path]})"""
547+
581548
# Handle tree attribute difference
582-
if len(paths_with_attr_diff):
583-
path_changes_list = sorted(paths_with_attr_diff, reverse=True)
584-
name_changes_list = [
585-
{k: {"name": f"{k.split(tree.sep)[-1]} (~)"} for k in path_changes_list}
586-
]
587-
path_to_attrs_diff.extend(name_changes_list)
588-
for attr_change_dict in path_to_attrs_diff:
589-
tree_diff = construct.add_dict_to_tree_by_path(
590-
tree_diff, attr_change_dict
591-
)
549+
if dict_attr_diff:
550+
tree_diff = construct.add_dict_to_tree_by_path(tree_diff, dict_attr_diff)
551+
for path in sorted(dict_attr_diff, reverse=True):
552+
_node = search.find_full_path(tree_diff, path)
553+
_node.name += " (~)"
592554
return tree_diff

0 commit comments

Comments
 (0)