1
- from collections import deque
2
- from typing import Any , Deque , Dict , List , Set , Type , TypeVar , Union
1
+ from typing import Any , Dict , List , Set , Type , TypeVar , Union
3
2
4
3
from bigtree .node import basenode , binarynode , node
5
4
from bigtree .tree import construct , export , search
@@ -454,33 +453,33 @@ def get_tree_diff(
454
453
)
455
454
456
455
# Check tree structure difference
457
- data_both = data [[path_col , name_col , parent_col ] + attr_list ].merge (
456
+ data_compare = data [[path_col , name_col , parent_col ] + attr_list ].merge (
458
457
data_other [[path_col , name_col , parent_col ] + attr_list ],
459
458
how = "outer" ,
460
459
on = [path_col , name_col , parent_col ],
461
460
indicator = indicator_col ,
462
461
suffixes = (old_suffix , new_suffix ),
463
462
)
464
463
if aggregate :
465
- data_both_agg = data_both [
466
- (data_both [indicator_col ] == "left_only" )
467
- | (data_both [indicator_col ] == "right_only" )
464
+ data_path_diff = data_compare [
465
+ (data_compare [indicator_col ] == "left_only" )
466
+ | (data_compare [indicator_col ] == "right_only" )
468
467
].drop_duplicates (subset = [name_col , parent_col ], keep = False )
469
468
if only_diff :
470
469
# If only_diff and aggregate, remove children under (moved from)
471
- data_both = data_both .sort_values (indicator_col , ascending = False )
472
- data_both = data_both [
473
- ~ data_both .duplicated (subset = [name_col , parent_col ])
470
+ data_compare = data_compare .sort_values (indicator_col , ascending = False )
471
+ data_compare = data_compare [
472
+ ~ data_compare .duplicated (subset = [name_col , parent_col ])
474
473
] # keep right_only
475
474
else :
476
- data_both_agg = data_both
475
+ data_path_diff = data_compare
477
476
478
477
# Handle tree structure difference
479
478
paths_removed = list (
480
- data_both_agg [ data_both_agg [indicator_col ] == "left_only" ][path_col ]
479
+ data_path_diff [ data_path_diff [indicator_col ] == "left_only" ][path_col ]
481
480
)[::- 1 ]
482
481
paths_added = list (
483
- data_both_agg [ data_both_agg [indicator_col ] == "right_only" ][path_col ]
482
+ data_path_diff [ data_path_diff [indicator_col ] == "right_only" ][path_col ]
484
483
)[::- 1 ]
485
484
486
485
moved_from_ind : List [bool ] = [True for _ in range (len (paths_removed ))]
@@ -491,102 +490,65 @@ def get_tree_diff(
491
490
moved_from_ind = [name in names_added for name in names_removed ]
492
491
moved_to_ind = [name in names_removed for name in names_added ]
493
492
494
- def add_suffix_to_path (
495
- _data : pd .DataFrame , _condition : pd .Series , _original_name : str , _suffix : str
496
- ) -> None :
497
- """Add suffix to path string, in-place
498
-
499
- Args:
500
- _data (pd.DataFrame): original data with path column
501
- _condition (pd.Series): whether to add suffix, contains True/False values
502
- _original_name (str): path prefix to add suffix to
503
- _suffix (str): suffix to add to path column
504
-
505
- Returns:
506
- (pd.DataFrame)
507
- """
508
- _data .iloc [_condition .values , _data .columns .get_loc (path_col )] = _data .iloc [
509
- _condition .values , _data .columns .get_loc (path_col )
510
- ].str .replace (_original_name , f"{ _original_name } ({ _suffix } )" , regex = True )
511
-
512
- def add_suffix_to_data (
513
- _data : pd .DataFrame ,
514
- paths_diff : List [str ],
515
- move_ind : List [bool ],
516
- suffix_general : str ,
517
- suffix_move : str ,
518
- suffix_not_moved : str ,
519
- ) -> None :
520
- """Add suffix to data, in-place
521
-
522
- Args:
523
- _data (pd.DataFrame): original data with path column
524
- paths_diff (List[str]): list of paths that were modified (e.g., added/removed)
525
- move_ind (List[bool]): move indicator to indicate path was moved instead of added/removed
526
- suffix_general (str): path suffix for general case
527
- suffix_move (str): path suffix if path was moved
528
- suffix_not_moved (str): path suffix if path is not moved (e.g., added/removed)
529
- """
530
- for _path_diff , _move_ind in zip (paths_diff , move_ind ):
531
- if not detail :
532
- suffix = suffix_general
533
- else :
534
- suffix = suffix_move if _move_ind else suffix_not_moved
535
- condition_node_modified = data_both [path_col ].str .endswith (
536
- _path_diff
537
- ) | data_both [path_col ].str .contains (_path_diff + tree_sep )
538
- add_suffix_to_path (data_both , condition_node_modified , _path_diff , suffix )
539
-
540
- add_suffix_to_data (
541
- data_both , paths_removed , moved_from_ind , "-" , "moved from" , "removed"
542
- )
543
- add_suffix_to_data (data_both , paths_added , moved_to_ind , "+" , "moved to" , "added" )
493
+ path_removed_to_suffix = {
494
+ path : "-" if not detail else ("moved from" if move_ind else "removed" )
495
+ for path , move_ind in zip (paths_removed , moved_from_ind )
496
+ }
497
+ path_added_to_suffix = {
498
+ path : "+" if not detail else ("moved to" if move_ind else "added" )
499
+ for path , move_ind in zip (paths_added , moved_to_ind )
500
+ }
544
501
545
502
# Check tree attribute difference
546
- path_to_attrs_diff : List [Dict [str , Dict [str , Any ]]] = []
547
- paths_with_attr_diff : Deque [str ] = deque ([])
548
- for attr_change in attr_list :
549
- condition_diff = (
550
- (
551
- ~ data_both [f"{ attr_change } { old_suffix } " ].isnull ()
552
- | ~ data_both [f"{ attr_change } { new_suffix } " ].isnull ()
503
+ dict_attr_diff : Dict [str , Dict [str , Any ]] = {}
504
+ if attr_list :
505
+ data_both = data_compare [data_compare [indicator_col ] == "both" ]
506
+ condition_attr_diff = (
507
+ "("
508
+ + ") | (" .join (
509
+ [
510
+ f"""(data_both["{ attr } { old_suffix } "] != data_both["{ attr } { new_suffix } "]) """
511
+ f"""& ~(data_both["{ attr } { old_suffix } "].isnull() & data_both["{ attr } { new_suffix } "].isnull())"""
512
+ for attr in attr_list
513
+ ]
553
514
)
554
- & (
555
- data_both [f"{ attr_change } { old_suffix } " ]
556
- != data_both [f"{ attr_change } { new_suffix } " ]
557
- )
558
- & (data_both [indicator_col ] == "both" )
515
+ + ")"
559
516
)
560
- data_diff = data_both [condition_diff ]
561
- if len (data_diff ):
562
- tuple_diff = zip (
563
- data_diff [f"{ attr_change } { old_suffix } " ],
564
- data_diff [f"{ attr_change } { new_suffix } " ],
565
- )
566
- dict_attr_diff = [{attr_change : v } for v in tuple_diff ]
567
- dict_path_diff = dict (list (zip (data_diff [path_col ], dict_attr_diff )))
568
- path_to_attrs_diff .append (dict_path_diff )
569
- paths_with_attr_diff .extend (list (data_diff [path_col ]))
517
+ data_attr_diff = data_both [eval (condition_attr_diff )]
518
+ dict_attr_all = data_attr_diff .set_index (path_col ).to_dict (orient = "index" )
519
+ for path , node_attr in dict_attr_all .items ():
520
+ dict_attr_diff [path ] = {
521
+ attr : (
522
+ node_attr [f"{ attr } { old_suffix } " ],
523
+ node_attr [f"{ attr } { new_suffix } " ],
524
+ )
525
+ for attr in attr_list
526
+ if node_attr [f"{ attr } { old_suffix } " ] != node_attr [f"{ attr } { new_suffix } " ]
527
+ and node_attr [f"{ attr } { old_suffix } " ]
528
+ and node_attr [f"{ attr } { new_suffix } " ]
529
+ }
570
530
571
531
if only_diff :
572
- data_both = data_both [
573
- (data_both [indicator_col ] != "both" )
574
- | (data_both [path_col ].isin (paths_with_attr_diff ))
532
+ data_compare = data_compare [
533
+ (data_compare [indicator_col ] != "both" )
534
+ | (data_compare [path_col ].isin (dict_attr_diff . keys () ))
575
535
]
576
- data_both = data_both [[path_col ]].sort_values (path_col )
577
- if len (data_both ):
536
+ data_compare = data_compare [[path_col ]].sort_values (path_col )
537
+ if len (data_compare ):
578
538
tree_diff = construct .dataframe_to_tree (
579
- data_both , node_type = tree .__class__ , sep = tree .sep
539
+ data_compare , node_type = tree .__class__ , sep = tree .sep
580
540
)
541
+ for path in sorted (path_removed_to_suffix , reverse = True ):
542
+ _node = search .find_full_path (tree_diff , path )
543
+ _node .name += f""" ({ path_removed_to_suffix [path ]} )"""
544
+ for path in sorted (path_added_to_suffix , reverse = True ):
545
+ _node = search .find_full_path (tree_diff , path )
546
+ _node .name += f""" ({ path_added_to_suffix [path ]} )"""
547
+
581
548
# Handle tree attribute difference
582
- if len (paths_with_attr_diff ):
583
- path_changes_list = sorted (paths_with_attr_diff , reverse = True )
584
- name_changes_list = [
585
- {k : {"name" : f"{ k .split (tree .sep )[- 1 ]} (~)" } for k in path_changes_list }
586
- ]
587
- path_to_attrs_diff .extend (name_changes_list )
588
- for attr_change_dict in path_to_attrs_diff :
589
- tree_diff = construct .add_dict_to_tree_by_path (
590
- tree_diff , attr_change_dict
591
- )
549
+ if dict_attr_diff :
550
+ tree_diff = construct .add_dict_to_tree_by_path (tree_diff , dict_attr_diff )
551
+ for path in sorted (dict_attr_diff , reverse = True ):
552
+ _node = search .find_full_path (tree_diff , path )
553
+ _node .name += " (~)"
592
554
return tree_diff
0 commit comments