@@ -424,105 +424,41 @@ def align_diagnoses(
424
424
return diagnosis_stack
425
425
426
426
427
- def _create_raising_func (method : str ):
428
- """Raise ValueError for wrong ``method``."""
427
+ def _stack_to_float_matrix (diagnosis_stack : list [pd .DataFrame ]) -> np .ndarray :
428
+ """Convert diagnosis stack to 3D array of floats with ``Nones`` as ``np.nan``."""
429
+ diagnosis_matrix = np .array (diagnosis_stack )
430
+ diagnosis_matrix [pd .isna (diagnosis_matrix )] = np .nan
431
+ return np .astype (diagnosis_matrix , float )
429
432
430
- def raise_value_err (* args , ** kwargs ):
431
- raise ValueError (f"Unknown method { method } " )
432
-
433
- return raise_value_err
434
-
435
-
436
- def _false_estimate (
437
- obs : np .ndarray ,
438
- false_pos_probs : np .ndarray ,
439
- true_neg_probs : np .ndarray ,
440
- method : Literal ["prod" , "max" ],
441
- ) -> float :
442
- """Compute estimate of ``False``, given ``obs``.
443
433
444
- >>> _false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever")
445
- Traceback (most recent call last):
446
- ...
447
- ValueError: Unknown method whatever
448
- """
449
- false_llhs = np .where (obs , false_pos_probs , true_neg_probs )
450
- nans_masked = np .where (
451
- pd .isna (obs ),
452
- 1.0 if method == "prod" else 0.0 ,
453
- false_llhs ,
454
- )
455
- method = getattr (np , method , _create_raising_func (method ))
456
- return method (nans_masked )
457
-
458
-
459
- def _true_estimate (
460
- obs : np .ndarray ,
461
- true_pos_probs : np .ndarray ,
462
- false_neg_probs : np .ndarray ,
463
- method : Literal ["prod" , "max" ],
464
- ) -> float :
465
- """Compute estimate of ``True``, given ``obs``.
466
-
467
- >>> obs = [True, False, np.nan]
468
- >>> true_pos_probs = [0.8, 0.6, 0.9]
469
- >>> false_neg_probs = [0.6, 0.7, 0.9]
470
- >>> _true_estimate(obs, true_pos_probs, false_neg_probs, method="max")
471
- np.float64(0.8)
472
- >>> tmp = _true_estimate(obs, true_pos_probs, false_neg_probs, method="prod")
473
- >>> np.isclose(tmp, 0.56)
474
- np.True_
475
- """
476
- true_llhs = np .where (obs , true_pos_probs , false_neg_probs )
477
- nans_masked = np .where (
478
- pd .isna (obs ),
479
- 1.0 if method == "prod" else 0.0 ,
480
- true_llhs ,
481
- )
482
- method = getattr (np , method , _create_raising_func (method ))
483
- return method (nans_masked )
484
-
485
-
486
- def _max_likelihood (
487
- obs : np .ndarray ,
488
- specificities : np .ndarray ,
434
+ def _evaluate_likelihood_ratios (
435
+ diagnosis_matrix : np .ndarray ,
489
436
sensitivities : np .ndarray ,
490
- ) -> bool :
491
- """Compute most likely true state based on all ``obs``.
492
-
493
- >>> obs = np.array([True, False, np.nan, None])
494
- >>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
495
- >>> specificities = np. array([0.9, 0.7, 0.7, 0.7])
496
- >>> _max_likelihood(obs, sensitivities, specificities)
497
- np.True_
498
- >>> obs = np.array([True, False, False, False])
499
- >>> _max_likelihood(obs, sensitivities, specificities)
500
- np.False_
437
+ specificities : np . ndarray ,
438
+ method : Literal [ "max_llh" , "rank" ],
439
+ ) -> np . ndarray :
440
+ """Compare the likelihoods of true/false diagnoses using the given ``method``.
441
+
442
+ The ``diagnosis_matrix`` is a 3D array of shape ``(n_modalities, n_patients,
443
+ n_levels)``. The `` sensitivities`` and `` specificities`` are 1D arrays of shape
444
+ ``(n_modalities,)``. When choosing the ``method="max_llh"``, the likelihood of each
445
+ diagnosis is combined into one likelihood for each patient and level. With
446
+ ``method="rank"``, the most trustworthy diagnosis is chosen for each patient and
447
+ level.
501
448
"""
502
- healthy_llh = _false_estimate (obs , 1 - specificities , specificities , method = "prod" )
503
- involved_llhs = _true_estimate (obs , sensitivities , 1 - sensitivities , method = "prod" )
504
- return healthy_llh < involved_llhs
449
+ true_pos = sensitivities [:, None , None ] * diagnosis_matrix
450
+ false_neg = (1 - sensitivities [:, None , None ]) * (1 - diagnosis_matrix )
451
+ true_neg = specificities [:, None , None ] * (1 - diagnosis_matrix )
452
+ false_pos = (1 - specificities [:, None , None ]) * diagnosis_matrix
505
453
454
+ if method not in {"max_llh" , "rank" }:
455
+ raise ValueError (f"Unknown method { method } " )
506
456
507
- def _rank_trustworthy (
508
- obs : np .ndarray ,
509
- specificities : np .ndarray ,
510
- sensitivities : np .ndarray ,
511
- ) -> bool :
512
- """Estimate true state based on most trustworthy value in ``obs``.
513
-
514
- >>> obs = np.array([True, False, np.nan, None])
515
- >>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
516
- >>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
517
- >>> _rank_trustworthy(obs, sensitivities, specificities)
518
- np.True_
519
- >>> obs = np.array([True, False, False, False])
520
- >>> _rank_trustworthy(obs, sensitivities, specificities)
521
- np.True_
522
- """
523
- healthy_llh = _false_estimate (obs , 1 - specificities , specificities , method = "max" )
524
- involved_llhs = _true_estimate (obs , sensitivities , 1 - sensitivities , method = "max" )
525
- return healthy_llh < involved_llhs
457
+ agg_func = np .nanprod if method == "max_llh" else np .nanmax
458
+ true_llh = agg_func (true_pos + false_neg , axis = 0 )
459
+ false_llh = agg_func (true_neg + false_pos , axis = 0 )
460
+
461
+ return true_llh >= false_llh
526
462
527
463
528
464
def _expand_mapping (
@@ -726,53 +662,64 @@ def stats(
726
662
727
663
return stats
728
664
665
+ def _filter_and_sort_modalities (
666
+ self ,
667
+ modalities : dict [str , ModalityConfig ] | None = None ,
668
+ ) -> dict [str , ModalityConfig ]:
669
+ """Return only those ``modalities`` present in data and sorted as in data."""
670
+ modalities = modalities or get_default_modalities ()
671
+ return {
672
+ modality_name : modality_config
673
+ for modality_name , modality_config in modalities .items ()
674
+ if modality_name in self .get_modalities ()
675
+ }
676
+
729
677
def combine(
    self,
    modalities: dict[str, ModalityConfig] | None = None,
    method: Literal["max_llh", "rank"] = "max_llh",
) -> pd.DataFrame:
    """Combine diagnoses of ``modalities`` using ``method``.

    The order of the provided ``modalities`` does not matter, as it is aligned
    with the order in the DataFrame. With ``method="max_llh"``, the most likely
    true state of involvement is inferred based on all available diagnoses for
    each patient and level. With ``method="rank"``, only the most trustworthy
    diagnosis is chosen for each patient and level based on the sensitivity and
    specificity of the given list of ``modalities``.

    The result contains only the combined columns. The intended use is to
    :py:meth:`~pandas.DataFrame.update` the original DataFrame with the result.
    Entries for which no modality reports a diagnosis are ``None``.

    >>> df = pd.DataFrame({
    ...     ('CT'       , 'ipsi', 'I'): [False, True , False, True , None],
    ...     ('MRI'      , 'ipsi', 'I'): [False, True , True , None , None],
    ...     ('pathology', 'ipsi', 'I'): [True , None , None , False, None],
    ... })
    >>> df.ly.combine()     # doctest: +NORMALIZE_WHITESPACE
         ipsi
            I
    0    True
    1    True
    2   False
    3   False
    4    None
    """
    modalities = self._filter_and_sort_modalities(modalities)

    diagnosis_stack = align_diagnoses(self._obj, list(modalities.keys()))
    diagnosis_matrix = _stack_to_float_matrix(diagnosis_stack)
    # Remember where no modality reports anything; the comparison below
    # cannot yield a meaningful value for those cells.
    all_nan_mask = np.all(np.isnan(diagnosis_matrix), axis=0)

    configs = list(modalities.values())
    result = _evaluate_likelihood_ratios(
        diagnosis_matrix=diagnosis_matrix,
        sensitivities=np.array([mod.sens for mod in configs]),
        specificities=np.array([mod.spec for mod in configs]),
        method=method,
    )
    # ``ndarray.astype`` works on every NumPy version, unlike ``np.astype``.
    # The ``object`` dtype is needed to store ``None`` next to the booleans.
    result = result.astype(object)
    result[all_nan_mask] = None

    reference = diagnosis_stack[0]
    # Reuse the index of the aligned diagnoses so that ``DataFrame.update``,
    # which aligns on the index, matches the rows of the original data.
    return pd.DataFrame(result, index=reference.index, columns=reference.columns)
776
723
777
724
def infer_sublevels (
778
725
self ,
0 commit comments