Skip to content

Commit a40bef7

Browse files
authored
Merge pull request #14 from rmnldwg/release-0.2.1
Release 0.2.1
2 parents 5c4fc68 + 34946bf commit a40bef7

File tree

2 files changed

+74
-115
lines changed

2 files changed

+74
-115
lines changed

CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## [0.2.1] - 2024-11-29
6+
7+
### 🐛 Bug Fixes
8+
9+
- If an LNL of a patient is unobserved (i.e., all of its diagnoses are `None`), the method `ly.combine()` now returns `None` for that patient's LNL. Fixes [#13].
10+
11+
### 🧪 Testing
12+
13+
- Change the doctest of `ly.combine()` to check whether [#13] was fixed.
14+
515
## [0.2.0] - 2024-11-14
616

717
### 🚀 Features
@@ -181,6 +191,7 @@ Initial implementation of the lyDATA library.
181191
<!-- generated by git-cliff -->
182192
<!-- markdownlint-disable-file MD024 -->
183193

194+
[0.2.1]: https://github.com/rmnldwg/lydata/compare/0.2.0..0.2.1
184195
[0.2.0]: https://github.com/rmnldwg/lydata/compare/0.1.2..0.2.0
185196
[0.1.2]: https://github.com/rmnldwg/lydata/compare/0.1.1..0.1.2
186197
[0.1.1]: https://github.com/rmnldwg/lydata/compare/0.1.0..0.1.1
@@ -192,3 +203,4 @@ Initial implementation of the lyDATA library.
192203

193204
[#2]: https://github.com/rmnldwg/lydata/issues/2
194205
[#4]: https://github.com/rmnldwg/lydata/issues/4
206+
[#13]: https://github.com/rmnldwg/lydata/issues/13

lydata/accessor.py

+62-115
Original file line numberDiff line numberDiff line change
@@ -424,105 +424,41 @@ def align_diagnoses(
424424
return diagnosis_stack
425425

426426

427-
def _create_raising_func(method: str):
428-
"""Raise ValueError for wrong ``method``."""
427+
def _stack_to_float_matrix(diagnosis_stack: list[pd.DataFrame]) -> np.ndarray:
    """Convert diagnosis stack to 3D array of floats with ``Nones`` as ``np.nan``.

    The ``diagnosis_stack`` is a list of equally shaped DataFrames. They are
    stacked along a new first axis, every missing entry (anything for which
    :py:func:`pandas.isna` is ``True``, e.g. ``None``) is replaced by ``np.nan``,
    and the result is cast to ``float``, so that ``True`` becomes ``1.0`` and
    ``False`` becomes ``0.0``.
    """
    diagnosis_matrix = np.array(diagnosis_stack)
    # ``float(None)`` raises a ``TypeError``, so missing markers must be turned
    # into ``np.nan`` before the cast below.
    diagnosis_matrix[pd.isna(diagnosis_matrix)] = np.nan
    # Use the ``ndarray.astype`` method: the function ``np.astype`` only exists
    # in NumPy 2.x, while the method is available in every NumPy version.
    return diagnosis_matrix.astype(float)
429432

430-
def raise_value_err(*args, **kwargs):
431-
raise ValueError(f"Unknown method {method}")
432-
433-
return raise_value_err
434-
435-
436-
def _false_estimate(
437-
obs: np.ndarray,
438-
false_pos_probs: np.ndarray,
439-
true_neg_probs: np.ndarray,
440-
method: Literal["prod", "max"],
441-
) -> float:
442-
"""Compute estimate of ``False``, given ``obs``.
443433

444-
>>> _false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever")
445-
Traceback (most recent call last):
446-
...
447-
ValueError: Unknown method whatever
448-
"""
449-
false_llhs = np.where(obs, false_pos_probs, true_neg_probs)
450-
nans_masked = np.where(
451-
pd.isna(obs),
452-
1.0 if method == "prod" else 0.0,
453-
false_llhs,
454-
)
455-
method = getattr(np, method, _create_raising_func(method))
456-
return method(nans_masked)
457-
458-
459-
def _true_estimate(
460-
obs: np.ndarray,
461-
true_pos_probs: np.ndarray,
462-
false_neg_probs: np.ndarray,
463-
method: Literal["prod", "max"],
464-
) -> float:
465-
"""Compute estimate of ``True``, given ``obs``.
466-
467-
>>> obs = [True, False, np.nan]
468-
>>> true_pos_probs = [0.8, 0.6, 0.9]
469-
>>> false_neg_probs = [0.6, 0.7, 0.9]
470-
>>> _true_estimate(obs, true_pos_probs, false_neg_probs, method="max")
471-
np.float64(0.8)
472-
>>> tmp = _true_estimate(obs, true_pos_probs, false_neg_probs, method="prod")
473-
>>> np.isclose(tmp, 0.56)
474-
np.True_
475-
"""
476-
true_llhs = np.where(obs, true_pos_probs, false_neg_probs)
477-
nans_masked = np.where(
478-
pd.isna(obs),
479-
1.0 if method == "prod" else 0.0,
480-
true_llhs,
481-
)
482-
method = getattr(np, method, _create_raising_func(method))
483-
return method(nans_masked)
484-
485-
486-
def _max_likelihood(
487-
obs: np.ndarray,
488-
specificities: np.ndarray,
434+
def _evaluate_likelihood_ratios(
    diagnosis_matrix: np.ndarray,
    sensitivities: np.ndarray,
    specificities: np.ndarray,
    method: Literal["max_llh", "rank"],
) -> np.ndarray:
    """Compare the likelihoods of true/false diagnoses using the given ``method``.

    The ``diagnosis_matrix`` is a 3D array of shape ``(n_modalities, n_patients,
    n_levels)``. The ``sensitivities`` and ``specificities`` are 1D arrays of shape
    ``(n_modalities,)``. When choosing the ``method="max_llh"``, the likelihood of each
    diagnosis is combined into one likelihood for each patient and level. With
    ``method="rank"``, the most trustworthy diagnosis is chosen for each patient and
    level.

    Missing diagnoses (``np.nan``) are ignored by both methods. The returned
    boolean array has shape ``(n_patients, n_levels)`` and is ``True`` wherever the
    likelihood of involvement is at least as large as the likelihood of being
    healthy. For a patient/level without any diagnosis the returned value carries
    no information (``True`` for ``"max_llh"``, ``False`` for ``"rank"``); callers
    are expected to mask such entries themselves.

    Raises a ``ValueError`` if ``method`` is neither ``"max_llh"`` nor ``"rank"``.
    """
    # Fail fast: reject an unknown method before doing any array arithmetic.
    if method not in {"max_llh", "rank"}:
        raise ValueError(f"Unknown method {method}")

    # Accept plain sequences as well and broadcast over patients and levels.
    sens = np.asarray(sensitivities, dtype=float)[:, None, None]
    spec = np.asarray(specificities, dtype=float)[:, None, None]

    # Since an observation is either 1.0 or 0.0, exactly one of the two terms in
    # each sum is non-zero. NaN observations propagate as NaN.
    true_pos = sens * diagnosis_matrix
    false_neg = (1 - sens) * (1 - diagnosis_matrix)
    true_neg = spec * (1 - diagnosis_matrix)
    false_pos = (1 - spec) * diagnosis_matrix

    if method == "max_llh":
        # NaNs count as a neutral factor of 1.0 in the product.
        true_llh = np.nanprod(true_pos + false_neg, axis=0)
        false_llh = np.nanprod(true_neg + false_pos, axis=0)
    else:
        # ``np.fmax`` ignores NaNs exactly like ``np.nanmax`` (an all-NaN slice
        # still yields NaN), but does not emit the "All-NaN slice encountered"
        # RuntimeWarning for patient/level pairs without any diagnosis.
        true_llh = np.fmax.reduce(true_pos + false_neg, axis=0)
        false_llh = np.fmax.reduce(true_neg + false_pos, axis=0)

    return true_llh >= false_llh
526462

527463

528464
def _expand_mapping(
@@ -726,53 +662,64 @@ def stats(
726662

727663
return stats
728664

665+
def _filter_and_sort_modalities(
    self,
    modalities: dict[str, ModalityConfig] | None = None,
) -> dict[str, ModalityConfig]:
    """Return only those ``modalities`` that are also present in the data.

    If ``modalities`` is ``None`` (or empty), the result of
    ``get_default_modalities()`` is used instead. The returned dictionary keeps
    the iteration order of that mapping.

    NOTE(review): despite the method's name, no re-sorting to the column order
    of the DataFrame happens here; only the filtering is performed.
    """
    modalities = modalities or get_default_modalities()
    # Query the data once instead of once per candidate modality.
    available = self.get_modalities()
    return {
        modality_name: modality_config
        for modality_name, modality_config in modalities.items()
        if modality_name in available
    }
676+
729677
def combine(
    self,
    modalities: dict[str, ModalityConfig] | None = None,
    method: Literal["max_llh", "rank"] = "max_llh",
) -> pd.DataFrame:
    """Combine diagnoses of ``modalities`` using ``method``.

    The order of the provided ``modalities`` does not matter, as it is aligned
    with the order in the DataFrame. With ``method="max_llh"``, the most likely
    true state of involvement is inferred based on all available diagnoses for
    each patient and level. With ``method="rank"``, only the most trustworthy
    diagnosis is chosen for each patient and level based on the sensitivity and
    specificity of the given list of ``modalities``.

    If none of the modalities reports a diagnosis for a patient's level, the
    combined value for that patient and level is ``None``.

    The result contains only the combined columns. The intended use is to
    :py:meth:`~pandas.DataFrame.update` the original DataFrame with the result.

    >>> df = pd.DataFrame({
    ...     ('CT'       , 'ipsi', 'I'): [False, True , False, True, None],
    ...     ('MRI'      , 'ipsi', 'I'): [False, True , True , None, None],
    ...     ('pathology', 'ipsi', 'I'): [True , None , None, False, None],
    ... })
    >>> df.ly.combine() # doctest: +NORMALIZE_WHITESPACE
        ipsi
           I
    0   True
    1   True
    2  False
    3  False
    4   None
    """
    modalities = self._filter_and_sort_modalities(modalities)

    diagnosis_stack = align_diagnoses(self._obj, list(modalities.keys()))
    diagnosis_matrix = _stack_to_float_matrix(diagnosis_stack)
    # Remember which patient/level pairs have no diagnosis at all; the boolean
    # comparison below cannot express "unknown" for them.
    all_nan_mask = np.all(np.isnan(diagnosis_matrix), axis=0)

    result = _evaluate_likelihood_ratios(
        diagnosis_matrix=diagnosis_matrix,
        sensitivities=np.array([mod.sens for mod in modalities.values()]),
        specificities=np.array([mod.spec for mod in modalities.values()]),
        method=method,
    )
    # An object array is needed to hold ``None`` next to booleans. The
    # ``ndarray.astype`` method is used because the function ``np.astype`` only
    # exists in NumPy 2.x.
    result = result.astype(object)
    result[all_nan_mask] = None
    return pd.DataFrame(result, columns=diagnosis_stack[0].columns)
776723

777724
def infer_sublevels(
778725
self,

0 commit comments

Comments
 (0)