
Add parameter to config and gui to set the min precursors required for update #460

Open
wants to merge 26 commits into base: main
Changes from 21 commits
Commits (26)
fcbcbbc
add parameter to config and gui to set the minimum number of precurso…
anna-charlotte Jan 29, 2025
f67756e
remove i greater 0 statement from two-step-classifier
anna-charlotte Feb 5, 2025
e621da2
add pretrained TwoStepClassifier model
anna-charlotte Feb 6, 2025
16ce82a
add check if n_samples > 10 in BinaryClassifierLegacyNewBatch
anna-charlotte Feb 10, 2025
716be78
2-step-clsf: add i > 0
anna-charlotte Feb 11, 2025
c8b9463
remove pretrained model
anna-charlotte Feb 11, 2025
5fc2fe9
2-step-clsf version: i >= 0, not pretrained
anna-charlotte Feb 11, 2025
0abc255
2-step-clsf version: i >= 0, pretrained
anna-charlotte Feb 11, 2025
18fa5ab
2-step-clsf version: i >= 0, not pretrained
anna-charlotte Feb 11, 2025
d93fae3
2-step-clsf version: i > 0, not pretrained
anna-charlotte Feb 11, 2025
8df18ea
remove print statements
anna-charlotte Feb 11, 2025
a6ee75d
add scaling of qval
anna-charlotte Feb 23, 2025
1fee785
fix group_column param
anna-charlotte Feb 24, 2025
8662354
fix scaling_factor
anna-charlotte Feb 24, 2025
82ea90d
fix scaling_factor
anna-charlotte Feb 24, 2025
d969206
reorganize TwoStepClassifier.fit_predict()
anna-charlotte Feb 27, 2025
6a168a3
add pretrained model file
anna-charlotte Feb 27, 2025
25e2aca
remove redundant parameters
anna-charlotte Feb 27, 2025
76aff18
move chek for enough samples
anna-charlotte Feb 27, 2025
033918d
clean up
anna-charlotte Feb 27, 2025
4ec9c01
Merge remote-tracking branch 'origin/main' into add-min-precursors-pa…
anna-charlotte Feb 27, 2025
d978ef4
adressing pr comments
anna-charlotte Feb 27, 2025
42b1c0c
fix os.scandir error
anna-charlotte Feb 27, 2025
1f4841f
add TODO
anna-charlotte Feb 28, 2025
927960f
fix typos
anna-charlotte Feb 28, 2025
f161030
pr comments
anna-charlotte Feb 28, 2025
Binary file not shown.
3 changes: 1 addition & 2 deletions alphadia/constants/default.yaml
@@ -280,9 +280,8 @@ fdr:
channel_wise_fdr: false

# (Experimental)
# uses a two-step classifier consisting of a logistic regression and a neural network, with a default maximum of 5 iterations per fitting call
# uses a two-step classifier consisting of a logistic regression and a neural network.
enable_two_step_classifier: false
two_step_classifier_max_iterations: 5
# (Experimental)
# Optimizes the batch size and learning rate of the neural network
enable_nn_hyperparameter_tuning: false
6 changes: 3 additions & 3 deletions alphadia/fdrexperimental.py
@@ -1150,6 +1150,9 @@ def fit(self, x: np.ndarray, y: np.ndarray):
x, y, test_size=self.test_size
)

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)

x_test = torch.Tensor(x_test)
y_test = torch.Tensor(y_test)

@@ -1161,9 +1164,6 @@

loss = nn.BCELoss()

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)

num_batches = (x_train.shape[0] // self.batch_size) - 1
batch_start_list = np.arange(num_batches) * self.batch_size
batch_stop_list = np.arange(num_batches) * self.batch_size + self.batch_size
197 changes: 107 additions & 90 deletions alphadia/fdrx/models/two_step_classifier.py
@@ -21,8 +21,7 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
second_classifier: Classifier,
first_fdr_cutoff: float = 0.6,
second_fdr_cutoff: float = 0.01,
min_precursors_for_update: int = 5000,
max_iterations: int = 5,
min_precursors_for_update: int = 200,
train_on_top_n: int = 1,
):
"""Initializing a two-step classifier.
@@ -37,10 +36,8 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
The fdr threshold for the first classifier, determining how selective the first classification step is.
second_fdr_cutoff : float, default=0.01
The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
min_precursors_for_update : int, default=5000
min_precursors_for_update : int, default=200
The minimum number of precursors required to update the first classifier.
max_iterations : int
Maximum number of refinement iterations during training.
train_on_top_n : int
Use candidates up to this rank for training. During inference, all ranks are used.

@@ -51,7 +48,6 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
self.second_fdr_cutoff = second_fdr_cutoff

self._min_precursors_for_update = min_precursors_for_update
self._max_iterations = max_iterations
self._train_on_top_n = train_on_top_n

logger.info(
@@ -67,11 +63,11 @@ def fit_predict(
y_col: str = "decoy",
group_columns: list[str] | None = None,
) -> pd.DataFrame:
"""Train the two-step classifier and predict precursors using an iterative approach.
"""Train the two-step classifier and predict precursors using the following approach.

1. First iteration: Train neural network on top-n candidates.
2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network.
3. Update linear classifier if enough high-confidence predictions are found, else break.
1. Train neural network on top-n candidates.
2. Update linear classifier if enough high-confidence predictions are found, else break.
3. Use linear classifier to filter data, then refine with neural network.

Parameters
----------
@@ -90,91 +86,63 @@
DataFrame containing predictions and q-values

"""
min_train_size = 1
logger.info("=== Starting training of TwoStepClassifier ===")

df = self._preprocess_data(df, x_cols)
best_result = None
df_train = df[df["rank"] < self._train_on_top_n]
Reviewer comment:
Just from the name, I would assume _train_on_top_n is a boolean, but then this line would make no sense. Maybe find a better name?
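For context, a minimal illustration of what this filter keeps. The data is made up, and 0-based ranks are assumed (consistent with the max(predict_df['rank']) + 1 expression later in this diff):

import pandas as pd

# Toy data: two ranked candidates for precursor "a", one for precursor "b".
df = pd.DataFrame({"precursor": ["a", "a", "b"], "rank": [0, 1, 0]})

train_on_top_n = 1
df_train = df[df["rank"] < train_on_top_n]  # keeps only the rank-0 (top-1) candidate per precursor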

df_predict = df

# tracking precursors identified at fdr cutoffs `self.first_fdr_cutoff` and `self.second_fdr_cutoff`
previous_target_count_after_first_clf = -1
previous_target_count_after_second_clf = -1

for i in range(self._max_iterations):
logger.info(f"Starting iteration {i + 1} / {self._max_iterations}.")

# extract preselction using first classifier if it is fitted
if self.first_classifier.fitted and i > 0:
df_train = self._apply_filtering_with_first_classifier(
df, x_cols, group_columns
)
df_predict = df_train # using the same df for training and predicting, unlike in the following else block.
logger.info(
f"Application of first classifier at fdr={self.first_fdr_cutoff} results in "
f"{len(df_train):,} samples ({get_target_count(df_train):,} precursors)"
)

previous_target_count_after_first_clf = get_target_count(df_train)
self.second_classifier.epochs = 50
else:
logger.debug("First classifier not fitted yet. Proceeding without it.")
df_train = df[df["rank"] < self._train_on_top_n]
df_predict = df
# train and apply NN classifier
self.second_classifier.epochs = 10
Reviewer comment:
deliberately hardcoded?

Author reply:
Yes, but the value was chosen somewhat arbitrarily. The reason I set it to 10 was to avoid the error where BinaryClassifierLegacyNewBatching.fit() crashes in model_selection.train_test_split(x, y, test_size=self.test_size) when there aren't enough samples in x for splitting. 10 worked for me, but I agree it's not ideal and, as I said, was chosen a bit at random. Do you have a suggestion on what to do instead?

Reviewer reply:
At least make it either a module-wide constant, or create a new method parameter and set this value as its default (then it is more obvious that there is a knob to tune). If it makes sense to have the user tune it, expose it in the config.

Author reply:
Oops, sorry, I mixed something up here and was talking about something else. Yes, this one is deliberately set to 10, but I will add a #TODO to line 126, where we set it to 50.
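For reference, a minimal sketch of the kind of guard discussed above. MIN_SAMPLES_FOR_SPLIT and split_train_test are hypothetical names for illustration, not the check actually added to BinaryClassifierLegacyNewBatching:

import numpy as np
from sklearn import model_selection

# Hypothetical module-wide constant, as suggested in the comment above.
MIN_SAMPLES_FOR_SPLIT = 10


def split_train_test(x: np.ndarray, y: np.ndarray, test_size: float = 0.2):
    """Split into train/test, skipping the split when there are too few samples."""
    if x.shape[0] < MIN_SAMPLES_FOR_SPLIT:
        # Too few samples for a meaningful test split: use all data for both.
        return x, x, y, y
    return model_selection.train_test_split(x, y, test_size=test_size)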

df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
best_result = df_after_second_clf

self.second_classifier.epochs = 10
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
target_count_after_second_clf = get_target_count(df_filtered)
logger.info(
f"{target_count_after_second_clf:,} targets found "
f"after second classifier, at fdr={self.second_fdr_cutoff}"
)

# train and apply second classifier
df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
# stop if not enough targets found after NN classifier
if target_count_after_second_clf < self._min_precursors_for_update:
return best_result

df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
current_target_count = get_target_count(df_filtered)
# update and use the linear classifier
self._update_first_classifier(df_filtered, df, x_cols, y_col, group_columns)
df_train = self._apply_filtering_with_first_classifier(
df, x_cols, group_columns
)
if len(df_train) < min_train_size:
return best_result

if current_target_count < previous_target_count_after_second_clf:
logger.info(
f"Training stopped on iteration {i + 1}. Decrease in precursor count from "
f"{previous_target_count_after_second_clf:,} to {current_target_count:,}."
)
return best_result
df_predict = df_train # using the same df for training and predicting
previous_target_count_after_first_clf = get_target_count(df_train)

previous_target_count_after_second_clf = current_target_count
best_result = df_after_second_clf # TODO: Remove if multiple iterations are dropped to save memory.
# train and apply second classifier
self.second_classifier.epochs = 50
df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
current_target_count = get_target_count(df_filtered)

logger.info(
f"Application of second classifier at fdr={self.second_fdr_cutoff} results in "
f"{get_target_count(df_train):,} precursors."
)
if current_target_count > target_count_after_second_clf:
target_count_after_second_clf = current_target_count
best_result = df_after_second_clf

# update first classifier if enough confident predictions
if current_target_count > self._min_precursors_for_update:
target_count_after_first_clf, new_classifier = (
self._fit_and_eval_first_classifier(
df_filtered, df, x_cols, y_col, group_columns
)
self._update_first_classifier(
df_filtered,
df,
x_cols,
y_col,
group_columns,
previous_target_count_after_first_clf,
)
if target_count_after_first_clf > previous_target_count_after_first_clf:
logger.debug(
f"Update of first classifier initiated: previous version had {previous_target_count_after_first_clf:,} "
f"precursors, current version has {target_count_after_first_clf:,} precursors."
)
self.first_classifier = new_classifier
previous_target_count_after_first_clf = target_count_after_first_clf

else:
logger.debug(
f"Update of first classifier skipped: previous version had {previous_target_count_after_first_clf:,} "
f"precursors, current version has {target_count_after_first_clf:,} precursors."
)
else:
logger.info(
f"=== Insufficient precursors detected; ending after {i + 1} iterations ==="
)
break
else:
logger.info(
f"=== Stopping fitting after reaching the maximum number of iterations: "
f"{self._max_iterations} / {self._max_iterations} ==="
)

return best_result

@@ -187,11 +155,22 @@ def _apply_filtering_with_first_classifier(
self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str]
) -> pd.DataFrame:
"""Apply first classifier to filter data for the training of the second classifier."""
n_precursors = get_target_count(df)
logger.info(
f"Applying first classifier to {len(df):,} precursors ({n_precursors:,} targets)"
)

df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1]

return compute_and_filter_q_values(
filtered_df = compute_and_filter_q_values(
df, self.first_fdr_cutoff, group_columns, remove_decoys=False
)
logger.info(
f"Preselection of first classifier at fdr={self.first_fdr_cutoff} results in "
f"{len(filtered_df):,} precursors ({get_target_count(filtered_df):,} targets)"
)

return filtered_df

def _train_and_apply_second_classifier(
self,
@@ -202,24 +181,35 @@
group_columns: list[str],
) -> pd.DataFrame:
"""Train second_classifier and apply it to get predictions."""
logger.info(
f"Training second classifier on {len(train_df):,} precursors "
f"({get_target_count(train_df):,} targets, top_n={self._train_on_top_n})"
)

self.second_classifier.fit(
train_df[x_cols].to_numpy().astype(np.float32),
train_df[y_col].to_numpy().astype(np.float32),
)

logger.info(
f"Applying second classifier to {len(predict_df):,} precursors "
f"({get_target_count(predict_df):,} targets, top_n={max(predict_df['rank']) + 1})"
)

x = predict_df[x_cols].to_numpy().astype(np.float32)
predict_df["proba"] = self.second_classifier.predict_proba(x)[:, 1]

return compute_q_values(predict_df, group_columns)

def _fit_and_eval_first_classifier(
def _update_first_classifier( # noqa: PLR0913
self,
subset_df: pd.DataFrame,
full_df: pd.DataFrame,
x_cols: list[str],
y_col: str,
group_columns: list[str],
) -> tuple[int, Classifier]:
previous_count: int = -1,
) -> None:
"""Fits a copy of the first classifier on a given subset and applies it to the full dataset.

Returns the number of targets found and the trained classifier.
@@ -231,18 +221,27 @@ def _fit_and_eval_first_classifier(
x_all = full_df[x_cols].to_numpy()
reduced_df = full_df[[*group_columns, "decoy"]]

logger.info(f"Fitting first classifier on {len(df_train):,} samples.")
logger.info(
f"Fitting first classifier on {len(df_train):,} precursors, applying it to {len(x_all):,} precursors."
)
new_classifier = copy.deepcopy(self.first_classifier)
new_classifier.fit(x_train, y_train)

logger.info(f"Applying first classifier to {len(x_all):,} samples.")
reduced_df["proba"] = new_classifier.predict_proba(x_all)[:, 1]
df_targets = compute_and_filter_q_values(
reduced_df, self.first_fdr_cutoff, group_columns
)
n_targets = get_target_count(df_targets)

return n_targets, new_classifier
# update first classifier if improved
if n_targets > previous_count:
logger.info(
f"Updating the first classifier as new target count increased: {n_targets:,} > {previous_count:,}"
)
self.first_classifier = new_classifier
previous_count = n_targets

# return previous_count

@property
def fitted(self) -> bool:
@@ -288,12 +287,30 @@ def get_target_count(df: pd.DataFrame) -> int:


def compute_q_values(
df: pd.DataFrame, group_columns: list[str] | None = None
df: pd.DataFrame,
group_columns: list[str] | None = None,
qval_col: str = "qval",
scale_by_target_decoy_ratio: bool = True, # noqa: FBT001, FBT002
) -> pd.DataFrame:
"""Compute q-values for each entry after keeping only best entries per group."""
scaling_factor = 1.0
if scale_by_target_decoy_ratio:
n_targets = (df["decoy"] == 0).sum()
n_decoys = (df["decoy"] == 1).sum()
scaling_factor = round(n_targets / n_decoys, 3)
Reviewer comment:
Please avoid raising a ZeroDivisionError here, either by catching it or by adding a small epsilon to the denominator (not sure how the latter would affect the isfinite check, though).
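A minimal sketch of one way to guard this division; safe_target_decoy_ratio is a hypothetical helper for illustration, not the fix applied in this PR:

import numpy as np


def safe_target_decoy_ratio(n_targets: int, n_decoys: int) -> float:
    """Return the target/decoy ratio, falling back to 1.0 when it is undefined."""
    if n_decoys == 0:
        # No decoys: scaling would be meaningless, so leave q-values unchanged.
        return 1.0
    ratio = round(n_targets / n_decoys, 3)
    return ratio if np.isfinite(ratio) and ratio > 0 else 1.0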

if not np.isfinite(scaling_factor) or scaling_factor == 0:
scaling_factor = 1.0

df.sort_values("proba", ascending=True, inplace=True)
df = keep_best(df, group_columns=group_columns)
return get_q_values(df, "proba", "decoy")
df = get_q_values(df, "proba", "decoy", qval_col)

logger.info(
f"Normalizing q-values using {n_targets:,} targets and {n_decoys:,} decoys (scaling factor = {scaling_factor})"
)
df[qval_col] = df[qval_col] * scaling_factor

return df
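As a worked example of the scaling above (illustrative numbers, not taken from the PR):

n_targets, n_decoys = 9_000, 3_000
scaling_factor = round(n_targets / n_decoys, 3)  # 3.0
raw_qval = 0.003
scaled_qval = raw_qval * scaling_factor  # approximately 0.009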


def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame:
27 changes: 14 additions & 13 deletions alphadia/workflow/manager.py
@@ -786,6 +786,8 @@ def save_classifier_store(self, path: None | str = None, version: int = -1):
path = os.path.join(
os.path.dirname(alphadia.__file__), "constants", "classifier"
)
if self.is_two_step_classifier:
path = os.path.join(path, "two_step_classifier")

logger.info(f"Saving classifier store to {path}")

@@ -808,22 +810,21 @@ def load_classifier_store(self, path: None | str = None):
path = os.path.join(
os.path.dirname(alphadia.__file__), "constants", "classifier"
)
if self.is_two_step_classifier:
path = os.path.join(path, "two_step_classifier")

logger.info(f"Loading classifier store from {path}")

if (
not self.is_two_step_classifier
): # TODO add pretrained model for TwoStepClassifier
for file in os.listdir(path):
if file.endswith(".pth"):
classifier_hash = file.split(".")[0]

if classifier_hash not in self.classifier_store:
classifier = deepcopy(self.classifier_base)
classifier.from_state_dict(
torch.load(os.path.join(path, file), weights_only=False)
)
self.classifier_store[classifier_hash].append(classifier)
for file in os.listdir(path):
Reviewer comment:
https://www.stuartellis.name/articles/python-modern-practices/#use-osscandir-instead-of-oslistdir
I would not have expected that reading that article yesterday would come in handy so quickly ;-) (a sketch of an os.scandir variant follows after this loop).

if file.endswith(".pth"):
classifier_hash = file.split(".")[0]

if classifier_hash not in self.classifier_store:
classifier = deepcopy(self.classifier_base)
classifier.from_state_dict(
torch.load(os.path.join(path, file), weights_only=False)
)
self.classifier_store[classifier_hash].append(classifier)
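As referenced in the review comment above, a minimal sketch of an os.scandir-based variant of this loop; iter_classifier_files is a hypothetical helper for illustration, not necessarily how the PR resolves the comment:

import os


def iter_classifier_files(path: str):
    """Yield (classifier_hash, full_path) for every .pth file, using os.scandir instead of os.listdir."""
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(".pth"):
                classifier_hash = entry.name.split(".")[0]
                yield classifier_hash, entry.path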

def get_classifier(self, available_columns: list, version: int = -1):
"""Gets the classifier for a given set of feature columns and version. If the classifier is not found in the store, gets the base classifier instead.