
Add parameter to config and gui to set the min precursors required for update #460

Open
wants to merge 26 commits into base: main
Changes from 21 commits
Commits (26)
fcbcbbc
add parameter to config and gui to set the minimum number of precurso…
anna-charlotte Jan 29, 2025
f67756e
remove i greater 0 statement from two-step-classifier
anna-charlotte Feb 5, 2025
e621da2
add pretrained TwoStepClassifier model
anna-charlotte Feb 6, 2025
16ce82a
add check if n_samples > 10 in BinaryClassifierLegacyNewBatch
anna-charlotte Feb 10, 2025
716be78
2-step-clsf: add i > 0
anna-charlotte Feb 11, 2025
c8b9463
remove pretrained model
anna-charlotte Feb 11, 2025
5fc2fe9
2-step-clsf version: i >= 0, not pretrained
anna-charlotte Feb 11, 2025
0abc255
2-step-clsf version: i >= 0, pretrained
anna-charlotte Feb 11, 2025
18fa5ab
2-step-clsf version: i >= 0, not pretrained
anna-charlotte Feb 11, 2025
d93fae3
2-step-clsf version: i > 0, not pretrained
anna-charlotte Feb 11, 2025
8df18ea
remove print statements
anna-charlotte Feb 11, 2025
a6ee75d
add scaling of qval
anna-charlotte Feb 23, 2025
1fee785
fix group_column param
anna-charlotte Feb 24, 2025
8662354
fix scaling_factor
anna-charlotte Feb 24, 2025
82ea90d
fix scaling_factor
anna-charlotte Feb 24, 2025
d969206
reorganize TwoStepClassifier.fit_predict()
anna-charlotte Feb 27, 2025
6a168a3
add pretrained model file
anna-charlotte Feb 27, 2025
25e2aca
remove redundant parameters
anna-charlotte Feb 27, 2025
76aff18
move chek for enough samples
anna-charlotte Feb 27, 2025
033918d
clean up
anna-charlotte Feb 27, 2025
4ec9c01
Merge remote-tracking branch 'origin/main' into add-min-precursors-pa…
anna-charlotte Feb 27, 2025
d978ef4
adressing pr comments
anna-charlotte Feb 27, 2025
42b1c0c
fix os.scandir error
anna-charlotte Feb 27, 2025
1f4841f
add TODO
anna-charlotte Feb 28, 2025
927960f
fix typos
anna-charlotte Feb 28, 2025
f161030
pr comments
anna-charlotte Feb 28, 2025
Binary file not shown.
3 changes: 1 addition & 2 deletions alphadia/constants/default.yaml
@@ -280,9 +280,8 @@ fdr:
channel_wise_fdr: false

# (Experimental)
# uses a two-step classifier consisting of a logistic regression and a neural network, with a default maximum of 5 iterations per fitting call
# uses a two-step classifier consisting of a logistic regression and a neural network.
enable_two_step_classifier: false
two_step_classifier_max_iterations: 5
# (Experimental)
# Optimizes the batch size and learning rate of the neural network
enable_nn_hyperparameter_tuning: false
6 changes: 3 additions & 3 deletions alphadia/fdrexperimental.py
@@ -1150,6 +1150,9 @@ def fit(self, x: np.ndarray, y: np.ndarray):
x, y, test_size=self.test_size
)

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)

x_test = torch.Tensor(x_test)
y_test = torch.Tensor(y_test)

@@ -1161,9 +1164,6 @@

loss = nn.BCELoss()

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)

num_batches = (x_train.shape[0] // self.batch_size) - 1
batch_start_list = np.arange(num_batches) * self.batch_size
batch_stop_list = np.arange(num_batches) * self.batch_size + self.batch_size
197 changes: 107 additions & 90 deletions alphadia/fdrx/models/two_step_classifier.py
@@ -21,8 +21,7 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
second_classifier: Classifier,
first_fdr_cutoff: float = 0.6,
second_fdr_cutoff: float = 0.01,
min_precursors_for_update: int = 5000,
max_iterations: int = 5,
min_precursors_for_update: int = 200,
train_on_top_n: int = 1,
):
"""Initializing a two-step classifier.
@@ -37,10 +36,8 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
The fdr threshold for the first classifier, determining how selective the first classification step is.
second_fdr_cutoff : float, default=0.01
The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
min_precursors_for_update : int, default=5000
min_precursors_for_update : int, default=200
The minimum number of precursors required to update the first classifier.
max_iterations : int
Maximum number of refinement iterations during training.
train_on_top_n : int
Use candidates up to this rank for training. During inference, all ranks are used.

@@ -51,7 +48,6 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
self.second_fdr_cutoff = second_fdr_cutoff

self._min_precursors_for_update = min_precursors_for_update
self._max_iterations = max_iterations
self._train_on_top_n = train_on_top_n

logger.info(
@@ -67,11 +63,11 @@ def fit_predict(
y_col: str = "decoy",
group_columns: list[str] | None = None,
) -> pd.DataFrame:
"""Train the two-step classifier and predict precursors using an iterative approach.
"""Train the two-step classifier and predict precursors using the following approach.

1. First iteration: Train neural network on top-n candidates.
2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network.
3. Update linear classifier if enough high-confidence predictions are found, else break.
1. Train neural network on top-n candidates.
2. Update linear classifier if enough high-confidence predictions are found, else break.
3. Use linear classifier to filter data, then refine with neural network.

Parameters
----------
@@ -90,91 +86,63 @@
DataFrame containing predictions and q-values

"""
min_train_size = 1
logger.info("=== Starting training of TwoStepClassifier ===")

df = self._preprocess_data(df, x_cols)
best_result = None
df_train = df[df["rank"] < self._train_on_top_n]
Reviewer comment:
Just from the name, I would assume _train_on_top_n is a boolean, but then this line would make no sense. Maybe find a better name?
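For context, a minimal illustration of what this filter keeps. The data is made up, and 0-based ranks are assumed (consistent with the max(predict_df['rank']) + 1 expression later in this diff):

import pandas as pd

# Toy data: two ranked candidates for precursor "a", one for precursor "b".
df = pd.DataFrame({"precursor": ["a", "a", "b"], "rank": [0, 1, 0]})

train_on_top_n = 1
df_train = df[df["rank"] < train_on_top_n]  # keeps only the rank-0 (top-1) candidate per precursor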

df_predict = df

# tracking precursors identified at fdr cutoffs `self.first_fdr_cutoff` and `self.second_fdr_cutoff`
previous_target_count_after_first_clf = -1
previous_target_count_after_second_clf = -1

for i in range(self._max_iterations):
logger.info(f"Starting iteration {i + 1} / {self._max_iterations}.")

# extract preselction using first classifier if it is fitted
if self.first_classifier.fitted and i > 0:
df_train = self._apply_filtering_with_first_classifier(
df, x_cols, group_columns
)
df_predict = df_train # using the same df for training and predicting, unlike in the following else block.
logger.info(
f"Application of first classifier at fdr={self.first_fdr_cutoff} results in "
f"{len(df_train):,} samples ({get_target_count(df_train):,} precursors)"
)

previous_target_count_after_first_clf = get_target_count(df_train)
self.second_classifier.epochs = 50
else:
logger.debug("First classifier not fitted yet. Proceeding without it.")
df_train = df[df["rank"] < self._train_on_top_n]
df_predict = df
# train and apply NN classifier
self.second_classifier.epochs = 10
Reviewer comment:
deliberately hardcoded?

Author reply:
Yes, but the value was chosen somewhat arbitrarily. The reason I set it to 10 was to avoid the error where BinaryClassifierLegacyNewBatching.fit() crashes in model_selection.train_test_split(x, y, test_size=self.test_size) when there aren't enough samples in x for splitting. 10 worked for me, but I agree it's not ideal and, as I said, was chosen a bit at random. Do you have a suggestion on what to do instead?

Reviewer reply:
At least make it either a module-wide constant, or create a new method parameter and set this value as its default (then it is more obvious that there is a knob to tune). If it makes sense to have the user tune it, expose it in the config.

Author reply:
Oops, sorry, I mixed something up here and was talking about something else. Yes, this one is deliberately set to 10, but I will add a #TODO to line 126, where we set it to 50.
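For reference, a minimal sketch of the kind of guard discussed above. MIN_SAMPLES_FOR_SPLIT and split_train_test are hypothetical names for illustration, not the check actually added to BinaryClassifierLegacyNewBatching:

import numpy as np
from sklearn import model_selection

# Hypothetical module-wide constant, as suggested in the comment above.
MIN_SAMPLES_FOR_SPLIT = 10


def split_train_test(x: np.ndarray, y: np.ndarray, test_size: float = 0.2):
    """Split into train/test, skipping the split when there are too few samples."""
    if x.shape[0] < MIN_SAMPLES_FOR_SPLIT:
        # Too few samples for a meaningful test split: use all data for both.
        return x, x, y, y
    return model_selection.train_test_split(x, y, test_size=test_size)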

df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
best_result = df_after_second_clf

self.second_classifier.epochs = 10
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
target_count_after_second_clf = get_target_count(df_filtered)
logger.info(
f"{target_count_after_second_clf:,} targets found "
f"after second classifier, at fdr={self.second_fdr_cutoff}"
)

# train and apply second classifier
df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
# stop if not enough targets found after NN classifier
if target_count_after_second_clf < self._min_precursors_for_update:
return best_result

df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
current_target_count = get_target_count(df_filtered)
# update and use the linear classifier
self._update_first_classifier(df_filtered, df, x_cols, y_col, group_columns)
df_train = self._apply_filtering_with_first_classifier(
df, x_cols, group_columns
)
if len(df_train) < min_train_size:
return best_result

if current_target_count < previous_target_count_after_second_clf:
logger.info(
f"Training stopped on iteration {i + 1}. Decrease in precursor count from "
f"{previous_target_count_after_second_clf:,} to {current_target_count:,}."
)
return best_result
df_predict = df_train # using the same df for training and predicting
previous_target_count_after_first_clf = get_target_count(df_train)

previous_target_count_after_second_clf = current_target_count
best_result = df_after_second_clf # TODO: Remove if multiple iterations are dropped to save memory.
# train and apply second classifier
self.second_classifier.epochs = 50
df_after_second_clf = self._train_and_apply_second_classifier(
df_train, df_predict, x_cols, y_col, group_columns
)
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff)
current_target_count = get_target_count(df_filtered)

logger.info(
f"Application of second classifier at fdr={self.second_fdr_cutoff} results in "
f"{get_target_count(df_train):,} precursors."
)
if current_target_count > target_count_after_second_clf:
target_count_after_second_clf = current_target_count
best_result = df_after_second_clf

# update first classifier if enough confident predictions
if current_target_count > self._min_precursors_for_update:
target_count_after_first_clf, new_classifier = (
self._fit_and_eval_first_classifier(
df_filtered, df, x_cols, y_col, group_columns
)
self._update_first_classifier(
df_filtered,
df,
x_cols,
y_col,
group_columns,
previous_target_count_after_first_clf,
)
if target_count_after_first_clf > previous_target_count_after_first_clf:
logger.debug(
f"Update of first classifier initiated: previous version had {previous_target_count_after_first_clf:,} "
f"precursors, current version has {target_count_after_first_clf:,} precursors."
)
self.first_classifier = new_classifier
previous_target_count_after_first_clf = target_count_after_first_clf

else:
logger.debug(
f"Update of first classifier skipped: previous version had {previous_target_count_after_first_clf:,} "
f"precursors, current version has {target_count_after_first_clf:,} precursors."
)
else:
logger.info(
f"=== Insufficient precursors detected; ending after {i + 1} iterations ==="
)
break
else:
logger.info(
f"=== Stopping fitting after reaching the maximum number of iterations: "
f"{self._max_iterations} / {self._max_iterations} ==="
)

return best_result

@@ -187,11 +155,22 @@ def _apply_filtering_with_first_classifier(
self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str]
) -> pd.DataFrame:
"""Apply first classifier to filter data for the training of the second classifier."""
n_precursors = get_target_count(df)
logger.info(
f"Applying first classifier to {len(df):,} precursors ({n_precursors:,} targets)"
)

df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1]

return compute_and_filter_q_values(
filtered_df = compute_and_filter_q_values(
df, self.first_fdr_cutoff, group_columns, remove_decoys=False
)
logger.info(
f"Preselection of first classifier at fdr={self.first_fdr_cutoff} results in "
f"{len(filtered_df):,} precursors ({get_target_count(filtered_df):,} targets)"
)

return filtered_df

def _train_and_apply_second_classifier(
self,
@@ -202,24 +181,35 @@
group_columns: list[str],
) -> pd.DataFrame:
"""Train second_classifier and apply it to get predictions."""
logger.info(
f"Training second classifier on {len(train_df):,} precursors "
f"({get_target_count(train_df):,} targets, top_n={self._train_on_top_n})"
)

self.second_classifier.fit(
train_df[x_cols].to_numpy().astype(np.float32),
train_df[y_col].to_numpy().astype(np.float32),
)

logger.info(
f"Applying second classifier to {len(predict_df):,} precursors "
f"({get_target_count(predict_df):,} targets, top_n={max(predict_df['rank']) + 1})"
)

x = predict_df[x_cols].to_numpy().astype(np.float32)
predict_df["proba"] = self.second_classifier.predict_proba(x)[:, 1]

return compute_q_values(predict_df, group_columns)

def _fit_and_eval_first_classifier(
def _update_first_classifier( # noqa: PLR0913
self,
subset_df: pd.DataFrame,
full_df: pd.DataFrame,
x_cols: list[str],
y_col: str,
group_columns: list[str],
) -> tuple[int, Classifier]:
previous_count: int = -1,
) -> None:
"""Fits a copy of the first classifier on a given subset and applies it to the full dataset.

Returns the number of targets found and the trained classifier.
@@ -231,18 +221,27 @@ def _fit_and_eval_first_classifier(
x_all = full_df[x_cols].to_numpy()
reduced_df = full_df[[*group_columns, "decoy"]]

logger.info(f"Fitting first classifier on {len(df_train):,} samples.")
logger.info(
f"Fitting first classifier on {len(df_train):,} precursors, applying it to {len(x_all):,} precursors."
)
new_classifier = copy.deepcopy(self.first_classifier)
new_classifier.fit(x_train, y_train)

logger.info(f"Applying first classifier to {len(x_all):,} samples.")
reduced_df["proba"] = new_classifier.predict_proba(x_all)[:, 1]
df_targets = compute_and_filter_q_values(
reduced_df, self.first_fdr_cutoff, group_columns
)
n_targets = get_target_count(df_targets)

return n_targets, new_classifier
# update first classifier if improved
if n_targets > previous_count:
logger.info(
f"Updating the first classifier as new target count increased: {n_targets:,} > {previous_count:,}"
)
self.first_classifier = new_classifier
previous_count = n_targets

# return previous_count

@property
def fitted(self) -> bool:
@@ -288,12 +287,30 @@ def get_target_count(df: pd.DataFrame) -> int:


def compute_q_values(
df: pd.DataFrame, group_columns: list[str] | None = None
df: pd.DataFrame,
group_columns: list[str] | None = None,
qval_col: str = "qval",
scale_by_target_decoy_ratio: bool = True, # noqa: FBT001, FBT002
) -> pd.DataFrame:
"""Compute q-values for each entry after keeping only best entries per group."""
scaling_factor = 1.0
if scale_by_target_decoy_ratio:
n_targets = (df["decoy"] == 0).sum()
n_decoys = (df["decoy"] == 1).sum()
scaling_factor = round(n_targets / n_decoys, 3)
Reviewer comment:
Please avoid raising a ZeroDivisionError here, either by catching it or by adding a small epsilon to the denominator (not sure how the latter would affect the isfinite check, though).
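A minimal sketch of one way to guard this division; safe_target_decoy_ratio is a hypothetical helper for illustration, not the fix applied in this PR:

import numpy as np


def safe_target_decoy_ratio(n_targets: int, n_decoys: int) -> float:
    """Return the target/decoy ratio, falling back to 1.0 when it is undefined."""
    if n_decoys == 0:
        # No decoys: scaling would be meaningless, so leave q-values unchanged.
        return 1.0
    ratio = round(n_targets / n_decoys, 3)
    return ratio if np.isfinite(ratio) and ratio > 0 else 1.0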

if not np.isfinite(scaling_factor) or scaling_factor == 0:
scaling_factor = 1.0

df.sort_values("proba", ascending=True, inplace=True)
df = keep_best(df, group_columns=group_columns)
return get_q_values(df, "proba", "decoy")
df = get_q_values(df, "proba", "decoy", qval_col)

logger.info(
f"Normalizing q-values using {n_targets:,} targets and {n_decoys:,} decoys (scaling factor = {scaling_factor})"
)
df[qval_col] = df[qval_col] * scaling_factor

return df
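As a worked example of the scaling above (illustrative numbers, not taken from the PR):

n_targets, n_decoys = 9_000, 3_000
scaling_factor = round(n_targets / n_decoys, 3)  # 3.0
raw_qval = 0.003
scaled_qval = raw_qval * scaling_factor  # approximately 0.009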


def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame:
27 changes: 14 additions & 13 deletions alphadia/workflow/manager.py
@@ -786,6 +786,8 @@ def save_classifier_store(self, path: None | str = None, version: int = -1):
path = os.path.join(
os.path.dirname(alphadia.__file__), "constants", "classifier"
)
if self.is_two_step_classifier:
path = os.path.join(path, "two_step_classifier")

logger.info(f"Saving classifier store to {path}")

@@ -808,22 +810,21 @@ def load_classifier_store(self, path: None | str = None):
path = os.path.join(
os.path.dirname(alphadia.__file__), "constants", "classifier"
)
if self.is_two_step_classifier:
path = os.path.join(path, "two_step_classifier")

logger.info(f"Loading classifier store from {path}")

if (
not self.is_two_step_classifier
): # TODO add pretrained model for TwoStepClassifier
for file in os.listdir(path):
if file.endswith(".pth"):
classifier_hash = file.split(".")[0]

if classifier_hash not in self.classifier_store:
classifier = deepcopy(self.classifier_base)
classifier.from_state_dict(
torch.load(os.path.join(path, file), weights_only=False)
)
self.classifier_store[classifier_hash].append(classifier)
for file in os.listdir(path):
Reviewer comment:
https://www.stuartellis.name/articles/python-modern-practices/#use-osscandir-instead-of-oslistdir
I would not have expected that reading that article yesterday would come in handy so quickly ;-) (a sketch of an os.scandir variant follows after this loop).

if file.endswith(".pth"):
classifier_hash = file.split(".")[0]

if classifier_hash not in self.classifier_store:
classifier = deepcopy(self.classifier_base)
classifier.from_state_dict(
torch.load(os.path.join(path, file), weights_only=False)
)
self.classifier_store[classifier_hash].append(classifier)
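As referenced in the review comment above, a minimal sketch of an os.scandir-based variant of this loop; iter_classifier_files is a hypothetical helper for illustration, not necessarily how the PR resolves the comment:

import os


def iter_classifier_files(path: str):
    """Yield (classifier_hash, full_path) for every .pth file, using os.scandir instead of os.listdir."""
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(".pth"):
                classifier_hash = entry.name.split(".")[0]
                yield classifier_hash, entry.path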

def get_classifier(self, available_columns: list, version: int = -1):
"""Gets the classifier for a given set of feature columns and version. If the classifier is not found in the store, gets the base classifier instead.