From 7b7c7fd7c21ea59fed4002f296eece9ac59032b7 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 16 Jul 2024 16:13:15 +0200 Subject: [PATCH 01/10] #183 init the hla tutorial --- docs/nbs/tutorial_HLA_prediction.rst | 5 - docs/notebooks.rst | 1 - .../tutorial_immunopeptidomics.ipynb | 587 ++++++++++++++++++ peptdeep/hla/hla_utils.py | 8 +- 4 files changed, 591 insertions(+), 10 deletions(-) delete mode 100644 docs/nbs/tutorial_HLA_prediction.rst create mode 100644 docs/tutorials/tutorial_immunopeptidomics.ipynb diff --git a/docs/nbs/tutorial_HLA_prediction.rst b/docs/nbs/tutorial_HLA_prediction.rst deleted file mode 100644 index 544073bf..00000000 --- a/docs/nbs/tutorial_HLA_prediction.rst +++ /dev/null @@ -1,5 +0,0 @@ -Tutorial: HLA prediction -========================== - -Check `HLA1_Classifier.ipynb `_ -in `PeptDeep-HLA `_ repo. diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 1ba96c8a..fa607aab 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -9,7 +9,6 @@ Tutorials and notebooks about how to use AlphaPeptDeep nbs/tutorial_models_from_scratch nbs/tutorial_speclib_from_fasta nbs/alphapeptdeep_hdf_to_tsv - nbs/tutorial_HLA_prediction nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb new file mode 100644 index 00000000..d71290eb --- /dev/null +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using peptdeep for MHC class I immunopeptidomics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that pydivsufsort package is not installed by peptdeep by default. Install by:\n", + "```\n", + "pip install \"peptdeep[development,hla]\"\n", + "```\n", + "\n", + "Or install within jupyter notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q pydivsufsort" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unspecific digestion in alphabase" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", + "\n", + "Unspecific digestion in alphabase involves two steps:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Concatenate protein sequences into a single sequence, separated by a sentinel character, e.g., '$'. 
For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def concat_sequences_for_nonspecific_digestion(seq_list, sep=\"$\"):\n", + " return sep + sep.join(seq_list) + sep" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prot_seq_list = [\"MABCDEKFGHIJKLMNOPQRST\",\"FGHIJKLMNOPQR\"]\n", + "cat_prot = concat_sequences_for_nonspecific_digestion(prot_seq_list, sep=\"$\")\n", + "cat_prot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the first and last sentinel characters are crutial as well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific sequences from the concatenated sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos\n", + "0 1 9\n", + "1 1 10\n", + "2 1 11\n", + "3 1 12\n", + "4 1 13\n", + ".. ... ...\n", + "79 13 22\n", + "80 13 23\n", + "81 14 22\n", + "82 14 23\n", + "83 15 23\n", + "\n", + "[84 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from alphabase.protein.lcp_digest import get_substring_indices\n", + "import pandas as pd\n", + "\n", + "start_idxes, stop_idxes = get_substring_indices(\n", + " cat_prot, min_len=8, max_len=14, stop_char=\"$\"\n", + ")\n", + "digest_pos_df = pd.DataFrame({\n", + " \"start_pos\": start_idxes,\n", + " \"stop_pos\": stop_idxes,\n", + "})\n", + "digest_pos_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All unspecific peptides can be localted by the `start_pos` and `stop_pos` in `digest_pos_df`, and all peptides are non-redundant guaranteed by the LCP algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos\n", + "0 1 8\n", + "1 1 9\n", + "2 1 10\n", + "3 1 11\n", + "4 1 12\n", + "... ... ...\n", + "54935 9987 9995\n", + "54936 9987 9996\n", + "54937 9988 9995\n", + "54938 9988 9996\n", + "54939 9989 9996\n", + "\n", + "[54940 rows x 2 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import random\n", + "import string\n", + "random.seed(0)\n", + "cat_seq = '$'+''.join(random.choices(string.ascii_uppercase+'$', k=10000))+'$'\n", + "start_idxes, stop_idxes = get_substring_indices(cat_seq, min_len=7, max_len=14)\n", + "digest_pos_df = pd.DataFrame({\n", + " \"start_pos\": start_idxes,\n", + " \"stop_pos\": stop_idxes,\n", + "})\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos sequence\n", + "0 1 8 WULGNKV\n", + "1 1 9 WULGNKVI\n", + "2 1 10 WULGNKVIM\n", + "3 1 11 WULGNKVIMP\n", + "4 1 12 WULGNKVIMPY\n", + "... ... ... ...\n", + "54935 9987 9995 CESHBWDD\n", + "54936 9987 9996 CESHBWDDX\n", + "54937 9988 9995 ESHBWDD\n", + "54938 9988 9996 ESHBWDDX\n", + "54939 9989 9996 SHBWDDX\n", + "\n", + "[54940 rows x 3 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digest_pos_df[\"sequence\"] = digest_pos_df[\n", + " [\"start_pos\",\"stop_pos\"]\n", + "].apply(lambda x: cat_seq[slice(*x)], axis=1)\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'idxes RAM = 3.25833 Mb, seq RAM = 0.43968, ratio = 7.41063'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"idxes RAM = {RAM_use_seqs:.5f} Mb, seq RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 8 to 14). For a very large protein sequence database, there will be millions of unspecific peptides, so working with strings sometimes is not feasible due to the requirements of extremely large RAM." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transfer learning for HLA class I prediction with `peptideep.hla`" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", + "\n", + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/hla/hla_utils.py b/peptdeep/hla/hla_utils.py index d74d36b0..ae29f31f 100644 --- a/peptdeep/hla/hla_utils.py +++ b/peptdeep/hla/hla_utils.py @@ -95,12 +95,12 @@ def nonspecific_digest_cat_proteins( pd.DataFrame A dataframe sorted by `nAA` with three columns: `start_pos`: the start index of the peptide in cat_protein - `end_pos`: the stop/end index of the peptide in cat_protein + `stop_pos`: the stop/end index of the peptide in cat_protein `nAA`: the number of amino acids (peptide length). 
""" pos_starts, pos_ends = get_substring_indices(cat_sequence, min_len, max_len) - digest_df = pd.DataFrame(dict(start_pos=pos_starts, end_pos=pos_ends)) - digest_df["nAA"] = digest_df.end_pos - digest_df.start_pos + digest_df = pd.DataFrame(dict(start_pos=pos_starts, stop_pos=pos_ends)) + digest_df["nAA"] = digest_df.stop_pos - digest_df.start_pos digest_df.sort_values("nAA", inplace=True) digest_df.reset_index(inplace=True, drop=True) return digest_df @@ -170,7 +170,7 @@ def get_seq_series(idxes_df: pd.DataFrame, cat_prot: str) -> pd.Series: pd.Series pd.Series with sub-sequences (peptide sequences). """ - return idxes_df[["start_pos", "end_pos"]].apply( + return idxes_df[["start_pos", "stop_pos"]].apply( lambda x: cat_prot[slice(*x)], axis=1 ) From ed5cd2a89961d790bc388458a3069616cd122140 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 16 Jul 2024 16:16:18 +0200 Subject: [PATCH 02/10] #183 add immunopeptidomics tutorial in sphix rst --- docs/notebooks.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/notebooks.rst b/docs/notebooks.rst index fa607aab..03646eb5 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -12,3 +12,4 @@ Tutorials and notebooks about how to use AlphaPeptDeep nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model + nbs/tutorials/tutorial_immunopeptidomics From 3c308579b35719cfb05cd87e1e42aff4b3bdde5c Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Thu, 18 Jul 2024 09:09:25 +0200 Subject: [PATCH 03/10] add test --- peptdeep/hla/hla_class1.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py index 3a9f5789..67f5e575 100644 --- a/peptdeep/hla/hla_class1.py +++ b/peptdeep/hla/hla_class1.py @@ -6,7 +6,7 @@ from typing import Union import peptdeep.model.building_block as building_block -from peptdeep.model.model_interface import ModelInterface +from peptdeep.model.model_interface import ModelInterface, append_nAA_column_if_missing from peptdeep.model.featurize import get_ascii_indices from peptdeep.pretrained_models import pretrain_dir, download_models, global_settings @@ -380,6 +380,40 @@ def predict_from_proteins( peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) return peptide_df + def _concat_neg_df(self, precursor_df, column_to_train='HLA'): + precursor_df = append_nAA_column_if_missing(precursor_df) + precursor_df[column_to_train] = 1 + df_list = [precursor_df] + for nAA, group_df in precursor_df.groupby('nAA'): + rnd_seqs = get_random_sequences( + self.protein_df, + n=len(group_df), + pep_len = nAA + ) + df_list.append(pd.DataFrame( + {'sequence':rnd_seqs,'nAA':nAA,column_to_train:0} + )) + return pd.concat(df_list).reset_index(drop=True) + + def test(self, precursor_df): + df = self._concat_neg_df(precursor_df) + self.predict(df) + prob_list = [] + precision_list = [] + recall_list = [] + fp_list = [] + for prob in [0.5,0.6,0.7,0.8, 0.9]: + prob_list.append(prob) + precision_list.append(df[df.HLA_prob_pred>prob].HLA.mean()) + recall_list.append(df[df.HLA_prob_pred>prob].HLA.sum()/len(df)*2) + fp_list.append(1-(1-df[df.HLA_prob_pred Date: Thu, 18 Jul 2024 13:49:52 +0200 Subject: [PATCH 04/10] tutorial --- .../tutorial_immunopeptidomics.ipynb | 1329 +++++++++++++++-- nbs_tests/hla/hla_class1.ipynb | 9 +- 2 files changed, 1171 insertions(+), 167 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 
d71290eb..1ee12f5e 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -11,6 +11,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "This notebook introduces how to generate spectral libraries for immunopeptidomics analysis from a list of protein sequences. This entails several steps:\n", + "\n", + "1. unspecific digestion of protein sequences\n", + "2. selection of peptide sequences used for library prediction by peptdeep-hla predicition\n", + " 2.1 using the pretrained model\n", + " 2.2 using an improved model by including a transfer learning step\n", + "3. spectral library prediction\n", + "4. matching the peptides back to the proteins (this can be done before or after library prediction or seach) \n", + "\n", + "\n", + "\n", "Note that pydivsufsort package is not installed by peptdeep by default. Install by:\n", "```\n", "pip install \"peptdeep[development,hla]\"\n", @@ -21,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -40,28 +51,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Unspecific digestion in alphabase" + "## 1. Unspecific digestion in alphabase" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", + "The unspecific digestion workflow uses the longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", "\n", - "Unspecific digestion in alphabase involves two steps:" + "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indeces of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", + "\n", + "1. concatenation of protein sequences into a single sequence\n", + "2. unspecific digestion\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "1. Concatenate protein sequences into a single sequence, separated by a sentinel character, e.g., '$'. For instance:" + "#### 1.1 Concatenate protein sequences into a single sequence\n", + "\n", + "The protein sequences are concatenated into a single sequence. The sequences are seperated by a sentinel character, in this case '$', so that no peptides across proteins are formed. 
Note that the first and last sentinel characters are crucial as well.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -71,7 +88,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
@@ -80,7 +97,7 @@
      "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'"
     ]
    },
-   "execution_count": 3,
+   "execution_count": 39,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -95,19 +112,136 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Note that the first and last sentinel characters are crutial as well."
+    "The same can be done directly from a FASTA file: \n",
+    "@ Feng do you have an example fasta somewhere? "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " protein_id full_name \\\n", + "tr|A0A024R161|A0A024R161_HUMAN A0A024R161 tr|A0A024R161|A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN A0A024RAP8 tr|A0A024RAP8|A0A024RAP8_HUMAN \n", + "\n", + " gene_name \\\n", + "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 \n", + "\n", + " description \\\n", + "tr|A0A024R161|A0A024R161_HUMAN tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso... \n", + "\n", + " sequence \\\n", + "tr|A0A024R161|A0A024R161_HUMAN MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC... \n", + "\n", + " nAA \n", + "tr|A0A024R161|A0A024R161_HUMAN 153 \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import load_prot_df\n", + "fasta = load_prot_df(r\"D:\\Software\\FASTA\\Human\\example.fasta\")\n", + "fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import cat_proteins\n", + "cat_fasta = cat_proteins(fasta['sequence'])\n", + "cat_fasta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "2. Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific sequences from the concatenated sequence." + "#### 1.2 Unspecific digestion\n", + "\n", + "Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific peptide sequences from the concatenated protein sequence. The digested peptide sequences are stored in a dataframe based on their start and stop indices in the concantenated protein sequence string. To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 7 to 14, deomnstrated below). For a large protein sequence database, there will be millions of unspecific peptides, so working with strings is not feasible for a complete human fasta due to the requirements of extremely large RAM. (~ 70M unspecific sequences from the reviewed swissprot fasta require ~ 4-5 GB RAM already).\n", + "\n", + "Using the get_substring_indices function we extract the start and stop indices of all peptide sequences between 7 and 14 aa (min_len, max_len) from the concatenated protein sequences. All peptides sequences are unique, guranteed by the LCP algorithm." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -167,53 +301,53 @@ " ...\n", " \n", " \n", - " 79\n", - " 13\n", - " 22\n", + " 2438\n", + " 361\n", + " 370\n", " \n", " \n", - " 80\n", - " 13\n", - " 23\n", + " 2439\n", + " 361\n", + " 371\n", " \n", " \n", - " 81\n", - " 14\n", - " 22\n", + " 2440\n", + " 362\n", + " 370\n", " \n", " \n", - " 82\n", - " 14\n", - " 23\n", + " 2441\n", + " 362\n", + " 371\n", " \n", " \n", - " 83\n", - " 15\n", - " 23\n", + " 2442\n", + " 363\n", + " 371\n", " \n", " \n", "\n", - "

84 rows × 2 columns

\n", + "

2443 rows × 2 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos\n", - "0 1 9\n", - "1 1 10\n", - "2 1 11\n", - "3 1 12\n", - "4 1 13\n", - ".. ... ...\n", - "79 13 22\n", - "80 13 23\n", - "81 14 22\n", - "82 14 23\n", - "83 15 23\n", + " start_pos stop_pos\n", + "0 1 9\n", + "1 1 10\n", + "2 1 11\n", + "3 1 12\n", + "4 1 13\n", + "... ... ...\n", + "2438 361 370\n", + "2439 361 371\n", + "2440 362 370\n", + "2441 362 371\n", + "2442 363 371\n", "\n", - "[84 rows x 2 columns]" + "[2443 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -221,9 +355,10 @@ "source": [ "from alphabase.protein.lcp_digest import get_substring_indices\n", "import pandas as pd\n", + "import sys\n", "\n", "start_idxes, stop_idxes = get_substring_indices(\n", - " cat_prot, min_len=8, max_len=14, stop_char=\"$\"\n", + " cat_fasta, min_len=8, max_len=14, stop_char=\"$\"\n", ")\n", "digest_pos_df = pd.DataFrame({\n", " \"start_pos\": start_idxes,\n", @@ -232,16 +367,25 @@ "digest_pos_df" ] }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "All unspecific peptides can be localted by the `start_pos` and `stop_pos` in `digest_pos_df`, and all peptides are non-redundant guaranteed by the LCP algorithm." + "The unspecific peptide sequences can be localted by the `start_pos` and `stop_pos`." ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -267,117 +411,161 @@ " \n", " start_pos\n", " stop_pos\n", + " sequence\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", - " 8\n", + " 9\n", + " MGAPLLSP\n", " \n", " \n", " 1\n", " 1\n", - " 9\n", + " 10\n", + " MGAPLLSPG\n", " \n", " \n", " 2\n", " 1\n", - " 10\n", + " 11\n", + " MGAPLLSPGW\n", " \n", " \n", " 3\n", " 1\n", - " 11\n", + " 12\n", + " MGAPLLSPGWG\n", " \n", " \n", " 4\n", " 1\n", - " 12\n", + " 13\n", + " MGAPLLSPGWGA\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 54935\n", - " 9987\n", - " 9995\n", + " 2438\n", + " 361\n", + " 370\n", + " NTYICMQRT\n", " \n", " \n", - " 54936\n", - " 9987\n", - " 9996\n", + " 2439\n", + " 361\n", + " 371\n", + " NTYICMQRTV\n", " \n", " \n", - " 54937\n", - " 9988\n", - " 9995\n", + " 2440\n", + " 362\n", + " 370\n", + " TYICMQRT\n", " \n", " \n", - " 54938\n", - " 9988\n", - " 9996\n", + " 2441\n", + " 362\n", + " 371\n", + " TYICMQRTV\n", " \n", " \n", - " 54939\n", - " 9989\n", - " 9996\n", + " 2442\n", + " 363\n", + " 371\n", + " YICMQRTV\n", " \n", " \n", "\n", - "

54940 rows × 2 columns

\n", + "

2443 rows × 3 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos\n", - "0 1 8\n", - "1 1 9\n", - "2 1 10\n", - "3 1 11\n", - "4 1 12\n", - "... ... ...\n", - "54935 9987 9995\n", - "54936 9987 9996\n", - "54937 9988 9995\n", - "54938 9988 9996\n", - "54939 9989 9996\n", + " start_pos stop_pos sequence\n", + "0 1 9 MGAPLLSP\n", + "1 1 10 MGAPLLSPG\n", + "2 1 11 MGAPLLSPGW\n", + "3 1 12 MGAPLLSPGWG\n", + "4 1 13 MGAPLLSPGWGA\n", + "... ... ... ...\n", + "2438 361 370 NTYICMQRT\n", + "2439 361 371 NTYICMQRTV\n", + "2440 362 370 TYICMQRT\n", + "2441 362 371 TYICMQRTV\n", + "2442 363 371 YICMQRTV\n", "\n", - "[54940 rows x 2 columns]" + "[2443 rows x 3 columns]" ] }, - "execution_count": 39, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import random\n", - "import string\n", - "random.seed(0)\n", - "cat_seq = '$'+''.join(random.choices(string.ascii_uppercase+'$', k=10000))+'$'\n", - "start_idxes, stop_idxes = get_substring_indices(cat_seq, min_len=7, max_len=14)\n", - "digest_pos_df = pd.DataFrame({\n", - " \"start_pos\": start_idxes,\n", - " \"stop_pos\": stop_idxes,\n", - "})\n", + "digest_pos_df[\"sequence\"] = digest_pos_df[\n", + " [\"start_pos\",\"stop_pos\"]\n", + "].apply(lambda x: cat_fasta[slice(*x)], axis=1)\n", "digest_pos_df" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"seq RAM = {RAM_use_seqs:.5f} Mb, idxes RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selection of peptide sequences used for library prediction\n", + "The digest_prot_df contains all unspecifically digested peptide sequences between 7 and 14 aa generatable from the concatenated protein sequences. This list is reduced using a HLA1_Binding_Classifier from peptdeep.hla.hla_class1. Two different model architectures are available, an LSTM model (HLA_Class_I_LSTM) and a BERT model (HLA_Class_I_BERT). A pretrained model is only available for the LSTM model architecture.\n", + "The HLA1_Binding_Classifer can be used with a pretrained model, tuned with existing peptide data or trained from scratch. Training of a new model should be considered carefully and will not be covered in this tutorial.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selection of peptide seqeuence candidates without transferlearning\n", + "\n", + "Selection of peptide sequences for library predicition using the pretrained model can be done in a few steps. First, the Classifier model needs to be initialized and the pretrained model is loaded. 
Next, we can use any kind of dataframe containing peptide sequences to predict how likely there are HLA peptides, the only requirement beeing that the column containing the peptides is called 'sequence'.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -404,163 +592,978 @@ " start_pos\n", " stop_pos\n", " sequence\n", + " nAA\n", + " HLA_prob_pred\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", + " 9\n", + " MGAPLLSP\n", " 8\n", - " WULGNKV\n", + " 0.239477\n", " \n", " \n", " 1\n", - " 1\n", - " 9\n", - " WULGNKVI\n", + " 145\n", + " 153\n", + " REPRSCAL\n", + " 8\n", + " 0.061692\n", " \n", " \n", " 2\n", - " 1\n", - " 10\n", - " WULGNKVIM\n", + " 146\n", + " 154\n", + " EPRSCALL\n", + " 8\n", + " 0.137313\n", " \n", " \n", " 3\n", - " 1\n", - " 11\n", - " WULGNKVIMP\n", + " 155\n", + " 163\n", + " MGWIRGRR\n", + " 8\n", + " 0.056462\n", " \n", " \n", " 4\n", - " 1\n", - " 12\n", - " WULGNKVIMPY\n", + " 156\n", + " 164\n", + " GWIRGRRS\n", + " 8\n", + " 0.001298\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 54935\n", - " 9987\n", - " 9995\n", - " CESHBWDD\n", + " 2438\n", + " 112\n", + " 126\n", + " KVSQAAAELQQYCM\n", + " 14\n", + " 0.243115\n", " \n", " \n", - " 54936\n", - " 9987\n", - " 9996\n", - " CESHBWDDX\n", + " 2439\n", + " 317\n", + " 331\n", + " NGSWQWEDGSILSP\n", + " 14\n", + " 0.021114\n", " \n", " \n", - " 54937\n", - " 9988\n", - " 9995\n", - " ESHBWDD\n", + " 2440\n", + " 79\n", + " 93\n", + " DRYRPQPGDEGPGR\n", + " 14\n", + " 0.060635\n", " \n", " \n", - " 54938\n", - " 9988\n", - " 9996\n", - " ESHBWDDX\n", + " 2441\n", + " 113\n", + " 127\n", + " VSQAAAELQQYCMQ\n", + " 14\n", + " 0.355900\n", " \n", " \n", - " 54939\n", - " 9989\n", - " 9996\n", - " SHBWDDX\n", + " 2442\n", + " 190\n", + " 204\n", + " KQRCPVVKSKCREN\n", + " 14\n", + " 0.000362\n", " \n", " \n", "\n", - "

54940 rows × 3 columns

\n", + "

2443 rows × 5 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos sequence\n", - "0 1 8 WULGNKV\n", - "1 1 9 WULGNKVI\n", - "2 1 10 WULGNKVIM\n", - "3 1 11 WULGNKVIMP\n", - "4 1 12 WULGNKVIMPY\n", - "... ... ... ...\n", - "54935 9987 9995 CESHBWDD\n", - "54936 9987 9996 CESHBWDDX\n", - "54937 9988 9995 ESHBWDD\n", - "54938 9988 9996 ESHBWDDX\n", - "54939 9989 9996 SHBWDDX\n", + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "0 1 9 MGAPLLSP 8 0.239477\n", + "1 145 153 REPRSCAL 8 0.061692\n", + "2 146 154 EPRSCALL 8 0.137313\n", + "3 155 163 MGWIRGRR 8 0.056462\n", + "4 156 164 GWIRGRRS 8 0.001298\n", + "... ... ... ... ... ...\n", + "2438 112 126 KVSQAAAELQQYCM 14 0.243115\n", + "2439 317 331 NGSWQWEDGSILSP 14 0.021114\n", + "2440 79 93 DRYRPQPGDEGPGR 14 0.060635\n", + "2441 113 127 VSQAAAELQQYCMQ 14 0.355900\n", + "2442 190 204 KQRCPVVKSKCREN 14 0.000362\n", "\n", - "[54940 rows x 3 columns]" + "[2443 rows x 5 columns]" ] }, - "execution_count": 41, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "digest_pos_df[\"sequence\"] = digest_pos_df[\n", - " [\"start_pos\",\"stop_pos\"]\n", - "].apply(lambda x: cat_seq[slice(*x)], axis=1)\n", - "digest_pos_df" + "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", + "\n", + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()\n", + "manual_prediction = model.predict(digest_pos_df)\n", + "manual_prediction\n" ] }, { - "cell_type": "code", - "execution_count": 42, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" + "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library massively. It is rather recommended to use more conservative cut-offs. " ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_possequencenAAHLA_prob_pred
17168176EMSEFHNY80.793702
24130138KDALLVGV80.817415
31137145VPAGSNPF80.751329
37170178SEFHNYNL80.940019
67181189KSDFSTRW80.895964
..................
231895109QSAEEAFLLVATAY140.969541
2378329343SPNLLTIIEMQKGD140.756001
2382519LLSPGWGAGAAGRR140.733784
2408110124TLKVSQAAAELQQY140.891976
2419620LSPGWGAGAAGRRW140.842583
\n", + "

148 rows × 5 columns

\n", + "
" + ], "text/plain": [ - "'idxes RAM = 3.25833 Mb, seq RAM = 0.43968, ratio = 7.41063'" + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "17 168 176 EMSEFHNY 8 0.793702\n", + "24 130 138 KDALLVGV 8 0.817415\n", + "31 137 145 VPAGSNPF 8 0.751329\n", + "37 170 178 SEFHNYNL 8 0.940019\n", + "67 181 189 KSDFSTRW 8 0.895964\n", + "... ... ... ... ... ...\n", + "2318 95 109 QSAEEAFLLVATAY 14 0.969541\n", + "2378 329 343 SPNLLTIIEMQKGD 14 0.756001\n", + "2382 5 19 LLSPGWGAGAAGRR 14 0.733784\n", + "2408 110 124 TLKVSQAAAELQQY 14 0.891976\n", + "2419 6 20 LSPGWGAGAAGRRW 14 0.842583\n", + "\n", + "[148 rows x 5 columns]" ] }, - "execution_count": 43, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "f\"idxes RAM = {RAM_use_seqs:.5f} Mb, seq RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + "manual_prediction[manual_prediction['HLA_prob_pred'] > 0.7]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 8 to 14). For a very large protein sequence database, there will be millions of unspecific peptides, so working with strings sometimes is not feasible due to the requirements of extremely large RAM." + "As described above, directly using the sequences for classification can be memory intense for large lists of sequences. Thereby, the manual concatenation, unspecific digestion, predicition and filtering is only suggested for small sets of proteins or integration of selected sequences (e.g mutations, nuORFs etc.). This can be circumvented by directly predicting and filtering from a fasta using model.predict_from_proteins(). This executes the concatenation, unspecific digestion, predicition and filtering automatically in batches. Thereby the whole process can be done more efficient and be performed without a specialized computation infrastructure." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 49, "metadata": {}, - "source": [ - "## Transfer learning for HLA class I prediction with `peptideep.hla`" - ] - }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
016817680.793702EMSEFHNY
113013880.817415KDALLVGV
213714580.751329VPAGSNPF
317017880.940019SEFHNYNL
418118980.895964KSDFSTRW
..................
14395109140.969541QSAEEAFLLVATAY
144329343140.756001SPNLLTIIEMQKGD
145519140.733784LLSPGWGAGAAGRR
146110124140.891976TLKVSQAAAELQQY
147620140.842583LSPGWGAGAAGRRW
\n", + "

148 rows × 5 columns

\n", + "" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 168 176 8 0.793702 EMSEFHNY\n", + "1 130 138 8 0.817415 KDALLVGV\n", + "2 137 145 8 0.751329 VPAGSNPF\n", + "3 170 178 8 0.940019 SEFHNYNL\n", + "4 181 189 8 0.895964 KSDFSTRW\n", + ".. ... ... ... ... ...\n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY\n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD\n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR\n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY\n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW\n", + "\n", + "[148 rows x 5 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(fasta, prob_threshold=0.7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selection of peptide seqeuence candidates with transferlearning\n", + "\n", + "To perform transferlearning we need a list of peptide sequences we expect to be present in our sample. These peptides can be retrived from several different sources like DDA or directDIA search results. It is recommended to use at the very least 1000 sequences for transferlearning. The more sequences available the better the transferlearning step works. The model performance can be assessed after transferlearning and should be assessed before predicition. \n", + "\n", + "First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, a protein dataframe is added, in this example the previousely loaded fasta file. The protein dataframe is used by the Classifier internaly to draw negative training data during model training and testing." + ] + }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", - "\n", "model = HLA1_Binding_Classifier()\n", - "model.load_pretrained_hla_model()" + "model.load_pretrained_hla_model()\n", + "model.load_proteins(fasta)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we load the peptide sequences wee use for transferlearning and split it into a training and testing dataset. This step is very important to assess the model performance after transferlearning. Here, we use the digest_pos_df generated above. As these are no immunopeptides, but a list of unspecifically digested proteins, the model performance will not improve, but the pronciples remain the same. \n", + "@ Feng should we include a example file so that the model is actually improved or just use this? " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1954, 489)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seq_df = digest_pos_df.sample(frac=0.2)\n", + "train_seq_df = digest_pos_df.drop(index=test_seq_df.index)\n", + "len(train_seq_df), len(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we train the model using the training sequence dataframe. In this example we use 10 training epochs, in a real experiment more should be used. Good starting points are 40 epochs for a training dataset of around 10000 sequences or 100 epochs for a training dataset of around 1000 sequences. For a real experiment the warmup_epochs can be increased to 10. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-18 10:24:25> Training with fixed sequence length: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=1, lr=2e-05, loss=1.4192258289882116\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=2, lr=4e-05, loss=1.0882413131850106\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=3, lr=6e-05, loss=0.8716121912002563\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=4, lr=8e-05, loss=0.7767811502729144\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=5, lr=0.0001, loss=0.7206867933273315\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=6, lr=0.0001, loss=0.7072907941681998\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013800655092511\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6962822931153434\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6965692894799369\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6948717491967338\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + } + ], + "source": [ + "\n", + "model.train(train_seq_df,\n", + " epoch=10, warmup_epoch=5, \n", + " verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can assess the model performance after transferlearning using the model.test() function on the training and testing data. This can also be done before transferlearning to assess how well the model fits the available data already. The test assesses the precision, recall and fals positive rate of the model at different probability cut offs. As a rule of thumb a false postitve rate above 7% (@FENG adjust in case lower/higher) is not recomendable because the peptide list gets disproportionally larger, leading to lower IDs during the search. In case of a high false postitive rate, the probability cut off at which the peptides are predicted should be increased. " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.5074420.5583420.541965
10.60.5161290.0163770.015353
20.7NaN0.0000000.000000
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.507442 0.558342 0.541965\n", + "1 0.6 0.516129 0.016377 0.015353\n", + "2 0.7 NaN 0.000000 0.000000\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(train_seq_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.478070.4458080.486708
10.60.625000.0204500.012270
20.71.000000.0020450.000000
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.47807 0.445808 0.486708\n", + "1 0.6 0.62500 0.020450 0.012270\n", + "2 0.7 1.00000 0.002045 0.000000\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After transferlearning and testing the new model, peptides can be predicted as with the pretrained model. " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 1.24it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
0268012680980.715877SEFHNYNL
\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 26801 26809 8 0.715877 SEFHNYNL" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(digest_pos_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spectral library prediciton" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -579,7 +1582,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/nbs_tests/hla/hla_class1.ipynb b/nbs_tests/hla/hla_class1.ipynb index d0fa0eb3..f4bcd7ae 100644 --- a/nbs_tests/hla/hla_class1.ipynb +++ b/nbs_tests/hla/hla_class1.ipynb @@ -33,10 +33,11 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + "2024-07-02 17:16:12> Downloading https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/hla_model.zip ...\n", + "2024-07-02 17:16:14> The pretrained models had been downloaded in C:\\Users\\wahle/peptdeep\\pretrained_models\\hla_model.zip\n" ] } ], @@ -78,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 14.32it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 7.46it/s]\n" ] }, { @@ -321,7 +322,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.8" } }, "nbformat": 4, From 3286f7bf411544173a0a1956d9cb97d4c8dcae43 Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Fri, 19 Jul 2024 14:46:22 +0200 Subject: [PATCH 05/10] Finished library prediction --- .../tutorial_immunopeptidomics.ipynb | 2081 ++++++++++++++++- 1 file changed, 1981 insertions(+), 100 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 1ee12f5e..f8f27618 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -41,6 +41,14 @@ "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n", + "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n" + ] } ], "source": [ @@ -60,7 +68,7 @@ "source": [ "The unspecific digestion workflow uses the longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. 
Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", "\n", - "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indeces of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", + "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indices of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", "\n", "1. concatenation of protein sequences into a single sequence\n", "2. unspecific digestion\n", @@ -78,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -97,7 +105,7 @@ "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" ] }, - "execution_count": 39, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -145,6 +153,7 @@ " protein_id\n", " full_name\n", " gene_name\n", + " gene_org\n", " description\n", " sequence\n", " nAA\n", @@ -156,6 +165,7 @@ " A0A024R161\n", " tr|A0A024R161|A0A024R161_HUMAN\n", " DNAJC25-GNG10\n", + " A0A024R161_HUMAN\n", " tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot...\n", " MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...\n", " 153\n", @@ -165,6 +175,7 @@ " A0A024RAP8\n", " tr|A0A024RAP8|A0A024RAP8_HUMAN\n", " KLRC4-KLRK1\n", + " A0A024RAP8_HUMAN\n", " tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso...\n", " MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC...\n", " 216\n", @@ -178,9 +189,9 @@ "tr|A0A024R161|A0A024R161_HUMAN A0A024R161 tr|A0A024R161|A0A024R161_HUMAN \n", "tr|A0A024RAP8|A0A024RAP8_HUMAN A0A024RAP8 tr|A0A024RAP8|A0A024RAP8_HUMAN \n", "\n", - " gene_name \\\n", - "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 \n", - "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 \n", + " gene_name gene_org \\\n", + "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 A0A024RAP8_HUMAN \n", "\n", " description \\\n", "tr|A0A024R161|A0A024R161_HUMAN tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot... 
\n", @@ -202,13 +213,14 @@ ], "source": [ "from peptdeep.hla.hla_utils import load_prot_df\n", - "fasta = load_prot_df(r\"D:\\Software\\FASTA\\Human\\example.fasta\")\n", + "fasta_path = \"D:/Software/FASTA/Human/example.fasta\"\n", + "fasta = load_prot_df(fasta_path)\n", "fasta" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -217,7 +229,7 @@ "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" ] }, - "execution_count": 41, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -241,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -347,7 +359,7 @@ "[2443 rows x 2 columns]" ] }, - "execution_count": 42, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -385,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -503,7 +515,7 @@ "[2443 rows x 3 columns]" ] }, - "execution_count": 44, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -526,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -535,7 +547,7 @@ "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" ] }, - "execution_count": 46, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -565,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -707,7 +719,7 @@ "[2443 rows x 5 columns]" ] }, - "execution_count": 47, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -725,12 +737,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library massively. It is rather recommended to use more conservative cut-offs. " + "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library. It is rather recommended to use more conservative cut-offs. 
" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -872,7 +884,7 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 48, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -890,21 +902,14 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00 Training with fixed sequence length: 0\n" + "2024-07-19 14:16:34> Training with fixed sequence length: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1138,14 +1144,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.4192258289882116\n" + "[Training] Epoch=1, lr=2e-05, loss=1.403803927557809\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1153,14 +1159,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=2, lr=4e-05, loss=1.0882413131850106\n" + "[Training] Epoch=2, lr=4e-05, loss=1.0939611451966422\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. 
Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1168,14 +1174,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8716121912002563\n" + "[Training] Epoch=3, lr=6e-05, loss=0.8742348296301705\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1183,14 +1189,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7767811502729144\n" + "[Training] Epoch=4, lr=8e-05, loss=0.7860026274408612\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1198,14 +1204,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7206867933273315\n" + "[Training] Epoch=5, lr=0.0001, loss=0.7296201757022313\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1213,14 +1219,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.7072907941681998\n" + "[Training] Epoch=6, lr=0.0001, loss=0.7098635860851833\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1228,14 +1234,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013800655092511\n" + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7049905742917743\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. 
During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1243,14 +1249,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6962822931153434\n" + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6990227273532322\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1258,15 +1264,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6965692894799369\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6948717491967338\n" + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6956126008714948\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6955537881170001\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. 
During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] } @@ -1287,7 +1293,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1321,16 +1327,16 @@ " \n", " 0\n", " 0.5\n", - " 0.507442\n", - " 0.558342\n", - " 0.541965\n", + " 0.504579\n", + " 0.563971\n", + " 0.553736\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.516129\n", - " 0.016377\n", - " 0.015353\n", + " 0.488889\n", + " 0.011259\n", + " 0.011771\n", " \n", " \n", " 2\n", @@ -1359,14 +1365,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.507442 0.558342 0.541965\n", - "1 0.6 0.516129 0.016377 0.015353\n", + "0 0.5 0.504579 0.563971 0.553736\n", + "1 0.6 0.488889 0.011259 0.011771\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 53, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1377,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1411,22 +1417,22 @@ " \n", " 0\n", " 0.5\n", - " 0.47807\n", - " 0.445808\n", - " 0.486708\n", + " 0.484288\n", + " 0.535787\n", + " 0.570552\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.62500\n", - " 0.020450\n", - " 0.012270\n", + " 0.285714\n", + " 0.004090\n", + " 0.010225\n", " \n", " \n", " 2\n", " 0.7\n", - " 1.00000\n", - " 0.002045\n", + " NaN\n", + " 0.000000\n", " 0.000000\n", " \n", " \n", @@ -1449,14 +1455,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.47807 0.445808 0.486708\n", - "1 0.6 0.62500 0.020450 0.012270\n", - "2 0.7 1.00000 0.002045 0.000000\n", + "0 0.5 0.484288 0.535787 0.570552\n", + "1 0.6 0.285714 0.004090 0.010225\n", + "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 54, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1474,14 +1480,14 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.24it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n" ] }, { @@ -1515,55 +1521,1930 @@ " \n", " \n", " 0\n", - " 26801\n", - " 26809\n", + " 170\n", + " 178\n", " 8\n", - " 0.715877\n", + " 0.673504\n", " SEFHNYNL\n", " \n", + " \n", + " 1\n", + " 181\n", + " 189\n", + " 8\n", + " 0.617312\n", + " KSDFSTRW\n", + " \n", + " \n", + " 2\n", + " 309\n", + " 317\n", + " 8\n", + " 0.615951\n", + " MGLVHIPT\n", + " \n", + " \n", + " 3\n", + " 299\n", + " 307\n", + " 8\n", + " 0.601598\n", + " LLKLVKSY\n", + " \n", + " \n", + " 4\n", + " 346\n", + " 354\n", + " 8\n", + " 0.609713\n", + " YASSFKGY\n", + " \n", + " \n", + " 5\n", + " 344\n", + " 352\n", + " 8\n", + " 0.635873\n", + " ALYASSFK\n", + " \n", + " \n", + " 6\n", + " 294\n", + " 303\n", + " 9\n", + " 0.600454\n", + " KEDQDLLKL\n", + " \n", + " \n", + " 7\n", + " 298\n", + " 307\n", + " 9\n", + " 0.628539\n", + " DLLKLVKSY\n", + " \n", + " \n", + " 8\n", + " 74\n", + " 83\n", + " 9\n", + " 0.602105\n", + " RRYHPDRYR\n", + " \n", + " \n", + " 9\n", + " 344\n", + " 354\n", + " 10\n", + " 0.625569\n", + " 
ALYASSFKGY\n", + " \n", + " \n", + " 10\n", + " 232\n", + " 242\n", + " 10\n", + " 0.607737\n", + " FLNSLFNQEV\n", + " \n", + " \n", + " 11\n", + " 353\n", + " 363\n", + " 10\n", + " 0.610844\n", + " YIENCSTPNT\n", + " \n", + " \n", + " 12\n", + " 53\n", + " 63\n", + " 10\n", + " 0.608182\n", + " VLGVSRSAGK\n", + " \n", + " \n", + " 13\n", + " 298\n", + " 309\n", + " 11\n", + " 0.608567\n", + " DLLKLVKSYHW\n", + " \n", + " \n", + " 14\n", + " 351\n", + " 362\n", + " 11\n", + " 0.607036\n", + " KGYIENCSTPN\n", + " \n", + " \n", + " 15\n", + " 52\n", + " 63\n", + " 11\n", + " 0.635592\n", + " EVLGVSRSAGK\n", + " \n", + " \n", + " 16\n", + " 130\n", + " 142\n", + " 12\n", + " 0.601588\n", + " KDALLVGVPAGS\n", + " \n", + " \n", + " 17\n", + " 351\n", + " 363\n", + " 12\n", + " 0.632752\n", + " KGYIENCSTPNT\n", + " \n", + " \n", + " 18\n", + " 86\n", + " 99\n", + " 13\n", + " 0.608231\n", + " GDEGPGRTPQSAE\n", + " \n", + " \n", + " 19\n", + " 141\n", + " 154\n", + " 13\n", + " 0.603257\n", + " SNPFREPRSCALL\n", + " \n", + " \n", + " 20\n", + " 32\n", + " 45\n", + " 13\n", + " 0.608740\n", + " LVRPAGALVEGLY\n", + " \n", + " \n", + " 21\n", + " 130\n", + " 143\n", + " 13\n", + " 0.620658\n", + " KDALLVGVPAGSN\n", + " \n", + " \n", + " 22\n", + " 185\n", + " 199\n", + " 14\n", + " 0.625906\n", + " STRWQKQRCPVVKS\n", + " \n", + " \n", + " 23\n", + " 60\n", + " 74\n", + " 14\n", + " 0.684522\n", + " AGKAEIARAYRQLA\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 26801 26809 8 0.715877 SEFHNYNL" + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 170 178 8 0.673504 SEFHNYNL\n", + "1 181 189 8 0.617312 KSDFSTRW\n", + "2 309 317 8 0.615951 MGLVHIPT\n", + "3 299 307 8 0.601598 LLKLVKSY\n", + "4 346 354 8 0.609713 YASSFKGY\n", + "5 344 352 8 0.635873 ALYASSFK\n", + "6 294 303 9 0.600454 KEDQDLLKL\n", + "7 298 307 9 0.628539 DLLKLVKSY\n", + "8 74 83 9 0.602105 RRYHPDRYR\n", + "9 344 354 10 0.625569 ALYASSFKGY\n", + "10 232 242 10 0.607737 FLNSLFNQEV\n", + "11 353 363 10 0.610844 YIENCSTPNT\n", + "12 53 63 10 0.608182 VLGVSRSAGK\n", + "13 298 309 11 0.608567 DLLKLVKSYHW\n", + "14 351 362 11 0.607036 KGYIENCSTPN\n", + "15 52 63 11 0.635592 EVLGVSRSAGK\n", + "16 130 142 12 0.601588 KDALLVGVPAGS\n", + "17 351 363 12 0.632752 KGYIENCSTPNT\n", + "18 86 99 13 0.608231 GDEGPGRTPQSAE\n", + "19 141 154 13 0.603257 SNPFREPRSCALL\n", + "20 32 45 13 0.608740 LVRPAGALVEGLY\n", + "21 130 143 13 0.620658 KDALLVGVPAGSN\n", + "22 185 199 14 0.625906 STRWQKQRCPVVKS\n", + "23 60 74 14 0.684522 AGKAEIARAYRQLA" ] }, - "execution_count": 55, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.predict_from_proteins(digest_pos_df)" + "model.predict_from_proteins(fasta, prob_threshold=0.6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spectral library prediciton" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the spectral library for the filtered peptide list can be predicted using PredictSpecLibFasta. First, one needs to select the models for rt/ccs/ms2 prediction using the ModelManager. One can select from a set of pretrained models or load externally trained models. Here we load the 'HLA' model (at the moment this still loads the generic model, but in the futer this is supposed to be replaced by an HLA specfic internal model). 
" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from peptdeep.spec_lib.predict_lib import ModelManager\n", + "from peptdeep.protein.fasta import PredictSpecLibFasta\n", + "\n", + "model_mgr = ModelManager()\n", + "model_mgr.load_installed_models(model_type='HLA')" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectral library prediciton" + "In the next step, the PredictSpecLibFasta is initialized using the preloaded model. The presettings here are selected for the prediction of tryptic libraries so some parameters need to be adjusted, in particular precursor_charge_min, precursor_charge_max. By default Carbamidomethylation is set as a fixed modification (fix_mod) and Acetylation and Oxidation are set as variable modifications (var_mod). Those can be removed by adding an empty list as shown for the variable modifications. \n", + "\n", + "Of note, PredictSpecLibFasta can also be used to predict a library from a fasta file. Therfore one can also set the protease (default trypsin) and the minimum and maximum peptide length (7 to 35). Wee dont need to change those parameters here, as we wont make use of the digestion functions but rather provide a already digested sequence table. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "speclib = PredictSpecLibFasta(model_manager=model_mgr,\n", + " precursor_charge_min=1,\n", + " precursor_charge_max=3,\n", + " fix_mods=[])" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins the peptide sequences could originate from can be infered using prot_infer (demonstrated below). " + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_cterm
016817680.793702EMSEFHNY00000FalseFalse
113013880.817415KDALLVGV11111FalseFalse
213714580.751329VPAGSNPF22222FalseFalse
317017880.940019SEFHNYNL33333FalseFalse
418118980.895964KSDFSTRW44444FalseFalse
.......................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalse
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalse
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalse
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalse
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalse
\n", + "

148 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + "[148 rows x 12 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequences['protein_id'] = [str(i) for i in range(len(sequences))]\n", + "sequences['protein_idxes'] = sequences.protein_id.astype(\"U\")\n", + "sequences['full_name'] = sequences['protein_id'] \n", + "sequences['gene_org'] = sequences['protein_id'] \n", + "sequences['gene_name'] = sequences['protein_id']\n", + "sequences[\"is_prot_nterm\"] = False\n", + "sequences[\"is_prot_cterm\"] = False\n", + "sequences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sequence dataframe contains all the relevant information to be passed to the protein_df and the precursor_df." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idnAAfull_namegene_orggene_name
0EMSEFHNY08000
1KDALLVGV18111
2VPAGSNPF28222
3SEFHNYNL38333
4KSDFSTRW48444
.....................
143QSAEEAFLLVATAY14314143143143
144SPNLLTIIEMQKGD14414144144144
145LLSPGWGAGAAGRR14514145145145
146TLKVSQAAAELQQY14614146146146
147LSPGWGAGAAGRRW14714147147147
\n", + "

148 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_id nAA full_name gene_org gene_name\n", + "0 EMSEFHNY 0 8 0 0 0\n", + "1 KDALLVGV 1 8 1 1 1\n", + "2 VPAGSNPF 2 8 2 2 2\n", + "3 SEFHNYNL 3 8 3 3 3\n", + "4 KSDFSTRW 4 8 4 4 4\n", + ".. ... ... ... ... ... ...\n", + "143 QSAEEAFLLVATAY 143 14 143 143 143\n", + "144 SPNLLTIIEMQKGD 144 14 144 144 144\n", + "145 LLSPGWGAGAAGRR 145 14 145 145 145\n", + "146 TLKVSQAAAELQQY 146 14 146 146 146\n", + "147 LSPGWGAGAAGRRW 147 14 147 147 147\n", + "\n", + "[148 rows x 6 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.protein_df = sequences[[\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']].copy()\n", + "speclib.protein_df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df = sequences[[\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']].copy()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, the modifications and charges can be added to the peptide dataframe using add_modifications and add_charge. This creates a unique entry for every combination of charge and modification for all the sequences in the precursor dataframe. " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sitescharge
0EMSEFHNY016817680.793702FalseFalseOxidation@M21
1EMSEFHNY016817680.793702FalseFalseOxidation@M22
2EMSEFHNY016817680.793702FalseFalseOxidation@M23
3EMSEFHNY016817680.793702FalseFalse1
4EMSEFHNY016817680.793702FalseFalse2
....................................
493TLKVSQAAAELQQY146110124140.891976FalseFalse2
494TLKVSQAAAELQQY146110124140.891976FalseFalse3
495LSPGWGAGAAGRRW147620140.842583FalseFalse1
496LSPGWGAGAAGRRW147620140.842583FalseFalse2
497LSPGWGAGAAGRRW147620140.842583FalseFalse3
\n", + "

498 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 EMSEFHNY 0 168 176 8 0.793702 \n", + ".. ... ... ... ... ... ... \n", + "493 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "494 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "495 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "496 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "497 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites charge \n", + "0 False False Oxidation@M 2 1 \n", + "1 False False Oxidation@M 2 2 \n", + "2 False False Oxidation@M 2 3 \n", + "3 False False 1 \n", + "4 False False 2 \n", + ".. ... ... ... ... ... \n", + "493 False False 2 \n", + "494 False False 3 \n", + "495 False False 1 \n", + "496 False False 2 \n", + "497 False False 3 \n", + "\n", + "[498 rows x 11 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.add_modifications()\n", + "speclib.add_charge()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now ccs, rt and ms2 can be predicted for each entry" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-19 14:19:49> Predicting RT ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 65.96it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting mobility ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 70.12it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting MS2 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 23.54it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:50> End predicting RT/IM/MS2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "speclib.predict_all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "iRTs can be added using translate_rt_to_irt_pred. This is not neccessary for search engines like DIA-NN or AlphaDIA but required for Spectronaut." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predict RT for 11 iRT precursors.\n", + "Linear regression of `rt_pred` to `irt`:\n", + " R_square R slope intercept test_num\n", + "0 0.99007 0.995022 152.235621 -39.23216 11\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sites...precursor_mzrt_predrt_norm_predccs_predmobility_prednceinstrumentfrag_start_idxfrag_stop_idxirt_pred
0EMSEFHNY016817680.793702FalseFalseOxidation@M2...1072.4040370.1896500.189650254.1959231.25314030.0Lumos07-10.360729
1EMSEFHNY016817680.793702FalseFalseOxidation@M2...536.7056570.1896500.189650337.3285830.83149430.0Lumos714-10.360729
2EMSEFHNY016817680.793702FalseFalse...1056.4091230.2892610.289261255.1037601.25737330.0Lumos14214.803679
3EMSEFHNY016817680.793702FalseFalse...528.7082000.2892610.289261337.4446410.83162130.0Lumos21284.803679
4KDALLVGV113013880.817415FalseFalse...814.5032800.4337910.433791256.6152341.26000130.0Lumos283526.806266
..................................................................
395TLKVSQAAAELQQY146110124140.891976FalseFalse...775.4146620.4895450.489545429.3608701.06251430.0Lumos3810382335.294021
396TLKVSQAAAELQQY146110124140.891976FalseFalse...517.2788670.4895450.489545463.2311100.76422530.0Lumos3823383635.294021
397LSPGWGAGAAGRRW147620140.842583FalseFalse...1441.7447420.3777430.377743289.2009891.43037830.0Lumos3836384918.273781
398LSPGWGAGAAGRRW147620140.842583FalseFalse...721.3760090.3777430.377743404.6336671.00065930.0Lumos3849386218.273781
399LSPGWGAGAAGRRW147620140.842583FalseFalse...481.2530980.3777430.377743476.6556400.78585130.0Lumos3862387518.273781
\n", + "

400 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 KDALLVGV 1 130 138 8 0.817415 \n", + ".. ... ... ... ... ... ... \n", + "395 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "396 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "397 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "398 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "399 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites ... precursor_mz \\\n", + "0 False False Oxidation@M 2 ... 1072.404037 \n", + "1 False False Oxidation@M 2 ... 536.705657 \n", + "2 False False ... 1056.409123 \n", + "3 False False ... 528.708200 \n", + "4 False False ... 814.503280 \n", + ".. ... ... ... ... ... ... \n", + "395 False False ... 775.414662 \n", + "396 False False ... 517.278867 \n", + "397 False False ... 1441.744742 \n", + "398 False False ... 721.376009 \n", + "399 False False ... 481.253098 \n", + "\n", + " rt_pred rt_norm_pred ccs_pred mobility_pred nce instrument \\\n", + "0 0.189650 0.189650 254.195923 1.253140 30.0 Lumos \n", + "1 0.189650 0.189650 337.328583 0.831494 30.0 Lumos \n", + "2 0.289261 0.289261 255.103760 1.257373 30.0 Lumos \n", + "3 0.289261 0.289261 337.444641 0.831621 30.0 Lumos \n", + "4 0.433791 0.433791 256.615234 1.260001 30.0 Lumos \n", + ".. ... ... ... ... ... ... \n", + "395 0.489545 0.489545 429.360870 1.062514 30.0 Lumos \n", + "396 0.489545 0.489545 463.231110 0.764225 30.0 Lumos \n", + "397 0.377743 0.377743 289.200989 1.430378 30.0 Lumos \n", + "398 0.377743 0.377743 404.633667 1.000659 30.0 Lumos \n", + "399 0.377743 0.377743 476.655640 0.785851 30.0 Lumos \n", + "\n", + " frag_start_idx frag_stop_idx irt_pred \n", + "0 0 7 -10.360729 \n", + "1 7 14 -10.360729 \n", + "2 14 21 4.803679 \n", + "3 21 28 4.803679 \n", + "4 28 35 26.806266 \n", + ".. ... ... ... \n", + "395 3810 3823 35.294021 \n", + "396 3823 3836 35.294021 \n", + "397 3836 3849 18.273781 \n", + "398 3849 3862 18.273781 \n", + "399 3862 3875 18.273781 \n", + "\n", + "[400 rows x 21 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.translate_rt_to_irt_pred()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the predicted library can be exported in an hdf format (AlphaDIA) or translated to a tsv. The tsv translation can be very time consuming. Before the spectral library can be translated, the gene and protein column need to be mapped from the protein_df into the precursor_df. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", + "\n", + "speclib.save_hdf(hdf_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.50s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translation finished, it will take several minutes to export the rest precursors to the tsv file...\n" + ] + } + ], + "source": [ + "from peptdeep.spec_lib.translate import translate_to_tsv\n", + "speclib.append_protein_name()\n", + "translate_to_tsv(speclib=speclib, \n", + " tsv = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. matching peptides back to proteins" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The peptide sequnces can be matched back to proteins using ProteinInfer, requiring a 'sequence' column. ProteinInfer can be provided with any number of fasta files and lists all the proteins the peptide sequence appears in. This can be done with the sequence output of any search engine or before the library is generated. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from alphabase.??? import ProteinInfer\n", + "infer = ProteinInfer(fasta_path)\n", + "infer_df = infer.infer_peptides(sequences.sequence.values)\n", + "infer_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequences_infered = pd.merge(sequences, infer_df, how = 'left', on ='sequence')" + ] } ], "metadata": { @@ -1582,7 +3463,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.12" } }, "nbformat": 4, From 503ca255a5489baabad86d1e97c5b655a1d5cd1f Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Mon, 22 Jul 2024 09:23:37 +0200 Subject: [PATCH 06/10] include protein annotation --- .../tutorial_immunopeptidomics.ipynb | 822 ++++++++++++++---- 1 file changed, 648 insertions(+), 174 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index f8f27618..58be219d 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -126,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -206,7 +206,7 @@ "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " ] }, - "execution_count": 40, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -909,7 +909,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.23it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" ] }, { @@ -1129,7 +1129,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:16:34> Training with fixed sequence length: 0\n" + "2024-07-22 09:21:38> Training with fixed sequence length: 0\n" ] }, { @@ -1144,7 +1144,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.403803927557809\n" + "[Training] Epoch=1, lr=2e-05, loss=1.415909733091082\n" ] }, { @@ -1159,7 +1159,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=2, 
lr=4e-05, loss=1.0939611451966422\n" + "[Training] Epoch=2, lr=4e-05, loss=1.0947138496807642\n" ] }, { @@ -1174,7 +1174,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8742348296301705\n" + "[Training] Epoch=3, lr=6e-05, loss=0.8823633790016174\n" ] }, { @@ -1189,7 +1189,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7860026274408612\n" + "[Training] Epoch=4, lr=8e-05, loss=0.7819523641041347\n" ] }, { @@ -1204,7 +1204,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7296201757022313\n" + "[Training] Epoch=5, lr=0.0001, loss=0.7255220583506993\n" ] }, { @@ -1219,7 +1219,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.7098635860851833\n" + "[Training] Epoch=6, lr=0.0001, loss=0.705090846334185\n" ] }, { @@ -1234,7 +1234,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7049905742917743\n" + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013667055538723\n" ] }, { @@ -1249,7 +1249,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6990227273532322\n" + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6968921593257359\n" ] }, { @@ -1264,8 +1264,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6956126008714948\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6955537881170001\n" + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6968518495559692\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6932548114231655\n" ] }, { @@ -1327,15 +1327,15 @@ " \n", " 0\n", " 0.5\n", - " 0.504579\n", - " 0.563971\n", - " 0.553736\n", + " 0.496400\n", + " 0.599795\n", + " 0.608495\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.488889\n", - " 0.011259\n", + " 0.622951\n", + " 0.019447\n", " 0.011771\n", " \n", " \n", @@ -1365,8 +1365,8 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.504579 0.563971 0.553736\n", - "1 0.6 0.488889 0.011259 0.011771\n", + "0 0.5 0.496400 0.599795 0.608495\n", + "1 0.6 0.622951 0.019447 0.011771\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" @@ -1417,16 +1417,16 @@ " \n", " 0\n", " 0.5\n", - " 0.484288\n", + " 0.480159\n", + " 0.494888\n", " 0.535787\n", - " 0.570552\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.285714\n", - " 0.004090\n", - " 0.010225\n", + " 0.461538\n", + " 0.012270\n", + " 0.014315\n", " \n", " \n", " 2\n", @@ -1455,8 +1455,8 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.484288 0.535787 0.570552\n", - "1 0.6 0.285714 0.004090 0.010225\n", + "0 0.5 0.480159 0.494888 0.535787\n", + "1 0.6 0.461538 0.012270 0.014315\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" @@ -1480,14 +1480,14 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" ] }, { @@ -1521,195 +1521,355 @@ " \n", " \n", " 0\n", - " 170\n", - " 178\n", + " 143\n", + " 151\n", " 8\n", - " 0.673504\n", - " SEFHNYNL\n", + " 0.606630\n", + " PFREPRSC\n", " \n", " \n", " 1\n", - " 181\n", - " 189\n", + 
" 170\n", + " 178\n", " 8\n", - " 0.617312\n", - " KSDFSTRW\n", + " 0.697908\n", + " SEFHNYNL\n", " \n", " \n", " 2\n", - " 309\n", - " 317\n", + " 62\n", + " 70\n", " 8\n", - " 0.615951\n", - " MGLVHIPT\n", + " 0.602259\n", + " KAEIARAY\n", " \n", " \n", " 3\n", + " 87\n", + " 95\n", + " 8\n", + " 0.611214\n", + " DEGPGRTP\n", + " \n", + " \n", + " 4\n", " 299\n", " 307\n", " 8\n", - " 0.601598\n", + " 0.611188\n", " LLKLVKSY\n", " \n", " \n", - " 4\n", + " 5\n", " 346\n", " 354\n", " 8\n", - " 0.609713\n", + " 0.620160\n", " YASSFKGY\n", " \n", " \n", - " 5\n", + " 6\n", " 344\n", " 352\n", " 8\n", - " 0.635873\n", + " 0.601700\n", " ALYASSFK\n", " \n", " \n", - " 6\n", + " 7\n", + " 223\n", + " 231\n", + " 8\n", + " 0.605099\n", + " IMVTIWSA\n", + " \n", + " \n", + " 8\n", + " 258\n", + " 266\n", + " 8\n", + " 0.618778\n", + " ICYKNNCY\n", + " \n", + " \n", + " 9\n", + " 363\n", + " 371\n", + " 8\n", + " 0.602542\n", + " YICMQRTV\n", + " \n", + " \n", + " 10\n", + " 17\n", + " 25\n", + " 8\n", + " 0.605605\n", + " RRWWMLLA\n", + " \n", + " \n", + " 11\n", " 294\n", " 303\n", " 9\n", - " 0.600454\n", + " 0.605901\n", " KEDQDLLKL\n", " \n", " \n", - " 7\n", + " 12\n", " 298\n", " 307\n", " 9\n", - " 0.628539\n", + " 0.635218\n", " DLLKLVKSY\n", " \n", " \n", - " 8\n", - " 74\n", - " 83\n", + " 13\n", + " 235\n", + " 244\n", " 9\n", - " 0.602105\n", - " RRYHPDRYR\n", + " 0.610781\n", + " SLFNQEVQI\n", " \n", " \n", - " 9\n", + " 14\n", + " 221\n", + " 230\n", + " 9\n", + " 0.622273\n", + " FIIMVTIWS\n", + " \n", + " \n", + " 15\n", + " 222\n", + " 231\n", + " 9\n", + " 0.612193\n", + " IIMVTIWSA\n", + " \n", + " \n", + " 16\n", + " 22\n", + " 31\n", + " 9\n", + " 0.604106\n", + " LLAPLLPAL\n", + " \n", + " \n", + " 17\n", + " 257\n", + " 266\n", + " 9\n", + " 0.620672\n", + " WICYKNNCY\n", + " \n", + " \n", + " 18\n", + " 267\n", + " 276\n", + " 9\n", + " 0.614074\n", + " FFDESKNWY\n", + " \n", + " \n", + " 19\n", + " 17\n", + " 26\n", + " 9\n", + " 0.604426\n", + " RRWWMLLAP\n", + " \n", + " \n", + " 20\n", + " 327\n", + " 336\n", + " 9\n", + " 0.614008\n", + " ILSPNLLTI\n", + " \n", + " \n", + " 21\n", + " 255\n", + " 265\n", + " 10\n", + " 0.601901\n", + " KNWICYKNNC\n", + " \n", + " \n", + " 22\n", " 344\n", " 354\n", " 10\n", - " 0.625569\n", + " 0.630664\n", " ALYASSFKGY\n", " \n", " \n", - " 10\n", + " 23\n", " 232\n", " 242\n", " 10\n", - " 0.607737\n", + " 0.634032\n", " FLNSLFNQEV\n", " \n", " \n", - " 11\n", + " 24\n", + " 221\n", + " 231\n", + " 10\n", + " 0.632162\n", + " FIIMVTIWSA\n", + " \n", + " \n", + " 25\n", + " 222\n", + " 232\n", + " 10\n", + " 0.606705\n", + " IIMVTIWSAV\n", + " \n", + " \n", + " 26\n", " 353\n", " 363\n", " 10\n", - " 0.610844\n", + " 0.611286\n", " YIENCSTPNT\n", " \n", " \n", - " 12\n", - " 53\n", - " 63\n", + " 27\n", + " 205\n", + " 215\n", " 10\n", - " 0.608182\n", - " VLGVSRSAGK\n", + " 0.606018\n", + " SPFFFCCFIA\n", " \n", " \n", - " 13\n", - " 298\n", - " 309\n", + " 28\n", + " 195\n", + " 206\n", " 11\n", - " 0.608567\n", - " DLLKLVKSYHW\n", + " 0.607188\n", + " VVKSKCRENAS\n", " \n", " \n", - " 14\n", - " 351\n", - " 362\n", + " 29\n", + " 221\n", + " 232\n", " 11\n", - " 0.607036\n", - " KGYIENCSTPN\n", + " 0.616940\n", + " FIIMVTIWSAV\n", " \n", " \n", - " 15\n", - " 52\n", - " 63\n", + " 30\n", + " 298\n", + " 309\n", " 11\n", - " 0.635592\n", - " EVLGVSRSAGK\n", + " 0.600725\n", + " DLLKLVKSYHW\n", " \n", " \n", - " 16\n", - " 130\n", - " 142\n", + " 31\n", + " 353\n", + " 364\n", + " 11\n", + " 0.618278\n", + " YIENCSTPNTY\n", + " 
\n", + " \n", + " 32\n", + " 42\n", + " 54\n", " 12\n", - " 0.601588\n", - " KDALLVGVPAGS\n", + " 0.606224\n", + " GLYCGTRDCYEV\n", " \n", " \n", - " 17\n", + " 33\n", " 351\n", " 363\n", " 12\n", - " 0.632752\n", + " 0.633097\n", " KGYIENCSTPNT\n", " \n", " \n", - " 18\n", - " 86\n", - " 99\n", + " 34\n", + " 200\n", + " 212\n", + " 12\n", + " 0.608198\n", + " CRENASPFFFCC\n", + " \n", + " \n", + " 35\n", + " 224\n", + " 236\n", + " 12\n", + " 0.606180\n", + " MVTIWSAVFLNS\n", + " \n", + " \n", + " 36\n", + " 195\n", + " 207\n", + " 12\n", + " 0.612207\n", + " VVKSKCRENASP\n", + " \n", + " \n", + " 37\n", + " 166\n", + " 179\n", " 13\n", - " 0.608231\n", - " GDEGPGRTPQSAE\n", + " 0.628934\n", + " SWEMSEFHNYNLD\n", " \n", " \n", - " 19\n", - " 141\n", - " 154\n", + " 38\n", + " 351\n", + " 364\n", " 13\n", - " 0.603257\n", - " SNPFREPRSCALL\n", + " 0.604953\n", + " KGYIENCSTPNTY\n", " \n", " \n", - " 20\n", - " 32\n", - " 45\n", + " 39\n", + " 35\n", + " 48\n", " 13\n", - " 0.608740\n", - " LVRPAGALVEGLY\n", + " 0.601324\n", + " PAGALVEGLYCGT\n", " \n", " \n", - " 21\n", + " 40\n", " 130\n", " 143\n", " 13\n", - " 0.620658\n", + " 0.603384\n", " KDALLVGVPAGSN\n", " \n", " \n", - " 22\n", + " 41\n", + " 333\n", + " 347\n", + " 14\n", + " 0.601238\n", + " LTIIEMQKGDCALY\n", + " \n", + " \n", + " 42\n", " 185\n", " 199\n", " 14\n", - " 0.625906\n", + " 0.610031\n", " STRWQKQRCPVVKS\n", " \n", " \n", - " 23\n", - " 60\n", - " 74\n", + " 43\n", + " 117\n", + " 131\n", " 14\n", - " 0.684522\n", - " AGKAEIARAYRQLA\n", + " 0.600326\n", + " AAELQQYCMQNACK\n", " \n", " \n", "\n", @@ -1717,33 +1877,53 @@ ], "text/plain": [ " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 170 178 8 0.673504 SEFHNYNL\n", - "1 181 189 8 0.617312 KSDFSTRW\n", - "2 309 317 8 0.615951 MGLVHIPT\n", - "3 299 307 8 0.601598 LLKLVKSY\n", - "4 346 354 8 0.609713 YASSFKGY\n", - "5 344 352 8 0.635873 ALYASSFK\n", - "6 294 303 9 0.600454 KEDQDLLKL\n", - "7 298 307 9 0.628539 DLLKLVKSY\n", - "8 74 83 9 0.602105 RRYHPDRYR\n", - "9 344 354 10 0.625569 ALYASSFKGY\n", - "10 232 242 10 0.607737 FLNSLFNQEV\n", - "11 353 363 10 0.610844 YIENCSTPNT\n", - "12 53 63 10 0.608182 VLGVSRSAGK\n", - "13 298 309 11 0.608567 DLLKLVKSYHW\n", - "14 351 362 11 0.607036 KGYIENCSTPN\n", - "15 52 63 11 0.635592 EVLGVSRSAGK\n", - "16 130 142 12 0.601588 KDALLVGVPAGS\n", - "17 351 363 12 0.632752 KGYIENCSTPNT\n", - "18 86 99 13 0.608231 GDEGPGRTPQSAE\n", - "19 141 154 13 0.603257 SNPFREPRSCALL\n", - "20 32 45 13 0.608740 LVRPAGALVEGLY\n", - "21 130 143 13 0.620658 KDALLVGVPAGSN\n", - "22 185 199 14 0.625906 STRWQKQRCPVVKS\n", - "23 60 74 14 0.684522 AGKAEIARAYRQLA" + "0 143 151 8 0.606630 PFREPRSC\n", + "1 170 178 8 0.697908 SEFHNYNL\n", + "2 62 70 8 0.602259 KAEIARAY\n", + "3 87 95 8 0.611214 DEGPGRTP\n", + "4 299 307 8 0.611188 LLKLVKSY\n", + "5 346 354 8 0.620160 YASSFKGY\n", + "6 344 352 8 0.601700 ALYASSFK\n", + "7 223 231 8 0.605099 IMVTIWSA\n", + "8 258 266 8 0.618778 ICYKNNCY\n", + "9 363 371 8 0.602542 YICMQRTV\n", + "10 17 25 8 0.605605 RRWWMLLA\n", + "11 294 303 9 0.605901 KEDQDLLKL\n", + "12 298 307 9 0.635218 DLLKLVKSY\n", + "13 235 244 9 0.610781 SLFNQEVQI\n", + "14 221 230 9 0.622273 FIIMVTIWS\n", + "15 222 231 9 0.612193 IIMVTIWSA\n", + "16 22 31 9 0.604106 LLAPLLPAL\n", + "17 257 266 9 0.620672 WICYKNNCY\n", + "18 267 276 9 0.614074 FFDESKNWY\n", + "19 17 26 9 0.604426 RRWWMLLAP\n", + "20 327 336 9 0.614008 ILSPNLLTI\n", + "21 255 265 10 0.601901 KNWICYKNNC\n", + "22 344 354 10 0.630664 ALYASSFKGY\n", + "23 232 242 10 0.634032 
FLNSLFNQEV\n", + "24 221 231 10 0.632162 FIIMVTIWSA\n", + "25 222 232 10 0.606705 IIMVTIWSAV\n", + "26 353 363 10 0.611286 YIENCSTPNT\n", + "27 205 215 10 0.606018 SPFFFCCFIA\n", + "28 195 206 11 0.607188 VVKSKCRENAS\n", + "29 221 232 11 0.616940 FIIMVTIWSAV\n", + "30 298 309 11 0.600725 DLLKLVKSYHW\n", + "31 353 364 11 0.618278 YIENCSTPNTY\n", + "32 42 54 12 0.606224 GLYCGTRDCYEV\n", + "33 351 363 12 0.633097 KGYIENCSTPNT\n", + "34 200 212 12 0.608198 CRENASPFFFCC\n", + "35 224 236 12 0.606180 MVTIWSAVFLNS\n", + "36 195 207 12 0.612207 VVKSKCRENASP\n", + "37 166 179 13 0.628934 SWEMSEFHNYNLD\n", + "38 351 364 13 0.604953 KGYIENCSTPNTY\n", + "39 35 48 13 0.601324 PAGALVEGLYCGT\n", + "40 130 143 13 0.603384 KDALLVGVPAGSN\n", + "41 333 347 14 0.601238 LTIIEMQKGDCALY\n", + "42 185 199 14 0.610031 STRWQKQRCPVVKS\n", + "43 117 131 14 0.600326 AAELQQYCMQNACK" ] }, - "execution_count": 39, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1768,7 +1948,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1790,7 +1970,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1809,7 +1989,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -2048,7 +2228,7 @@ "[148 rows x 12 columns]" ] }, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2073,7 +2253,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -2227,7 +2407,7 @@ "[148 rows x 6 columns]" ] }, - "execution_count": 25, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2239,7 +2419,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -2430,7 +2610,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 26, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2442,7 +2622,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -2633,7 +2813,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 27, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2651,7 +2831,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -2878,7 +3058,7 @@ "[498 rows x 11 columns]" ] }, - "execution_count": 28, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2898,29 +3078,29 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting RT/IM/MS2 for 400 precursors ...\n", - "2024-07-19 14:19:49> Predicting RT ...\n" + "2024-07-22 09:22:23> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-22 09:22:23> Predicting RT ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 7/7 [00:00<00:00, 65.96it/s]" + "100%|██████████| 7/7 [00:00<00:00, 69.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting mobility ...\n" + "2024-07-22 09:22:23> Predicting mobility ...\n" ] }, { @@ -2928,14 +3108,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 70.12it/s]" + "100%|██████████| 7/7 
[00:00<00:00, 72.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting MS2 ...\n" + "2024-07-22 09:22:23> Predicting MS2 ...\n" ] }, { @@ -2943,14 +3123,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 23.54it/s]" + "100%|██████████| 7/7 [00:00<00:00, 22.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:50> End predicting RT/IM/MS2\n" + "2024-07-22 09:22:24> End predicting RT/IM/MS2\n" ] }, { @@ -2974,7 +3154,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -3357,7 +3537,7 @@ "[400 rows x 21 columns]" ] }, - "execution_count": 30, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -3375,7 +3555,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -3386,14 +3566,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.50s/it]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.51s/it]\n" ] }, { @@ -3422,19 +3602,315 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The peptide sequnces can be matched back to proteins using ProteinInfer, requiring a 'sequence' column. ProteinInfer can be provided with any number of fasta files and lists all the proteins the peptide sequence appears in. This can be done with the sequence output of any search engine or before the library is generated. " + "The peptide sequnces can be matched back to proteins using annotate_precursor_df, requiring a 'sequence' column and a protein_df like the previously loaded fasta file. This can be done with the sequence output of any search engine or before the library is generated. 
" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_ctermgenesproteinscardinality
016817680.793702EMSEFHNY00000FalseFalseA0A024RAP8_HUMANA0A024RAP81
113013880.817415KDALLVGV11111FalseFalseA0A024R161_HUMANA0A024R1611
213714580.751329VPAGSNPF22222FalseFalseA0A024R161_HUMANA0A024R1611
317017880.940019SEFHNYNL33333FalseFalseA0A024RAP8_HUMANA0A024RAP81
418118980.895964KSDFSTRW44444FalseFalseA0A024RAP8_HUMANA0A024RAP81
................................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalseA0A024R161_HUMANA0A024R1611
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalseA0A024RAP8_HUMANA0A024RAP81
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalseA0A024R161_HUMANA0A024R1611
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalseA0A024R161_HUMANA0A024R1611
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalseA0A024R161_HUMANA0A024R1611
\n", + "

148 rows × 15 columns

\n", + "" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \\\n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + " genes proteins cardinality \n", + "0 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "1 A0A024R161_HUMAN A0A024R161 1 \n", + "2 A0A024R161_HUMAN A0A024R161 1 \n", + "3 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "4 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + ".. ... ... ... \n", + "143 A0A024R161_HUMAN A0A024R161 1 \n", + "144 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "145 A0A024R161_HUMAN A0A024R161 1 \n", + "146 A0A024R161_HUMAN A0A024R161 1 \n", + "147 A0A024R161_HUMAN A0A024R161 1 \n", + "\n", + "[148 rows x 15 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# from alphabase.??? import ProteinInfer\n", - "infer = ProteinInfer(fasta_path)\n", - "infer_df = infer.infer_peptides(sequences.sequence.values)\n", - "infer_df" + "from alphabase.protein.fasta import annotate_precursor_df\n", + "inferred_sequences = annotate_precursor_df(sequences, fasta)\n", + "inferred_sequences" ] }, { @@ -3442,9 +3918,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "sequences_infered = pd.merge(sequences, infer_df, how = 'left', on ='sequence')" - ] + "source": [] } ], "metadata": { From dc0326ccc03ee191a8001c1e23e611ef2a802e97 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 10:08:15 +0200 Subject: [PATCH 07/10] #183 ruff reformat --- peptdeep/hla/hla_class1.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py index 67f5e575..f093d53b 100644 --- a/peptdeep/hla/hla_class1.py +++ b/peptdeep/hla/hla_class1.py @@ -380,19 +380,17 @@ def predict_from_proteins( peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) return peptide_df - def _concat_neg_df(self, precursor_df, column_to_train='HLA'): + def _concat_neg_df(self, precursor_df, column_to_train="HLA"): precursor_df = append_nAA_column_if_missing(precursor_df) precursor_df[column_to_train] = 1 df_list = [precursor_df] - for nAA, group_df in precursor_df.groupby('nAA'): + for nAA, group_df in precursor_df.groupby("nAA"): rnd_seqs = get_random_sequences( - self.protein_df, - n=len(group_df), - pep_len = nAA + self.protein_df, n=len(group_df), pep_len=nAA + ) + df_list.append( + pd.DataFrame({"sequence": rnd_seqs, "nAA": nAA, column_to_train: 0}) ) - df_list.append(pd.DataFrame( - {'sequence':rnd_seqs,'nAA':nAA,column_to_train:0} - )) return pd.concat(df_list).reset_index(drop=True) def 
test(self, precursor_df): @@ -402,17 +400,21 @@ def test(self, precursor_df): precision_list = [] recall_list = [] fp_list = [] - for prob in [0.5,0.6,0.7,0.8, 0.9]: + for prob in [0.5, 0.6, 0.7, 0.8, 0.9]: prob_list.append(prob) - precision_list.append(df[df.HLA_prob_pred>prob].HLA.mean()) - recall_list.append(df[df.HLA_prob_pred>prob].HLA.sum()/len(df)*2) - fp_list.append(1-(1-df[df.HLA_prob_pred prob].HLA.mean()) + recall_list.append(df[df.HLA_prob_pred > prob].HLA.sum() / len(df) * 2) + fp_list.append( + 1 - (1 - df[df.HLA_prob_pred < prob].HLA).sum() / len(df) * 2 + ) + return pd.DataFrame( + dict( + HLA_prob_pred=prob_list, + precision=precision_list, + recall=recall_list, + false_positive=fp_list, + ) + ) def _download_pretrained_hla_model(self): download_models(url=self._model_url, target_path=self._model_zip) From d500554a685d14d9d4608c0dfb9dded2450c3488 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 14:27:46 +0200 Subject: [PATCH 08/10] #183 hla tutorial move fasta to cur folder --- docs/tutorials/example.fasta | 9 + .../tutorial_immunopeptidomics.ipynb | 987 +++++++----------- peptdeep/model/model_interface.py | 8 +- 3 files changed, 412 insertions(+), 592 deletions(-) create mode 100644 docs/tutorials/example.fasta diff --git a/docs/tutorials/example.fasta b/docs/tutorials/example.fasta new file mode 100644 index 00000000..5619e28a --- /dev/null +++ b/docs/tutorials/example.fasta @@ -0,0 +1,9 @@ +>tr|A0A024R161|A0A024R161_HUMAN Guanine nucleotide-binding protein subunit gamma OS=Homo sapiens GN=DNAJC25-GNG10 PE=3 SV=1 +MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSA +GKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAEL +QQYCMQNACKDALLVGVPAGSNPFREPRSCALL +>tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, isoform CRA_b OS=Homo sapiens GN=KLRC4-KLRK1 PE=4 SV=1 +MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIA +VAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNW +YESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLT +IIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 58be219d..c7d54dde 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -41,20 +41,21 @@ "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n", - "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n" - ] } ], "source": [ "%pip install -q pydivsufsort" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -84,51 +85,18 @@ "The protein sequences are concatenated into a single sequence. The sequences are seperated by a sentinel character, in this case '$', so that no peptides across proteins are formed. 
Note that the first and last sentinel characters are crutial as well.\n" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def concat_sequences_for_nonspecific_digestion(seq_list, sep=\"$\"):\n", - " return sep + sep.join(seq_list) + sep" - ] - }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "prot_seq_list = [\"MABCDEKFGHIJKLMNOPQRST\",\"FGHIJKLMNOPQR\"]\n", - "cat_prot = concat_sequences_for_nonspecific_digestion(prot_seq_list, sep=\"$\")\n", - "cat_prot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The same can be done directly from a fasta: \n", - "@ Feng do you have an example fasta somwhere? " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + }, { "data": { "text/html": [ @@ -206,21 +174,21 @@ "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from peptdeep.hla.hla_utils import load_prot_df\n", - "fasta_path = \"D:/Software/FASTA/Human/example.fasta\"\n", - "fasta = load_prot_df(fasta_path)\n", - "fasta" + "fasta_path = \"example.fasta\"\n", + "protein_df = load_prot_df(fasta_path)\n", + "protein_df" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -229,15 +197,15 @@ "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from peptdeep.hla.hla_utils import cat_proteins\n", - "cat_fasta = cat_proteins(fasta['sequence'])\n", - "cat_fasta" + "cat_sequence = cat_proteins(protein_df[\"sequence\"])\n", + "cat_sequence" ] }, { @@ -253,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -359,7 +327,7 @@ "[2443 rows x 2 columns]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -370,7 +338,7 @@ "import sys\n", "\n", "start_idxes, stop_idxes = get_substring_indices(\n", - " cat_fasta, min_len=8, max_len=14, stop_char=\"$\"\n", + " cat_sequence, min_len=8, max_len=14, stop_char=\"$\"\n", ")\n", "digest_pos_df = pd.DataFrame({\n", " \"start_pos\": start_idxes,\n", @@ -381,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -397,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -515,7 +483,7 @@ "[2443 rows x 3 columns]" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -523,13 +491,13 @@ "source": [ "digest_pos_df[\"sequence\"] = digest_pos_df[\n", " 
[\"start_pos\",\"stop_pos\"]\n", - "].apply(lambda x: cat_fasta[slice(*x)], axis=1)\n", + "].apply(lambda x: cat_sequence[slice(*x)], axis=1)\n", "digest_pos_df" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -538,16 +506,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" + "'seq RAM = 0.16623 Mb, idxes RAM = 0.01971, ratio = 8.43475'" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -560,7 +528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Selection of peptide sequences used for library prediction\n", + "## 2. Selection of peptide sequences used for library prediction\n", "The digest_prot_df contains all unspecifically digested peptide sequences between 7 and 14 aa generatable from the concatenated protein sequences. This list is reduced using a HLA1_Binding_Classifier from peptdeep.hla.hla_class1. Two different model architectures are available, an LSTM model (HLA_Class_I_LSTM) and a BERT model (HLA_Class_I_BERT). A pretrained model is only available for the LSTM model architecture.\n", "The HLA1_Binding_Classifer can be used with a pretrained model, tuned with existing peptide data or trained from scratch. Training of a new model should be considered carefully and will not be covered in this tutorial.\n", " " @@ -570,14 +538,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2 Selection of peptide seqeuence candidates without transferlearning\n", + "### 2.1 Selection of peptide seqeuence candidates without transferlearning\n", "\n", "Selection of peptide sequences for library predicition using the pretrained model can be done in a few steps. First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, we can use any kind of dataframe containing peptide sequences to predict how likely there are HLA peptides, the only requirement beeing that the column containing the peptides is called 'sequence'.\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -679,7 +647,7 @@ " 93\n", " DRYRPQPGDEGPGR\n", " 14\n", - " 0.060635\n", + " 0.060634\n", " \n", " \n", " 2441\n", @@ -712,14 +680,14 @@ "... ... ... ... ... 
...\n", "2438 112 126 KVSQAAAELQQYCM 14 0.243115\n", "2439 317 331 NGSWQWEDGSILSP 14 0.021114\n", - "2440 79 93 DRYRPQPGDEGPGR 14 0.060635\n", + "2440 79 93 DRYRPQPGDEGPGR 14 0.060634\n", "2441 113 127 VSQAAAELQQYCMQ 14 0.355900\n", "2442 190 204 KQRCPVVKSKCREN 14 0.000362\n", "\n", "[2443 rows x 5 columns]" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -742,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -884,7 +852,7 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -902,14 +870,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.27s/it]\n" ] }, { @@ -1051,14 +1019,14 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sequences = model.predict_from_proteins(fasta, prob_threshold=0.7)\n", - "sequences" + "sequence_df = model.predict_from_proteins(protein_df, prob_threshold=0.7)\n", + "sequence_df" ] }, { @@ -1074,13 +1042,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "model = HLA1_Binding_Classifier()\n", "model.load_pretrained_hla_model()\n", - "model.load_proteins(fasta)" + "model.load_proteins(fasta_path)" ] }, { @@ -1093,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1102,7 +1070,7 @@ "(1954, 489)" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1122,163 +1090,28 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:21:38> Training with fixed sequence length: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.415909733091082\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=2, lr=4e-05, loss=1.0947138496807642\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8823633790016174\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7819523641041347\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7255220583506993\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.705090846334185\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013667055538723\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6968921593257359\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6968518495559692\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6932548114231655\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + "2024-07-23 14:22:06> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, lr=4e-05, loss=1.39779794216156\n", + "[Training] Epoch=2, lr=6e-05, loss=1.0070140702383858\n", + "[Training] Epoch=3, lr=8e-05, loss=0.7982760497501918\n", + "[Training] Epoch=4, lr=0.0001, loss=0.7397338407380241\n", + "[Training] Epoch=5, lr=0.0001, loss=0.7099559647696358\n", + "[Training] Epoch=6, lr=9.045084971874738e-05, loss=0.7016251683235168\n", + "[Training] Epoch=7, lr=6.545084971874738e-05, loss=0.6965694086892265\n", + "[Training] Epoch=8, lr=3.4549150281252636e-05, loss=0.697939566203526\n", + "[Training] Epoch=9, lr=9.549150281252633e-06, loss=0.6959438664572579\n", + "[Training] Epoch=10, lr=1.0000000000000002e-14, loss=0.6928229417119708\n" ] } ], "source": [ - "\n", "model.train(train_seq_df,\n", " epoch=10, warmup_epoch=5, \n", " verbose=True)" @@ -1293,7 +1126,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1327,23 +1160,23 @@ " \n", " 0\n", " 0.5\n", - " 0.496400\n", - " 0.599795\n", - " 0.608495\n", + " 0.511434\n", + " 0.595189\n", + " 0.568577\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.622951\n", - " 0.019447\n", - " 0.011771\n", + " 0.416667\n", + " 0.017912\n", + " 0.025077\n", " \n", " \n", " 2\n", " 0.7\n", - " NaN\n", - " 0.000000\n", - " 0.000000\n", + " 0.333333\n", + " 0.000512\n", + " 0.001024\n", " \n", " \n", " 3\n", @@ -1365,14 +1198,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.496400 0.599795 0.608495\n", - "1 0.6 0.622951 0.019447 0.011771\n", - "2 0.7 NaN 0.000000 0.000000\n", + "0 0.5 0.511434 0.595189 0.568577\n", + "1 0.6 0.416667 0.017912 0.025077\n", + "2 0.7 0.333333 0.000512 0.001024\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1383,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1417,16 +1250,16 @@ " \n", " 0\n", " 0.5\n", - " 0.480159\n", - " 0.494888\n", - " 0.535787\n", + " 0.450192\n", + " 0.480573\n", + " 0.586912\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.461538\n", - " 0.012270\n", - " 0.014315\n", + " 0.470588\n", + " 0.016360\n", + " 0.018405\n", " \n", " \n", " 2\n", @@ -1455,14 +1288,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.480159 0.494888 0.535787\n", - "1 0.6 0.461538 0.012270 0.014315\n", + "0 0.5 0.450192 0.480573 0.586912\n", + "1 0.6 0.470588 0.016360 0.018405\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1480,14 +1313,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.32s/it]\n" ] }, { @@ -1521,422 +1354,413 @@ " \n", " \n", " 0\n", - " 143\n", - " 151\n", - " 8\n", - " 0.606630\n", - " PFREPRSC\n", - " \n", - " \n", - " 1\n", " 170\n", " 178\n", " 8\n", - " 0.697908\n", + " 0.711809\n", " SEFHNYNL\n", " \n", " \n", - " 2\n", + " 1\n", 
" 62\n", " 70\n", " 8\n", - " 0.602259\n", + " 0.627015\n", " KAEIARAY\n", " \n", " \n", - " 3\n", - " 87\n", - " 95\n", + " 2\n", + " 106\n", + " 114\n", " 8\n", - " 0.611214\n", - " DEGPGRTP\n", + " 0.628822\n", + " TAYETLKV\n", " \n", " \n", - " 4\n", + " 3\n", " 299\n", " 307\n", " 8\n", - " 0.611188\n", + " 0.605544\n", " LLKLVKSY\n", " \n", " \n", - " 5\n", + " 4\n", " 346\n", " 354\n", " 8\n", - " 0.620160\n", + " 0.646759\n", " YASSFKGY\n", " \n", " \n", - " 6\n", - " 344\n", - " 352\n", - " 8\n", - " 0.601700\n", - " ALYASSFK\n", - " \n", - " \n", - " 7\n", - " 223\n", - " 231\n", - " 8\n", - " 0.605099\n", - " IMVTIWSA\n", - " \n", - " \n", - " 8\n", + " 5\n", " 258\n", " 266\n", " 8\n", - " 0.618778\n", + " 0.624555\n", " ICYKNNCY\n", " \n", " \n", - " 9\n", - " 363\n", - " 371\n", - " 8\n", - " 0.602542\n", - " YICMQRTV\n", - " \n", - " \n", - " 10\n", - " 17\n", - " 25\n", - " 8\n", - " 0.605605\n", - " RRWWMLLA\n", - " \n", - " \n", - " 11\n", + " 6\n", " 294\n", " 303\n", " 9\n", - " 0.605901\n", + " 0.610476\n", " KEDQDLLKL\n", " \n", " \n", - " 12\n", + " 7\n", " 298\n", " 307\n", " 9\n", - " 0.635218\n", + " 0.645020\n", " DLLKLVKSY\n", " \n", " \n", - " 13\n", + " 8\n", " 235\n", " 244\n", " 9\n", - " 0.610781\n", + " 0.629079\n", " SLFNQEVQI\n", " \n", " \n", - " 14\n", - " 221\n", - " 230\n", - " 9\n", - " 0.622273\n", - " FIIMVTIWS\n", - " \n", - " \n", - " 15\n", - " 222\n", - " 231\n", - " 9\n", - " 0.612193\n", - " IIMVTIWSA\n", - " \n", - " \n", - " 16\n", - " 22\n", - " 31\n", - " 9\n", - " 0.604106\n", - " LLAPLLPAL\n", - " \n", - " \n", - " 17\n", + " 9\n", " 257\n", " 266\n", " 9\n", - " 0.620672\n", + " 0.623247\n", " WICYKNNCY\n", " \n", " \n", - " 18\n", + " 10\n", " 267\n", " 276\n", " 9\n", - " 0.614074\n", + " 0.611738\n", " FFDESKNWY\n", " \n", " \n", - " 19\n", + " 11\n", " 17\n", " 26\n", " 9\n", - " 0.604426\n", + " 0.605875\n", " RRWWMLLAP\n", " \n", " \n", - " 20\n", + " 12\n", " 327\n", " 336\n", " 9\n", - " 0.614008\n", + " 0.616737\n", " ILSPNLLTI\n", " \n", " \n", - " 21\n", - " 255\n", - " 265\n", - " 10\n", - " 0.601901\n", - " KNWICYKNNC\n", + " 13\n", + " 74\n", + " 83\n", + " 9\n", + " 0.611590\n", + " RRYHPDRYR\n", " \n", " \n", - " 22\n", + " 14\n", " 344\n", " 354\n", " 10\n", - " 0.630664\n", + " 0.662783\n", " ALYASSFKGY\n", " \n", " \n", - " 23\n", + " 15\n", " 232\n", " 242\n", " 10\n", - " 0.634032\n", + " 0.651600\n", " FLNSLFNQEV\n", " \n", " \n", - " 24\n", + " 16\n", " 221\n", " 231\n", " 10\n", - " 0.632162\n", + " 0.617175\n", " FIIMVTIWSA\n", " \n", " \n", - " 25\n", + " 17\n", " 222\n", " 232\n", " 10\n", - " 0.606705\n", + " 0.600623\n", " IIMVTIWSAV\n", " \n", " \n", - " 26\n", - " 353\n", - " 363\n", - " 10\n", - " 0.611286\n", - " YIENCSTPNT\n", - " \n", - " \n", - " 27\n", - " 205\n", - " 215\n", + " 18\n", + " 74\n", + " 84\n", " 10\n", - " 0.606018\n", - " SPFFFCCFIA\n", + " 0.614895\n", + " RRYHPDRYRP\n", " \n", " \n", - " 28\n", - " 195\n", - " 206\n", - " 11\n", - " 0.607188\n", - " VVKSKCRENAS\n", - " \n", - " \n", - " 29\n", + " 19\n", " 221\n", " 232\n", " 11\n", - " 0.616940\n", + " 0.608950\n", " FIIMVTIWSAV\n", " \n", " \n", - " 30\n", - " 298\n", - " 309\n", - " 11\n", - " 0.600725\n", - " DLLKLVKSYHW\n", - " \n", - " \n", - " 31\n", + " 20\n", " 353\n", " 364\n", " 11\n", - " 0.618278\n", + " 0.613787\n", " YIENCSTPNTY\n", " \n", " \n", - " 32\n", + " 21\n", + " 74\n", + " 85\n", + " 11\n", + " 0.605368\n", + " RRYHPDRYRPQ\n", + " \n", + " \n", + " 22\n", + " 112\n", + " 124\n", + " 12\n", + " 0.612270\n", 
+ " KVSQAAAELQQY\n", + " \n", + " \n", + " 23\n", " 42\n", " 54\n", " 12\n", - " 0.606224\n", + " 0.607715\n", " GLYCGTRDCYEV\n", " \n", " \n", - " 33\n", + " 24\n", " 351\n", " 363\n", " 12\n", - " 0.633097\n", + " 0.616891\n", " KGYIENCSTPNT\n", " \n", " \n", - " 34\n", - " 200\n", - " 212\n", + " 25\n", + " 74\n", + " 86\n", " 12\n", - " 0.608198\n", - " CRENASPFFFCC\n", + " 0.602210\n", + " RRYHPDRYRPQP\n", " \n", " \n", - " 35\n", - " 224\n", - " 236\n", - " 12\n", - " 0.606180\n", - " MVTIWSAVFLNS\n", + " 26\n", + " 86\n", + " 99\n", + " 13\n", + " 0.644656\n", + " GDEGPGRTPQSAE\n", " \n", " \n", - " 36\n", - " 195\n", - " 207\n", - " 12\n", - " 0.612207\n", - " VVKSKCRENASP\n", + " 27\n", + " 351\n", + " 364\n", + " 13\n", + " 0.603497\n", + " KGYIENCSTPNTY\n", " \n", " \n", - " 37\n", - " 166\n", - " 179\n", + " 28\n", + " 73\n", + " 86\n", " 13\n", - " 0.628934\n", - " SWEMSEFHNYNLD\n", + " 0.622453\n", + " ARRYHPDRYRPQP\n", " \n", " \n", - " 38\n", - " 351\n", - " 364\n", + " 29\n", + " 74\n", + " 87\n", " 13\n", - " 0.604953\n", - " KGYIENCSTPNTY\n", + " 0.611441\n", + " RRYHPDRYRPQPG\n", " \n", " \n", - " 39\n", - " 35\n", - " 48\n", + " 30\n", + " 334\n", + " 347\n", " 13\n", - " 0.601324\n", - " PAGALVEGLYCGT\n", + " 0.604354\n", + " TIIEMQKGDCALY\n", " \n", " \n", - " 40\n", + " 31\n", + " 141\n", + " 154\n", + " 13\n", + " 0.601309\n", + " SNPFREPRSCALL\n", + " \n", + " \n", + " 32\n", + " 32\n", + " 45\n", + " 13\n", + " 0.622797\n", + " LVRPAGALVEGLY\n", + " \n", + " \n", + " 33\n", " 130\n", " 143\n", " 13\n", - " 0.603384\n", + " 0.604786\n", " KDALLVGVPAGSN\n", " \n", " \n", - " 41\n", + " 34\n", " 333\n", " 347\n", " 14\n", - " 0.601238\n", + " 0.613545\n", " LTIIEMQKGDCALY\n", " \n", " \n", - " 42\n", - " 185\n", - " 199\n", + " 35\n", + " 60\n", + " 74\n", + " 14\n", + " 0.607648\n", + " AGKAEIARAYRQLA\n", + " \n", + " \n", + " 36\n", + " 85\n", + " 99\n", + " 14\n", + " 0.606241\n", + " PGDEGPGRTPQSAE\n", + " \n", + " \n", + " 37\n", + " 229\n", + " 243\n", + " 14\n", + " 0.606759\n", + " SAVFLNSLFNQEVQ\n", + " \n", + " \n", + " 38\n", + " 86\n", + " 100\n", + " 14\n", + " 0.622891\n", + " GDEGPGRTPQSAEE\n", + " \n", + " \n", + " 39\n", + " 167\n", + " 181\n", " 14\n", - " 0.610031\n", - " STRWQKQRCPVVKS\n", + " 0.611953\n", + " WEMSEFHNYNLDLK\n", " \n", " \n", - " 43\n", + " 40\n", " 117\n", " 131\n", " 14\n", - " 0.600326\n", + " 0.619257\n", " AAELQQYCMQNACK\n", " \n", + " \n", + " 41\n", + " 73\n", + " 87\n", + " 14\n", + " 0.608767\n", + " ARRYHPDRYRPQPG\n", + " \n", + " \n", + " 42\n", + " 329\n", + " 343\n", + " 14\n", + " 0.600299\n", + " SPNLLTIIEMQKGD\n", + " \n", " \n", "\n", "" ], "text/plain": [ " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 143 151 8 0.606630 PFREPRSC\n", - "1 170 178 8 0.697908 SEFHNYNL\n", - "2 62 70 8 0.602259 KAEIARAY\n", - "3 87 95 8 0.611214 DEGPGRTP\n", - "4 299 307 8 0.611188 LLKLVKSY\n", - "5 346 354 8 0.620160 YASSFKGY\n", - "6 344 352 8 0.601700 ALYASSFK\n", - "7 223 231 8 0.605099 IMVTIWSA\n", - "8 258 266 8 0.618778 ICYKNNCY\n", - "9 363 371 8 0.602542 YICMQRTV\n", - "10 17 25 8 0.605605 RRWWMLLA\n", - "11 294 303 9 0.605901 KEDQDLLKL\n", - "12 298 307 9 0.635218 DLLKLVKSY\n", - "13 235 244 9 0.610781 SLFNQEVQI\n", - "14 221 230 9 0.622273 FIIMVTIWS\n", - "15 222 231 9 0.612193 IIMVTIWSA\n", - "16 22 31 9 0.604106 LLAPLLPAL\n", - "17 257 266 9 0.620672 WICYKNNCY\n", - "18 267 276 9 0.614074 FFDESKNWY\n", - "19 17 26 9 0.604426 RRWWMLLAP\n", - "20 327 336 9 0.614008 ILSPNLLTI\n", - "21 255 265 10 0.601901 
KNWICYKNNC\n", - "22 344 354 10 0.630664 ALYASSFKGY\n", - "23 232 242 10 0.634032 FLNSLFNQEV\n", - "24 221 231 10 0.632162 FIIMVTIWSA\n", - "25 222 232 10 0.606705 IIMVTIWSAV\n", - "26 353 363 10 0.611286 YIENCSTPNT\n", - "27 205 215 10 0.606018 SPFFFCCFIA\n", - "28 195 206 11 0.607188 VVKSKCRENAS\n", - "29 221 232 11 0.616940 FIIMVTIWSAV\n", - "30 298 309 11 0.600725 DLLKLVKSYHW\n", - "31 353 364 11 0.618278 YIENCSTPNTY\n", - "32 42 54 12 0.606224 GLYCGTRDCYEV\n", - "33 351 363 12 0.633097 KGYIENCSTPNT\n", - "34 200 212 12 0.608198 CRENASPFFFCC\n", - "35 224 236 12 0.606180 MVTIWSAVFLNS\n", - "36 195 207 12 0.612207 VVKSKCRENASP\n", - "37 166 179 13 0.628934 SWEMSEFHNYNLD\n", - "38 351 364 13 0.604953 KGYIENCSTPNTY\n", - "39 35 48 13 0.601324 PAGALVEGLYCGT\n", - "40 130 143 13 0.603384 KDALLVGVPAGSN\n", - "41 333 347 14 0.601238 LTIIEMQKGDCALY\n", - "42 185 199 14 0.610031 STRWQKQRCPVVKS\n", - "43 117 131 14 0.600326 AAELQQYCMQNACK" + "0 170 178 8 0.711809 SEFHNYNL\n", + "1 62 70 8 0.627015 KAEIARAY\n", + "2 106 114 8 0.628822 TAYETLKV\n", + "3 299 307 8 0.605544 LLKLVKSY\n", + "4 346 354 8 0.646759 YASSFKGY\n", + "5 258 266 8 0.624555 ICYKNNCY\n", + "6 294 303 9 0.610476 KEDQDLLKL\n", + "7 298 307 9 0.645020 DLLKLVKSY\n", + "8 235 244 9 0.629079 SLFNQEVQI\n", + "9 257 266 9 0.623247 WICYKNNCY\n", + "10 267 276 9 0.611738 FFDESKNWY\n", + "11 17 26 9 0.605875 RRWWMLLAP\n", + "12 327 336 9 0.616737 ILSPNLLTI\n", + "13 74 83 9 0.611590 RRYHPDRYR\n", + "14 344 354 10 0.662783 ALYASSFKGY\n", + "15 232 242 10 0.651600 FLNSLFNQEV\n", + "16 221 231 10 0.617175 FIIMVTIWSA\n", + "17 222 232 10 0.600623 IIMVTIWSAV\n", + "18 74 84 10 0.614895 RRYHPDRYRP\n", + "19 221 232 11 0.608950 FIIMVTIWSAV\n", + "20 353 364 11 0.613787 YIENCSTPNTY\n", + "21 74 85 11 0.605368 RRYHPDRYRPQ\n", + "22 112 124 12 0.612270 KVSQAAAELQQY\n", + "23 42 54 12 0.607715 GLYCGTRDCYEV\n", + "24 351 363 12 0.616891 KGYIENCSTPNT\n", + "25 74 86 12 0.602210 RRYHPDRYRPQP\n", + "26 86 99 13 0.644656 GDEGPGRTPQSAE\n", + "27 351 364 13 0.603497 KGYIENCSTPNTY\n", + "28 73 86 13 0.622453 ARRYHPDRYRPQP\n", + "29 74 87 13 0.611441 RRYHPDRYRPQPG\n", + "30 334 347 13 0.604354 TIIEMQKGDCALY\n", + "31 141 154 13 0.601309 SNPFREPRSCALL\n", + "32 32 45 13 0.622797 LVRPAGALVEGLY\n", + "33 130 143 13 0.604786 KDALLVGVPAGSN\n", + "34 333 347 14 0.613545 LTIIEMQKGDCALY\n", + "35 60 74 14 0.607648 AGKAEIARAYRQLA\n", + "36 85 99 14 0.606241 PGDEGPGRTPQSAE\n", + "37 229 243 14 0.606759 SAVFLNSLFNQEVQ\n", + "38 86 100 14 0.622891 GDEGPGRTPQSAEE\n", + "39 167 181 14 0.611953 WEMSEFHNYNLDLK\n", + "40 117 131 14 0.619257 AAELQQYCMQNACK\n", + "41 73 87 14 0.608767 ARRYHPDRYRPQPG\n", + "42 329 343 14 0.600299 SPNLLTIIEMQKGD" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.predict_from_proteins(fasta, prob_threshold=0.6)" + "model.predict_from_proteins(fasta_path, prob_threshold=0.6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectral library prediciton" + "## 3. 
Spectral library prediciton" ] }, { @@ -1948,7 +1772,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1970,7 +1794,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1989,7 +1813,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -2228,20 +2052,20 @@ "[148 rows x 12 columns]" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sequences['protein_id'] = [str(i) for i in range(len(sequences))]\n", - "sequences['protein_idxes'] = sequences.protein_id.astype(\"U\")\n", - "sequences['full_name'] = sequences['protein_id'] \n", - "sequences['gene_org'] = sequences['protein_id'] \n", - "sequences['gene_name'] = sequences['protein_id']\n", - "sequences[\"is_prot_nterm\"] = False\n", - "sequences[\"is_prot_cterm\"] = False\n", - "sequences" + "sequence_df['protein_id'] = [str(i) for i in range(len(sequence_df))]\n", + "sequence_df['protein_idxes'] = sequence_df.protein_id.astype(\"U\")\n", + "sequence_df['full_name'] = sequence_df['protein_id'] \n", + "sequence_df['gene_org'] = sequence_df['protein_id'] \n", + "sequence_df['gene_name'] = sequence_df['protein_id']\n", + "sequence_df[\"is_prot_nterm\"] = False\n", + "sequence_df[\"is_prot_cterm\"] = False\n", + "sequence_df" ] }, { @@ -2253,7 +2077,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -2407,19 +2231,21 @@ "[148 rows x 6 columns]" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "speclib.protein_df = sequences[[\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']].copy()\n", + "speclib.protein_df = sequence_df[\n", + " [\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']\n", + "].copy()\n", "speclib.protein_df" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -2610,19 +2436,22 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "speclib.precursor_df = sequences[[\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']].copy()\n", + "speclib.precursor_df = sequence_df[\n", + " [\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\n", + " \"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']\n", + "].copy()\n", "speclib.precursor_df" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -2813,7 +2642,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2831,7 +2660,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -3058,7 +2887,7 @@ "[498 rows x 11 columns]" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -3078,29 +2907,29 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting RT/IM/MS2 for 400 precursors ...\n", - "2024-07-22 09:22:23> 
Predicting RT ...\n" + "2024-07-23 14:22:43> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-23 14:22:43> Predicting RT ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 7/7 [00:00<00:00, 69.31it/s]" + "100%|██████████| 7/7 [00:00<00:00, 27.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting mobility ...\n" + "2024-07-23 14:22:43> Predicting mobility ...\n" ] }, { @@ -3108,14 +2937,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 72.89it/s]" + "100%|██████████| 7/7 [00:00<00:00, 50.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting MS2 ...\n" + "2024-07-23 14:22:44> Predicting MS2 ...\n" ] }, { @@ -3123,14 +2952,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 22.52it/s]" + "100%|██████████| 7/7 [00:00<00:00, 23.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:24> End predicting RT/IM/MS2\n" + "2024-07-23 14:22:44> End predicting RT/IM/MS2\n" ] }, { @@ -3154,7 +2983,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -3164,7 +2993,7 @@ "Predict RT for 11 iRT precursors.\n", "Linear regression of `rt_pred` to `irt`:\n", " R_square R slope intercept test_num\n", - "0 0.99007 0.995022 152.235621 -39.23216 11\n" + "0 0.99007 0.995022 152.235639 -39.232164 11\n" ] }, { @@ -3228,13 +3057,13 @@ " 1072.404037\n", " 0.189650\n", " 0.189650\n", - " 254.195923\n", + " 254.195892\n", " 1.253140\n", " 30.0\n", " Lumos\n", " 0\n", " 7\n", - " -10.360729\n", + " -10.360738\n", " \n", " \n", " 1\n", @@ -3258,7 +3087,7 @@ " Lumos\n", " 7\n", " 14\n", - " -10.360729\n", + " -10.360738\n", " \n", " \n", " 2\n", @@ -3276,13 +3105,13 @@ " 1056.409123\n", " 0.289261\n", " 0.289261\n", - " 255.103760\n", + " 255.103699\n", " 1.257373\n", " 30.0\n", " Lumos\n", " 14\n", " 21\n", - " 4.803679\n", + " 4.803681\n", " \n", " \n", " 3\n", @@ -3306,7 +3135,7 @@ " Lumos\n", " 21\n", " 28\n", - " 4.803679\n", + " 4.803681\n", " \n", " \n", " 4\n", @@ -3324,13 +3153,13 @@ " 814.503280\n", " 0.433791\n", " 0.433791\n", - " 256.615234\n", + " 256.615204\n", " 1.260001\n", " 30.0\n", " Lumos\n", " 28\n", " 35\n", - " 26.806266\n", + " 26.806270\n", " \n", " \n", " ...\n", @@ -3372,13 +3201,13 @@ " 775.414662\n", " 0.489545\n", " 0.489545\n", - " 429.360870\n", + " 429.360901\n", " 1.062514\n", " 30.0\n", " Lumos\n", " 3810\n", " 3823\n", - " 35.294021\n", + " 35.294030\n", " \n", " \n", " 396\n", @@ -3396,13 +3225,13 @@ " 517.278867\n", " 0.489545\n", " 0.489545\n", - " 463.231110\n", + " 463.231049\n", " 0.764225\n", " 30.0\n", " Lumos\n", " 3823\n", " 3836\n", - " 35.294021\n", + " 35.294030\n", " \n", " \n", " 397\n", @@ -3426,7 +3255,7 @@ " Lumos\n", " 3836\n", " 3849\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", " 398\n", @@ -3444,13 +3273,13 @@ " 721.376009\n", " 0.377743\n", " 0.377743\n", - " 404.633667\n", + " 404.633698\n", " 1.000659\n", " 30.0\n", " Lumos\n", " 3849\n", " 3862\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", " 399\n", @@ -3468,13 +3297,13 @@ " 481.253098\n", " 0.377743\n", " 0.377743\n", - " 476.655640\n", + " 476.655701\n", " 0.785851\n", " 30.0\n", " Lumos\n", " 3862\n", " 3875\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", "\n", @@ -3509,35 +3338,35 @@ "399 False False ... 
481.253098 \n", "\n", " rt_pred rt_norm_pred ccs_pred mobility_pred nce instrument \\\n", - "0 0.189650 0.189650 254.195923 1.253140 30.0 Lumos \n", + "0 0.189650 0.189650 254.195892 1.253140 30.0 Lumos \n", "1 0.189650 0.189650 337.328583 0.831494 30.0 Lumos \n", - "2 0.289261 0.289261 255.103760 1.257373 30.0 Lumos \n", + "2 0.289261 0.289261 255.103699 1.257373 30.0 Lumos \n", "3 0.289261 0.289261 337.444641 0.831621 30.0 Lumos \n", - "4 0.433791 0.433791 256.615234 1.260001 30.0 Lumos \n", + "4 0.433791 0.433791 256.615204 1.260001 30.0 Lumos \n", ".. ... ... ... ... ... ... \n", - "395 0.489545 0.489545 429.360870 1.062514 30.0 Lumos \n", - "396 0.489545 0.489545 463.231110 0.764225 30.0 Lumos \n", + "395 0.489545 0.489545 429.360901 1.062514 30.0 Lumos \n", + "396 0.489545 0.489545 463.231049 0.764225 30.0 Lumos \n", "397 0.377743 0.377743 289.200989 1.430378 30.0 Lumos \n", - "398 0.377743 0.377743 404.633667 1.000659 30.0 Lumos \n", - "399 0.377743 0.377743 476.655640 0.785851 30.0 Lumos \n", + "398 0.377743 0.377743 404.633698 1.000659 30.0 Lumos \n", + "399 0.377743 0.377743 476.655701 0.785851 30.0 Lumos \n", "\n", " frag_start_idx frag_stop_idx irt_pred \n", - "0 0 7 -10.360729 \n", - "1 7 14 -10.360729 \n", - "2 14 21 4.803679 \n", - "3 21 28 4.803679 \n", - "4 28 35 26.806266 \n", + "0 0 7 -10.360738 \n", + "1 7 14 -10.360738 \n", + "2 14 21 4.803681 \n", + "3 21 28 4.803681 \n", + "4 28 35 26.806270 \n", ".. ... ... ... \n", - "395 3810 3823 35.294021 \n", - "396 3823 3836 35.294021 \n", - "397 3836 3849 18.273781 \n", - "398 3849 3862 18.273781 \n", - "399 3862 3875 18.273781 \n", + "395 3810 3823 35.294030 \n", + "396 3823 3836 35.294030 \n", + "397 3836 3849 18.273780 \n", + "398 3849 3862 18.273780 \n", + "399 3862 3875 18.273780 \n", "\n", "[400 rows x 21 columns]" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -3555,47 +3384,31 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", - "\n", - "speclib.save_hdf(hdf_path)" + "# hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", + "# tsv_path = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\"\n", + "# speclib.save_hdf(hdf_path) # save as hdf speclib" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.51s/it]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Translation finished, it will take several minutes to export the rest precursors to the tsv file...\n" - ] - } - ], + "outputs": [], "source": [ "from peptdeep.spec_lib.translate import translate_to_tsv\n", "speclib.append_protein_name()\n", - "translate_to_tsv(speclib=speclib, \n", - " tsv = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\")" + "# translate_to_tsv(speclib=speclib, tsv = tsv_path) # save as tsv speclib" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 4. matching peptides back to proteins" + "## 4. 
Matching peptides back to proteins" ] }, { @@ -3607,14 +3420,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 2/2 [00:00 float: """ From a2cff7337bc8ac101fb44ab46e1ceaeacdc3ce8a Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 14:40:36 +0200 Subject: [PATCH 09/10] some types --- docs/tutorials/tutorial_immunopeptidomics.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index c7d54dde..eb536a8a 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -1808,7 +1808,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins the peptide sequences could originate from can be infered using prot_infer (demonstrated below). " + "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins of the peptide sequences may originate from can be infered using `alphabase.protein.fasta.annotate_precursor_df()` (demonstrated below)." ] }, { From 7d903815157089597c9d0a8a7fccafb15dbdea95 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 23:29:20 +0200 Subject: [PATCH 10/10] #183 to make tests work --- docs/notebooks.rst | 2 +- nbs_tests/mass_spec/mass_calibration.ipynb | 37 +- nbs_tests/mass_spec/match.ipynb | 13 + nbs_tests/mass_spec/ms_reader.ipynb | 9 + nbs_tests/model/ccs.ipynb | 2 +- nbs_tests/model/featurize.ipynb | 4 +- nbs_tests/model/ms2.ipynb | 2 +- nbs_tests/model/rt.ipynb | 2 +- nbs_tests/pipeline_api.ipynb | 9 + nbs_tests/protein/fasta.ipynb | 349 ++++++++++-------- .../maxquant_frag_reader.ipynb | 71 +++- nbs_tests/spec_lib/library_factory.ipynb | 9 + nbs_tests/spec_lib/predict_lib.ipynb | 9 + nbs_tests/spec_lib/test_translate_tsv.ipynb | 2 +- peptdeep/protein/fasta.py | 2 +- tests/run_tests.sh | 8 +- 16 files changed, 345 insertions(+), 185 deletions(-) diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 03646eb5..701040d6 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -6,10 +6,10 @@ Tutorials and notebooks about how to use AlphaPeptDeep .. 
toctree:: :maxdepth: 1 + tutorials/tutorial_immunopeptidomics nbs/tutorial_models_from_scratch nbs/tutorial_speclib_from_fasta nbs/alphapeptdeep_hdf_to_tsv nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model - nbs/tutorials/tutorial_immunopeptidomics diff --git a/nbs_tests/mass_spec/mass_calibration.ipynb b/nbs_tests/mass_spec/mass_calibration.ipynb index 6ee0cbe6..16140e9f 100644 --- a/nbs_tests/mass_spec/mass_calibration.ipynb +++ b/nbs_tests/mass_spec/mass_calibration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,16 +18,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.mass_spec.mass_calibration import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -112,7 +129,7 @@ "7 0.0 1.0" ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -146,6 +163,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/match.ipynb b/nbs_tests/mass_spec/match.ipynb index 3a029140..cce9bee6 100644 --- a/nbs_tests/mass_spec/match.ipynb +++ b/nbs_tests/mass_spec/match.ipynb @@ -16,6 +16,15 @@ "# Match" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -377,6 +386,10 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/ms_reader.ipynb b/nbs_tests/mass_spec/ms_reader.ipynb index 4ea3bd81..7064c883 100644 --- a/nbs_tests/mass_spec/ms_reader.ipynb +++ b/nbs_tests/mass_spec/ms_reader.ipynb @@ -16,6 +16,15 @@ "# MS Reader" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/nbs_tests/model/ccs.ipynb b/nbs_tests/model/ccs.ipynb index a9f808fa..4e84ebb5 100644 --- a/nbs_tests/model/ccs.ipynb +++ b/nbs_tests/model/ccs.ipynb @@ -327,7 +327,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'charge': [2]*repeat,\n", diff --git a/nbs_tests/model/featurize.ipynb b/nbs_tests/model/featurize.ipynb index 7eafdd2b..8adb958b 100644 --- 
a/nbs_tests/model/featurize.ipynb +++ b/nbs_tests/model/featurize.ipynb @@ -66,10 +66,10 @@ "outputs": [], "source": [ "#| hide\n", - "x = parse_mod_feature(5, ['Acetyl@Protein N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", + "x = parse_mod_feature(5, ['Acetyl@Protein_N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", "assert x.shape == (7, mod_feature_size)\n", "assert np.all(x[1,:]==MOD_TO_FEATURE['Oxidation@M'])\n", - "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein N-term'])\n", + "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein_N-term'])\n", "assert np.all(x[-1,:]==MOD_TO_FEATURE['Phospho@S'])\n", "assert np.all(x[(2,3,4,5),:]==0)" ] diff --git a/nbs_tests/model/ms2.ipynb b/nbs_tests/model/ms2.ipynb index 9fe774e6..c820dfba 100644 --- a/nbs_tests/model/ms2.ipynb +++ b/nbs_tests/model/ms2.ipynb @@ -396,7 +396,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'nce': [20]*repeat,\n", diff --git a/nbs_tests/model/rt.ipynb b/nbs_tests/model/rt.ipynb index 9bf8803e..ed952b40 100644 --- a/nbs_tests/model/rt.ipynb +++ b/nbs_tests/model/rt.ipynb @@ -135,7 +135,7 @@ "def create_test_dataframe_with_identical_rows(nrows = 10):\n", " precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*nrows,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", " 'mod_sites': ['0;4;8']*nrows,\n", " 'nAA': [11]*nrows,\n", " 'rt_norm': [0.6]*nrows\n", diff --git a/nbs_tests/pipeline_api.ipynb b/nbs_tests/pipeline_api.ipynb index 4d81db6f..a5678902 100644 --- a/nbs_tests/pipeline_api.ipynb +++ b/nbs_tests/pipeline_api.ipynb @@ -37,6 +37,15 @@ "The refined models will be saved in the path pointed by \"PEPTDEEP_HOME\" in `peptdeep.settings.global_settings`." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/protein/fasta.ipynb b/nbs_tests/protein/fasta.ipynb index 1256ad16..3c8af3a0 100644 --- a/nbs_tests/protein/fasta.ipynb +++ b/nbs_tests/protein/fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -35,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -197,7 +206,7 @@ "8 False 20 " ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -276,7 +285,7 @@ "1 yy gene FGHIJKLMNOPQR" ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -287,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -469,7 +478,7 @@ "8 False 20 xx " ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -584,7 +593,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -597,7 +606,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -675,7 +684,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -714,7 +723,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -727,7 +736,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -766,7 +775,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -779,7 +788,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -844,7 +853,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -857,7 +866,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -922,7 +931,7 @@ " 2\n", " 
True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -935,7 +944,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -948,7 +957,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -961,7 +970,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1009,36 +1018,36 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M \n", "7 True \n", "8 False Carbamidomethyl@C \n", - "9 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "9 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "10 False Oxidation@M;Carbamidomethyl@C \n", "11 False Carbamidomethyl@C \n", - "12 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "12 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 True Oxidation@M \n", "15 True \n", - "16 True Acetyl@Protein N-term;Oxidation@M \n", - "17 True Acetyl@Protein N-term \n", + "16 True Acetyl@Protein_N-term;Oxidation@M \n", + "17 True Acetyl@Protein_N-term \n", "18 True Oxidation@M \n", "19 True \n", "20 False Oxidation@M;Carbamidomethyl@C \n", "21 False Carbamidomethyl@C \n", - "22 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "23 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "22 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "23 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "24 False Oxidation@M;Carbamidomethyl@C \n", "25 False Oxidation@M;Carbamidomethyl@C \n", "26 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "27 False Carbamidomethyl@C \n", - "28 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "29 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "28 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "29 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... 
\n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1075,7 +1084,7 @@ "31 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1103,14 +1112,14 @@ " else:\n", " assert 'Carbamidomethyl@C' not in mods\n", " # test Acetyl@Protein N-term\n", - " if 'Acetyl@Protein N-term' in mods:\n", + " if 'Acetyl@Protein_N-term' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", " assert '0' in sites\n", " if '0' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", - " assert 'Acetyl@Protein N-term' in mods\n", + " assert 'Acetyl@Protein_N-term' in mods\n", " if not _lib.precursor_df.is_prot_nterm[i]:\n", - " assert 'Acetyl@Protein N-term' not in mods\n", + " assert 'Acetyl@Protein_N-term' not in mods\n", " # test Oxidation@M\n", " if 'Oxidation@M' in mods:\n", " assert 'M' in seq\n", @@ -1133,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1203,7 +1212,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1216,7 +1225,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1346,7 +1355,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -1385,7 +1394,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -1398,7 +1407,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -1437,7 +1446,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -1450,7 +1459,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -1567,7 +1576,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -1580,7 +1589,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -1645,7 +1654,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -1658,7 +1667,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -1671,7 +1680,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -1684,7 +1693,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 
Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1740,8 +1749,8 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M;Phospho@S \n", @@ -1751,15 +1760,15 @@ "10 True Phospho@T \n", "11 True \n", "12 False Carbamidomethyl@C \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 False Oxidation@M;Carbamidomethyl@C \n", "15 False Carbamidomethyl@C \n", - "16 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "17 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "16 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "17 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "18 True Oxidation@M \n", "19 True \n", - "20 True Acetyl@Protein N-term;Oxidation@M \n", - "21 True Acetyl@Protein N-term \n", + "20 True Acetyl@Protein_N-term;Oxidation@M \n", + "21 True Acetyl@Protein_N-term \n", "22 True Oxidation@M;Phospho@S \n", "23 True Oxidation@M;Phospho@T \n", "24 True Oxidation@M \n", @@ -1768,16 +1777,16 @@ "27 True \n", "28 False Oxidation@M;Carbamidomethyl@C \n", "29 False Carbamidomethyl@C \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "32 False Oxidation@M;Carbamidomethyl@C \n", "33 False Oxidation@M;Carbamidomethyl@C \n", "34 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "35 False Carbamidomethyl@C \n", - "36 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "37 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "38 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "39 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "36 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "37 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "38 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... 
\n", + "39 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1822,7 +1831,7 @@ "39 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1836,7 +1845,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1909,7 +1918,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1923,7 +1932,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1965,7 +1974,7 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...\n", + " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t...\n", " 4;0;7;13\n", " 20\n", " xx\n", @@ -1979,7 +1988,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4;7;13\n", " 20\n", " xx\n", @@ -1993,7 +2002,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4;7;13\n", " 20\n", " xx\n", @@ -2007,7 +2016,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4;7;13\n", " 20\n", " xx\n", @@ -2021,7 +2030,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth...\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...\n", " 0;4;7;13\n", " 20\n", " xx\n", @@ -2050,15 +2059,15 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", ".. ... ... \n", - "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t... \n", - "116 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "117 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "118 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "119 False Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth... \n", + "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t... \n", + "116 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "117 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "118 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", + "119 False Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 
\n", "\n", " mod_sites nAA proteins genes labeling_channel \n", "0 1;4 7 xx none \n", @@ -2076,7 +2085,7 @@ "[120 rows x 11 columns]" ] }, - "execution_count": null, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -2085,15 +2094,15 @@ "#| hide\n", "_lib.add_peptide_labeling({\n", " 'none': [], # not labelled for reference\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.precursor_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -2102,7 +2111,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -2229,7 +2238,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;2\n", " 8\n", " 0\n", @@ -2325,7 +2334,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 8\n", " 1\n", @@ -2397,7 +2406,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;3\n", " 9\n", " 0\n", @@ -2421,7 +2430,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 9\n", " 0\n", @@ -2493,7 +2502,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;8;6\n", " 9\n", " 1\n", @@ -2517,7 +2526,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 9\n", " 1\n", @@ -2637,7 +2646,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2661,7 +2670,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2685,7 +2694,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2709,7 +2718,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2791,8 +2800,8 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 402.555023\n", - " 0.994806\n", + " 402.554993\n", + " 0.994805\n", " 30.0\n", " Lumos\n", " 220\n", @@ -2815,7 +2824,7 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 482.206787\n", + " 482.206757\n", " 0.794435\n", " 30.0\n", " Lumos\n", @@ -2829,7 +2838,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2839,7 +2848,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 414.260437\n", + " 414.260406\n", " 1.024166\n", " 30.0\n", " Lumos\n", @@ -2853,7 +2862,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2863,7 +2872,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 470.269653\n", + " 470.269684\n", " 0.775096\n", " 30.0\n", " Lumos\n", @@ -2877,7 +2886,7 @@ " 1\n", " True\n", " True\n", - 
" Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2901,7 +2910,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2911,7 +2920,7 @@ " 0\n", " 0.462864\n", " 0.462864\n", - " 469.226685\n", + " 469.226715\n", " 0.773290\n", " 30.0\n", " Lumos\n", @@ -3162,35 +3171,35 @@ "0 Oxidation@M 2 8 0 \n", "1 8 0 \n", "2 Carbamidomethyl@C 2 8 0 \n", - "3 Acetyl@Protein N-term;Carbamidomethyl@C 0;2 8 0 \n", + "3 Acetyl@Protein_N-term;Carbamidomethyl@C 0;2 8 0 \n", "4 Oxidation@M 6 8 1 \n", "5 8 1 \n", "6 Carbamidomethyl@C 6 8 1 \n", - "7 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 8 1 \n", + "7 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 8 1 \n", "8 Oxidation@M;Carbamidomethyl@C 1;3 9 0 \n", "9 Carbamidomethyl@C 3 9 0 \n", - "10 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", - "11 Acetyl@Protein N-term;Carbamidomethyl@C 0;3 9 0 \n", + "10 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", + "11 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 9 0 \n", "12 Oxidation@M;Carbamidomethyl@C 8;6 9 1 \n", "13 Carbamidomethyl@C 6 9 1 \n", - "14 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", - "15 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 9 1 \n", + "14 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", + "15 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 9 1 \n", "16 Oxidation@M 7 11 0 \n", "17 Oxidation@M 7 11 0 \n", "18 11 0 \n", "19 11 0 \n", - "20 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "21 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "22 Acetyl@Protein N-term 0 11 0 \n", - "23 Acetyl@Protein N-term 0 11 0 \n", + "20 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "21 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "22 Acetyl@Protein_N-term 0 11 0 \n", + "23 Acetyl@Protein_N-term 0 11 0 \n", "24 Oxidation@M 4 11 1 \n", "25 Oxidation@M 4 11 1 \n", "26 11 1 \n", "27 11 1 \n", - "28 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "29 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "30 Acetyl@Protein N-term 0 11 1 \n", - "31 Acetyl@Protein N-term 0 11 1 \n", + "28 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "29 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "30 Acetyl@Protein_N-term 0 11 1 \n", + "31 Acetyl@Protein_N-term 0 11 1 \n", "32 Oxidation@M 6 13 1 \n", "33 Oxidation@M 6 13 1 \n", "34 13 1 \n", @@ -3269,12 +3278,12 @@ "23 468.311920 0.771782 30.0 Lumos 190 200 \n", "24 400.909912 0.990859 30.0 Lumos 200 210 \n", "25 478.989624 0.789230 30.0 Lumos 210 220 \n", - "26 402.555023 0.994806 30.0 Lumos 220 230 \n", - "27 482.206787 0.794435 30.0 Lumos 230 240 \n", - "28 414.260437 1.024166 30.0 Lumos 240 250 \n", - "29 470.269653 0.775096 30.0 Lumos 250 260 \n", + "26 402.554993 0.994805 30.0 Lumos 220 230 \n", + "27 482.206757 0.794435 30.0 Lumos 230 240 \n", + "28 414.260406 1.024166 30.0 Lumos 240 250 \n", + "29 470.269684 0.775096 30.0 Lumos 250 260 \n", "30 417.726074 1.032617 30.0 Lumos 260 270 \n", - "31 469.226685 0.773290 30.0 Lumos 270 280 \n", + "31 469.226715 0.773290 30.0 Lumos 270 280 \n", "32 421.076538 1.041983 30.0 Lumos 280 292 \n", "33 490.627533 0.809400 30.0 Lumos 292 304 \n", "34 423.214233 1.047176 30.0 Lumos 304 316 \n", @@ -3287,7 +3296,7 @@ "[40 rows x 26 columns]" ] }, - "execution_count": null, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -3326,7 +3335,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, 
"outputs": [ { @@ -3381,7 +3390,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term\n", + " Oxidation@M;Dimethyl@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3391,7 +3400,7 @@ " 0\n", " 0.242660\n", " 0.242660\n", - " 345.390839\n", + " 345.390869\n", " 0.850135\n", " 30.0\n", " Lumos\n", @@ -3405,7 +3414,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 0\n", @@ -3429,7 +3438,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3453,7 +3462,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 6;0\n", " 8\n", " 1\n", @@ -3463,7 +3472,7 @@ " 2\n", " 0.040846\n", " 0.040846\n", - " 319.400330\n", + " 319.400391\n", " 0.786163\n", " 30.0\n", " Lumos\n", @@ -3477,7 +3486,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 1\n", @@ -3525,7 +3534,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl@Any N-term;Dimethyl@K\n", + " Dimethyl@Any_N-term;Dimethyl@K\n", " 0;8\n", " 13\n", " 1\n", @@ -3535,8 +3544,8 @@ " 0\n", " 0.620949\n", " 0.620949\n", - " 430.461273\n", - " 1.065108\n", + " 430.461243\n", + " 1.065107\n", " 30.0\n", " Lumos\n", " 692\n", @@ -3549,7 +3558,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3559,7 +3568,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 482.796692\n", + " 482.796661\n", " 0.796481\n", " 30.0\n", " Lumos\n", @@ -3573,7 +3582,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3583,7 +3592,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 428.150757\n", + " 428.150787\n", " 1.059489\n", " 30.0\n", " Lumos\n", @@ -3597,7 +3606,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3621,7 +3630,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3658,17 +3667,17 @@ "79 FGHIKLMNPQRST 0 2 False True \n", "\n", " mods mod_sites nAA decoy \\\n", - "0 Oxidation@M;Dimethyl@Any N-term 2;0 8 0 \n", - "1 Dimethyl:2H(6)13C(2)@Any N-term 0 8 0 \n", - "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 2;0 8 0 \n", - "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 6;0 8 1 \n", - "4 Dimethyl:2H(6)13C(2)@Any N-term 0 8 1 \n", + "0 Oxidation@M;Dimethyl@Any_N-term 2;0 8 0 \n", + "1 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 0 \n", + "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 2;0 8 0 \n", + "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 6;0 8 1 \n", + "4 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 1 \n", ".. ... ... ... ... \n", - "75 Dimethyl@Any N-term;Dimethyl@K 0;8 13 1 \n", - "76 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "77 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "78 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", - "79 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 
0;5 13 0 \n", + "75 Dimethyl@Any_N-term;Dimethyl@K 0;8 13 1 \n", + "76 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "77 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "78 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "79 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", "\n", " charge ... i_5 mono_isotope_idx rt_pred rt_norm_pred \\\n", "0 2 ... 0.001352 0 0.242660 0.242660 \n", @@ -3684,22 +3693,22 @@ "79 3 ... 0.058123 2 0.206957 0.206957 \n", "\n", " ccs_pred mobility_pred nce instrument frag_start_idx frag_stop_idx \n", - "0 345.390839 0.850135 30.0 Lumos 0 7 \n", + "0 345.390869 0.850135 30.0 Lumos 0 7 \n", "1 313.133270 0.770554 30.0 Lumos 7 14 \n", "2 314.302277 0.773615 30.0 Lumos 14 21 \n", - "3 319.400330 0.786163 30.0 Lumos 21 28 \n", + "3 319.400391 0.786163 30.0 Lumos 21 28 \n", "4 320.333069 0.788271 30.0 Lumos 28 35 \n", ".. ... ... ... ... ... ... \n", - "75 430.461273 1.065108 30.0 Lumos 692 704 \n", - "76 482.796692 0.796481 30.0 Lumos 704 716 \n", - "77 428.150757 1.059489 30.0 Lumos 716 728 \n", + "75 430.461243 1.065107 30.0 Lumos 692 704 \n", + "76 482.796661 0.796481 30.0 Lumos 704 716 \n", + "77 428.150787 1.059489 30.0 Lumos 716 728 \n", "78 412.858307 1.021552 30.0 Lumos 728 740 \n", "79 478.660187 0.789583 30.0 Lumos 740 752 \n", "\n", "[80 rows x 27 columns]" ] }, - "execution_count": null, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3707,8 +3716,8 @@ "source": [ "_lib.import_and_process_protein_dict(protein_dict)\n", "_lib.add_peptide_labeling({\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.predict_all()\n", "assert (_lib.precursor_df.decoy==1).any()\n", @@ -3732,6 +3741,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb index 80dda52d..a43bcad3 100644 --- a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb +++ b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -27,16 +27,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.psm_frag_reader.maxquant_frag_reader import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -48,9 +56,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " lambda x: parse_phos_probs(x[0], x[1], prob), axis=1\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.34720501 0.54503546 0.14126802 0.17500845 0.1020231\n", + " 0.04637072 0. 0. 0.01899846 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0.02471942 0.41737406 0.67116171 1. 0.37160414 0.59517672\n", + " 0.54813229 0. 0.0606665 0.03838788 0.03735192]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.04495926 0.0213509 0.02114326 0.01335259 0.\n", + " 0. 0. 0. 0. 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0.51698907 0.87869409 0.14043304 0.1052603\n", + " 0.19786873 0. 0. 0. 0. 0.\n", + " 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0. 0.54449196 0.2230503 0.\n", + " 0.30967216 0. 0. 0. 0. 0.\n", + " 0. 
]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAcetyl@Protein N-term\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m0\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 20\u001b[0m seq \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAAAGPSNSSSGTSTPR\u001b[39m\u001b[38;5;124m'\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], "source": [ "#| hide\n", "mq_str = '''Raw file\tScan number\tScan index\tSequence\tLength\tMissed cleavages\tModifications\tModified sequence\tPhospho (STY) Probabilities\tPhospho (STY) Score Diffs\tAcetyl (Protein N-term)\tPhospho (STY)\tProteins\tGene Names\tProtein Names\tCharge\tFragmentation\tMass analyzer\tType\tScan event number\tIsotope index\tm/z\tMass\tMass Error [ppm]\tSimple Mass Error [ppm]\tRetention time\tPEP\tScore\tDelta score\tScore diff\tLocalization prob\tCombinatorics\tPIF\tFraction of total spectrum\tBase peak fraction\tPrecursor Full ScanNumber\tPrecursor Intensity\tPrecursor Apex Fraction\tPrecursor Apex Offset\tPrecursor Apex Offset Time\tDiagnostic peak Phospho (STY) Y\tMatches\tIntensities\tMass Deviations [Da]\tMass Deviations [ppm]\tMasses\tNumber of Matches\tIntensity coverage\tPeak coverage\tNeutral loss level\tETD identification type\tReverse\tAll scores\tAll sequences\tAll modified sequences\tid\tProtein group IDs\tPeptide ID\tMod. 
peptide ID\tEvidence ID\tPhospho (STY) site IDs\n", @@ -69,7 +114,7 @@ "assert 'frag_stop_idx' in mq_reader.psm_df.columns\n", "assert mq_reader.psm_df.mods.values[0] == ''\n", "assert mq_reader.psm_df.mod_sites.values[0] == ''\n", - "assert mq_reader.psm_df.mods.values[1] == 'Acetyl@Protein N-term'\n", + "assert mq_reader.psm_df.mods.values[1] in ('Acetyl@Protein_N-term', 'Acetyl@Protein N-term')\n", "assert mq_reader.psm_df.mod_sites.values[1] == '0'\n", "seq = 'AAAGPSNSSSGTSTPR'\n", "frag_types = raw_df[raw_df['Sequence']==seq]['Matches'].values[0].split(';')\n", @@ -496,6 +541,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/spec_lib/library_factory.ipynb b/nbs_tests/spec_lib/library_factory.ipynb index 9e96cffe..bcf70221 100644 --- a/nbs_tests/spec_lib/library_factory.ipynb +++ b/nbs_tests/spec_lib/library_factory.ipynb @@ -23,6 +23,15 @@ "Factory classes to predict libraries from different sources (input file format)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/predict_lib.ipynb b/nbs_tests/spec_lib/predict_lib.ipynb index 7fa38264..55faffad 100644 --- a/nbs_tests/spec_lib/predict_lib.ipynb +++ b/nbs_tests/spec_lib/predict_lib.ipynb @@ -33,6 +33,15 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/test_translate_tsv.ipynb b/nbs_tests/spec_lib/test_translate_tsv.ipynb index 514e2c9b..b9658a39 100644 --- a/nbs_tests/spec_lib/test_translate_tsv.ipynb +++ b/nbs_tests/spec_lib/test_translate_tsv.ipynb @@ -138,7 +138,7 @@ "charged_frag_types = ['b_z1','y_z1','y_modloss_z1']\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['ASGHCEWMKYR']*repeat+['ASGHCEWMAAR'],\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", " 'mod_sites': ['0;4;8']*repeat+[''],\n", " 'nAA': 11,\n", " 'NCE': 20,\n", diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index 160a2c72..ecf92fb8 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -21,7 +21,7 @@ def __init__( precursor_charge_max: int = 4, precursor_mz_min: float = 400.0, precursor_mz_max: float = 1800.0, - var_mods: list = ["Acetyl@Protein N-term", "Oxidation@M"], + var_mods: list = ["Acetyl@Protein_N-term", "Oxidation@M"], min_var_mod_num: int = 0, max_var_mod_num: int = 2, fix_mods: list = ["Carbamidomethyl@C"], diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 6edc016c..c383dfc9 100644 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -1,2 +1,6 @@ -INCLUDED_NBS=$(find ../nbs_tests -name "*.ipynb") -python -m pytest --nbmake $(echo $INCLUDED_NBS) +TEST_NBS=$(find ../nbs_tests -name "*.ipynb") +TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb") + +ALL_NBS=$(echo $TEST_NBS$'\n'$TUTORIAL_NBS) + +python -m pytest --nbmake $(echo 
$ALL_NBS)