From 7b7c7fd7c21ea59fed4002f296eece9ac59032b7 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 16 Jul 2024 16:13:15 +0200 Subject: [PATCH 01/10] #183 init the hla tutorial --- docs/nbs/tutorial_HLA_prediction.rst | 5 - docs/notebooks.rst | 1 - .../tutorial_immunopeptidomics.ipynb | 587 ++++++++++++++++++ peptdeep/hla/hla_utils.py | 8 +- 4 files changed, 591 insertions(+), 10 deletions(-) delete mode 100644 docs/nbs/tutorial_HLA_prediction.rst create mode 100644 docs/tutorials/tutorial_immunopeptidomics.ipynb diff --git a/docs/nbs/tutorial_HLA_prediction.rst b/docs/nbs/tutorial_HLA_prediction.rst deleted file mode 100644 index 544073bf..00000000 --- a/docs/nbs/tutorial_HLA_prediction.rst +++ /dev/null @@ -1,5 +0,0 @@ -Tutorial: HLA prediction -========================== - -Check `HLA1_Classifier.ipynb `_ -in `PeptDeep-HLA `_ repo. diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 1ba96c8a..fa607aab 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -9,7 +9,6 @@ Tutorials and notebooks about how to use AlphaPeptDeep nbs/tutorial_models_from_scratch nbs/tutorial_speclib_from_fasta nbs/alphapeptdeep_hdf_to_tsv - nbs/tutorial_HLA_prediction nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb new file mode 100644 index 00000000..d71290eb --- /dev/null +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using peptdeep for MHC class I immunopeptidomics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that pydivsufsort package is not installed by peptdeep by default. Install by:\n", + "```\n", + "pip install \"peptdeep[development,hla]\"\n", + "```\n", + "\n", + "Or install within jupyter notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q pydivsufsort" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unspecific digestion in alphabase" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", + "\n", + "Unspecific digestion in alphabase involves two steps:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Concatenate protein sequences into a single sequence, separated by a sentinel character, e.g., '$'. 
For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def concat_sequences_for_nonspecific_digestion(seq_list, sep=\"$\"):\n", + " return sep + sep.join(seq_list) + sep" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prot_seq_list = [\"MABCDEKFGHIJKLMNOPQRST\",\"FGHIJKLMNOPQR\"]\n", + "cat_prot = concat_sequences_for_nonspecific_digestion(prot_seq_list, sep=\"$\")\n", + "cat_prot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the first and last sentinel characters are crutial as well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific sequences from the concatenated sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos\n", + "0 1 9\n", + "1 1 10\n", + "2 1 11\n", + "3 1 12\n", + "4 1 13\n", + ".. ... ...\n", + "79 13 22\n", + "80 13 23\n", + "81 14 22\n", + "82 14 23\n", + "83 15 23\n", + "\n", + "[84 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from alphabase.protein.lcp_digest import get_substring_indices\n", + "import pandas as pd\n", + "\n", + "start_idxes, stop_idxes = get_substring_indices(\n", + " cat_prot, min_len=8, max_len=14, stop_char=\"$\"\n", + ")\n", + "digest_pos_df = pd.DataFrame({\n", + " \"start_pos\": start_idxes,\n", + " \"stop_pos\": stop_idxes,\n", + "})\n", + "digest_pos_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All unspecific peptides can be localted by the `start_pos` and `stop_pos` in `digest_pos_df`, and all peptides are non-redundant guaranteed by the LCP algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos\n", + "0 1 8\n", + "1 1 9\n", + "2 1 10\n", + "3 1 11\n", + "4 1 12\n", + "... ... ...\n", + "54935 9987 9995\n", + "54936 9987 9996\n", + "54937 9988 9995\n", + "54938 9988 9996\n", + "54939 9989 9996\n", + "\n", + "[54940 rows x 2 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import random\n", + "import string\n", + "random.seed(0)\n", + "cat_seq = '$'+''.join(random.choices(string.ascii_uppercase+'$', k=10000))+'$'\n", + "start_idxes, stop_idxes = get_substring_indices(cat_seq, min_len=7, max_len=14)\n", + "digest_pos_df = pd.DataFrame({\n", + " \"start_pos\": start_idxes,\n", + " \"stop_pos\": stop_idxes,\n", + "})\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " start_pos stop_pos sequence\n", + "0 1 8 WULGNKV\n", + "1 1 9 WULGNKVI\n", + "2 1 10 WULGNKVIM\n", + "3 1 11 WULGNKVIMP\n", + "4 1 12 WULGNKVIMPY\n", + "... ... ... ...\n", + "54935 9987 9995 CESHBWDD\n", + "54936 9987 9996 CESHBWDDX\n", + "54937 9988 9995 ESHBWDD\n", + "54938 9988 9996 ESHBWDDX\n", + "54939 9989 9996 SHBWDDX\n", + "\n", + "[54940 rows x 3 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digest_pos_df[\"sequence\"] = digest_pos_df[\n", + " [\"start_pos\",\"stop_pos\"]\n", + "].apply(lambda x: cat_seq[slice(*x)], axis=1)\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'idxes RAM = 3.25833 Mb, seq RAM = 0.43968, ratio = 7.41063'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"idxes RAM = {RAM_use_seqs:.5f} Mb, seq RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 8 to 14). For a very large protein sequence database, there will be millions of unspecific peptides, so working with strings sometimes is not feasible due to the requirements of extremely large RAM." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transfer learning for HLA class I prediction with `peptideep.hla`" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", + "\n", + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/hla/hla_utils.py b/peptdeep/hla/hla_utils.py index d74d36b0..ae29f31f 100644 --- a/peptdeep/hla/hla_utils.py +++ b/peptdeep/hla/hla_utils.py @@ -95,12 +95,12 @@ def nonspecific_digest_cat_proteins( pd.DataFrame A dataframe sorted by `nAA` with three columns: `start_pos`: the start index of the peptide in cat_protein - `end_pos`: the stop/end index of the peptide in cat_protein + `stop_pos`: the stop/end index of the peptide in cat_protein `nAA`: the number of amino acids (peptide length). 
""" pos_starts, pos_ends = get_substring_indices(cat_sequence, min_len, max_len) - digest_df = pd.DataFrame(dict(start_pos=pos_starts, end_pos=pos_ends)) - digest_df["nAA"] = digest_df.end_pos - digest_df.start_pos + digest_df = pd.DataFrame(dict(start_pos=pos_starts, stop_pos=pos_ends)) + digest_df["nAA"] = digest_df.stop_pos - digest_df.start_pos digest_df.sort_values("nAA", inplace=True) digest_df.reset_index(inplace=True, drop=True) return digest_df @@ -170,7 +170,7 @@ def get_seq_series(idxes_df: pd.DataFrame, cat_prot: str) -> pd.Series: pd.Series pd.Series with sub-sequences (peptide sequences). """ - return idxes_df[["start_pos", "end_pos"]].apply( + return idxes_df[["start_pos", "stop_pos"]].apply( lambda x: cat_prot[slice(*x)], axis=1 ) From ed5cd2a89961d790bc388458a3069616cd122140 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 16 Jul 2024 16:16:18 +0200 Subject: [PATCH 02/10] #183 add immunopeptidomics tutorial in sphix rst --- docs/notebooks.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/notebooks.rst b/docs/notebooks.rst index fa607aab..03646eb5 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -12,3 +12,4 @@ Tutorials and notebooks about how to use AlphaPeptDeep nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model + nbs/tutorials/tutorial_immunopeptidomics From 3c308579b35719cfb05cd87e1e42aff4b3bdde5c Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Thu, 18 Jul 2024 09:09:25 +0200 Subject: [PATCH 03/10] add test --- peptdeep/hla/hla_class1.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py index 3a9f5789..67f5e575 100644 --- a/peptdeep/hla/hla_class1.py +++ b/peptdeep/hla/hla_class1.py @@ -6,7 +6,7 @@ from typing import Union import peptdeep.model.building_block as building_block -from peptdeep.model.model_interface import ModelInterface +from peptdeep.model.model_interface import ModelInterface, append_nAA_column_if_missing from peptdeep.model.featurize import get_ascii_indices from peptdeep.pretrained_models import pretrain_dir, download_models, global_settings @@ -380,6 +380,40 @@ def predict_from_proteins( peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) return peptide_df + def _concat_neg_df(self, precursor_df, column_to_train='HLA'): + precursor_df = append_nAA_column_if_missing(precursor_df) + precursor_df[column_to_train] = 1 + df_list = [precursor_df] + for nAA, group_df in precursor_df.groupby('nAA'): + rnd_seqs = get_random_sequences( + self.protein_df, + n=len(group_df), + pep_len = nAA + ) + df_list.append(pd.DataFrame( + {'sequence':rnd_seqs,'nAA':nAA,column_to_train:0} + )) + return pd.concat(df_list).reset_index(drop=True) + + def test(self, precursor_df): + df = self._concat_neg_df(precursor_df) + self.predict(df) + prob_list = [] + precision_list = [] + recall_list = [] + fp_list = [] + for prob in [0.5,0.6,0.7,0.8, 0.9]: + prob_list.append(prob) + precision_list.append(df[df.HLA_prob_pred>prob].HLA.mean()) + recall_list.append(df[df.HLA_prob_pred>prob].HLA.sum()/len(df)*2) + fp_list.append(1-(1-df[df.HLA_prob_pred Date: Thu, 18 Jul 2024 13:49:52 +0200 Subject: [PATCH 04/10] tutorial --- .../tutorial_immunopeptidomics.ipynb | 1329 +++++++++++++++-- nbs_tests/hla/hla_class1.ipynb | 9 +- 2 files changed, 1171 insertions(+), 167 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 
d71290eb..1ee12f5e 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -11,6 +11,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "This notebook introduces how to generate spectral libraries for immunopeptidomics analysis from a list of protein sequences. This entails several steps:\n", + "\n", + "1. unspecific digestion of protein sequences\n", + "2. selection of peptide sequences used for library prediction by peptdeep-hla predicition\n", + " 2.1 using the pretrained model\n", + " 2.2 using an improved model by including a transfer learning step\n", + "3. spectral library prediction\n", + "4. matching the peptides back to the proteins (this can be done before or after library prediction or seach) \n", + "\n", + "\n", + "\n", "Note that pydivsufsort package is not installed by peptdeep by default. Install by:\n", "```\n", "pip install \"peptdeep[development,hla]\"\n", @@ -21,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -40,28 +51,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Unspecific digestion in alphabase" + "## 1. Unspecific digestion in alphabase" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", + "The unspecific digestion workflow uses the longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", "\n", - "Unspecific digestion in alphabase involves two steps:" + "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indeces of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", + "\n", + "1. concatenation of protein sequences into a single sequence\n", + "2. unspecific digestion\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "1. Concatenate protein sequences into a single sequence, separated by a sentinel character, e.g., '$'. For instance:" + "#### 1.1 Concatenate protein sequences into a single sequence\n", + "\n", + "The protein sequences are concatenated into a single sequence. The sequences are seperated by a sentinel character, in this case '$', so that no peptides across proteins are formed. 
Note that the first and last sentinel characters are crucial as well.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -71,7 +88,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
@@ -80,7 +97,7 @@
      "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'"
     ]
    },
-   "execution_count": 3,
+   "execution_count": 39,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -95,19 +112,136 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Note that the first and last sentinel characters are crutial as well."
+    "The same can be done directly from a FASTA file: \n",
+    "@ Feng do you have an example fasta somewhere? "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
(HTML table rendering omitted; the same data is shown in the text/plain output below)\n",
" + ], + "text/plain": [ + " protein_id full_name \\\n", + "tr|A0A024R161|A0A024R161_HUMAN A0A024R161 tr|A0A024R161|A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN A0A024RAP8 tr|A0A024RAP8|A0A024RAP8_HUMAN \n", + "\n", + " gene_name \\\n", + "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 \n", + "\n", + " description \\\n", + "tr|A0A024R161|A0A024R161_HUMAN tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso... \n", + "\n", + " sequence \\\n", + "tr|A0A024R161|A0A024R161_HUMAN MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC... \n", + "\n", + " nAA \n", + "tr|A0A024R161|A0A024R161_HUMAN 153 \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import load_prot_df\n", + "fasta = load_prot_df(r\"D:\\Software\\FASTA\\Human\\example.fasta\")\n", + "fasta" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import cat_proteins\n", + "cat_fasta = cat_proteins(fasta['sequence'])\n", + "cat_fasta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "2. Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific sequences from the concatenated sequence." + "#### 1.2 Unspecific digestion\n", + "\n", + "Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific peptide sequences from the concatenated protein sequence. The digested peptide sequences are stored in a dataframe based on their start and stop indices in the concantenated protein sequence string. To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 7 to 14, deomnstrated below). For a large protein sequence database, there will be millions of unspecific peptides, so working with strings is not feasible for a complete human fasta due to the requirements of extremely large RAM. (~ 70M unspecific sequences from the reviewed swissprot fasta require ~ 4-5 GB RAM already).\n", + "\n", + "Using the get_substring_indices function we extract the start and stop indices of all peptide sequences between 7 and 14 aa (min_len, max_len) from the concatenated protein sequences. All peptides sequences are unique, guranteed by the LCP algorithm." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -167,53 +301,53 @@ " ...\n", " \n", " \n", - " 79\n", - " 13\n", - " 22\n", + " 2438\n", + " 361\n", + " 370\n", " \n", " \n", - " 80\n", - " 13\n", - " 23\n", + " 2439\n", + " 361\n", + " 371\n", " \n", " \n", - " 81\n", - " 14\n", - " 22\n", + " 2440\n", + " 362\n", + " 370\n", " \n", " \n", - " 82\n", - " 14\n", - " 23\n", + " 2441\n", + " 362\n", + " 371\n", " \n", " \n", - " 83\n", - " 15\n", - " 23\n", + " 2442\n", + " 363\n", + " 371\n", " \n", " \n", "\n", - "

84 rows × 2 columns

\n", + "

2443 rows × 2 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos\n", - "0 1 9\n", - "1 1 10\n", - "2 1 11\n", - "3 1 12\n", - "4 1 13\n", - ".. ... ...\n", - "79 13 22\n", - "80 13 23\n", - "81 14 22\n", - "82 14 23\n", - "83 15 23\n", + " start_pos stop_pos\n", + "0 1 9\n", + "1 1 10\n", + "2 1 11\n", + "3 1 12\n", + "4 1 13\n", + "... ... ...\n", + "2438 361 370\n", + "2439 361 371\n", + "2440 362 370\n", + "2441 362 371\n", + "2442 363 371\n", "\n", - "[84 rows x 2 columns]" + "[2443 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -221,9 +355,10 @@ "source": [ "from alphabase.protein.lcp_digest import get_substring_indices\n", "import pandas as pd\n", + "import sys\n", "\n", "start_idxes, stop_idxes = get_substring_indices(\n", - " cat_prot, min_len=8, max_len=14, stop_char=\"$\"\n", + " cat_fasta, min_len=8, max_len=14, stop_char=\"$\"\n", ")\n", "digest_pos_df = pd.DataFrame({\n", " \"start_pos\": start_idxes,\n", @@ -232,16 +367,25 @@ "digest_pos_df" ] }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "All unspecific peptides can be localted by the `start_pos` and `stop_pos` in `digest_pos_df`, and all peptides are non-redundant guaranteed by the LCP algorithm." + "The unspecific peptide sequences can be localted by the `start_pos` and `stop_pos`." ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -267,117 +411,161 @@ " \n", " start_pos\n", " stop_pos\n", + " sequence\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", - " 8\n", + " 9\n", + " MGAPLLSP\n", " \n", " \n", " 1\n", " 1\n", - " 9\n", + " 10\n", + " MGAPLLSPG\n", " \n", " \n", " 2\n", " 1\n", - " 10\n", + " 11\n", + " MGAPLLSPGW\n", " \n", " \n", " 3\n", " 1\n", - " 11\n", + " 12\n", + " MGAPLLSPGWG\n", " \n", " \n", " 4\n", " 1\n", - " 12\n", + " 13\n", + " MGAPLLSPGWGA\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 54935\n", - " 9987\n", - " 9995\n", + " 2438\n", + " 361\n", + " 370\n", + " NTYICMQRT\n", " \n", " \n", - " 54936\n", - " 9987\n", - " 9996\n", + " 2439\n", + " 361\n", + " 371\n", + " NTYICMQRTV\n", " \n", " \n", - " 54937\n", - " 9988\n", - " 9995\n", + " 2440\n", + " 362\n", + " 370\n", + " TYICMQRT\n", " \n", " \n", - " 54938\n", - " 9988\n", - " 9996\n", + " 2441\n", + " 362\n", + " 371\n", + " TYICMQRTV\n", " \n", " \n", - " 54939\n", - " 9989\n", - " 9996\n", + " 2442\n", + " 363\n", + " 371\n", + " YICMQRTV\n", " \n", " \n", "\n", - "

54940 rows × 2 columns

\n", + "

2443 rows × 3 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos\n", - "0 1 8\n", - "1 1 9\n", - "2 1 10\n", - "3 1 11\n", - "4 1 12\n", - "... ... ...\n", - "54935 9987 9995\n", - "54936 9987 9996\n", - "54937 9988 9995\n", - "54938 9988 9996\n", - "54939 9989 9996\n", + " start_pos stop_pos sequence\n", + "0 1 9 MGAPLLSP\n", + "1 1 10 MGAPLLSPG\n", + "2 1 11 MGAPLLSPGW\n", + "3 1 12 MGAPLLSPGWG\n", + "4 1 13 MGAPLLSPGWGA\n", + "... ... ... ...\n", + "2438 361 370 NTYICMQRT\n", + "2439 361 371 NTYICMQRTV\n", + "2440 362 370 TYICMQRT\n", + "2441 362 371 TYICMQRTV\n", + "2442 363 371 YICMQRTV\n", "\n", - "[54940 rows x 2 columns]" + "[2443 rows x 3 columns]" ] }, - "execution_count": 39, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import random\n", - "import string\n", - "random.seed(0)\n", - "cat_seq = '$'+''.join(random.choices(string.ascii_uppercase+'$', k=10000))+'$'\n", - "start_idxes, stop_idxes = get_substring_indices(cat_seq, min_len=7, max_len=14)\n", - "digest_pos_df = pd.DataFrame({\n", - " \"start_pos\": start_idxes,\n", - " \"stop_pos\": stop_idxes,\n", - "})\n", + "digest_pos_df[\"sequence\"] = digest_pos_df[\n", + " [\"start_pos\",\"stop_pos\"]\n", + "].apply(lambda x: cat_fasta[slice(*x)], axis=1)\n", "digest_pos_df" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"seq RAM = {RAM_use_seqs:.5f} Mb, idxes RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selection of peptide sequences used for library prediction\n", + "The digest_prot_df contains all unspecifically digested peptide sequences between 7 and 14 aa generatable from the concatenated protein sequences. This list is reduced using a HLA1_Binding_Classifier from peptdeep.hla.hla_class1. Two different model architectures are available, an LSTM model (HLA_Class_I_LSTM) and a BERT model (HLA_Class_I_BERT). A pretrained model is only available for the LSTM model architecture.\n", + "The HLA1_Binding_Classifer can be used with a pretrained model, tuned with existing peptide data or trained from scratch. Training of a new model should be considered carefully and will not be covered in this tutorial.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selection of peptide seqeuence candidates without transferlearning\n", + "\n", + "Selection of peptide sequences for library predicition using the pretrained model can be done in a few steps. First, the Classifier model needs to be initialized and the pretrained model is loaded. 
Next, we can use any kind of dataframe containing peptide sequences to predict how likely there are HLA peptides, the only requirement beeing that the column containing the peptides is called 'sequence'.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -404,163 +592,978 @@ " start_pos\n", " stop_pos\n", " sequence\n", + " nAA\n", + " HLA_prob_pred\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", + " 9\n", + " MGAPLLSP\n", " 8\n", - " WULGNKV\n", + " 0.239477\n", " \n", " \n", " 1\n", - " 1\n", - " 9\n", - " WULGNKVI\n", + " 145\n", + " 153\n", + " REPRSCAL\n", + " 8\n", + " 0.061692\n", " \n", " \n", " 2\n", - " 1\n", - " 10\n", - " WULGNKVIM\n", + " 146\n", + " 154\n", + " EPRSCALL\n", + " 8\n", + " 0.137313\n", " \n", " \n", " 3\n", - " 1\n", - " 11\n", - " WULGNKVIMP\n", + " 155\n", + " 163\n", + " MGWIRGRR\n", + " 8\n", + " 0.056462\n", " \n", " \n", " 4\n", - " 1\n", - " 12\n", - " WULGNKVIMPY\n", + " 156\n", + " 164\n", + " GWIRGRRS\n", + " 8\n", + " 0.001298\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 54935\n", - " 9987\n", - " 9995\n", - " CESHBWDD\n", + " 2438\n", + " 112\n", + " 126\n", + " KVSQAAAELQQYCM\n", + " 14\n", + " 0.243115\n", " \n", " \n", - " 54936\n", - " 9987\n", - " 9996\n", - " CESHBWDDX\n", + " 2439\n", + " 317\n", + " 331\n", + " NGSWQWEDGSILSP\n", + " 14\n", + " 0.021114\n", " \n", " \n", - " 54937\n", - " 9988\n", - " 9995\n", - " ESHBWDD\n", + " 2440\n", + " 79\n", + " 93\n", + " DRYRPQPGDEGPGR\n", + " 14\n", + " 0.060635\n", " \n", " \n", - " 54938\n", - " 9988\n", - " 9996\n", - " ESHBWDDX\n", + " 2441\n", + " 113\n", + " 127\n", + " VSQAAAELQQYCMQ\n", + " 14\n", + " 0.355900\n", " \n", " \n", - " 54939\n", - " 9989\n", - " 9996\n", - " SHBWDDX\n", + " 2442\n", + " 190\n", + " 204\n", + " KQRCPVVKSKCREN\n", + " 14\n", + " 0.000362\n", " \n", " \n", "\n", - "

54940 rows × 3 columns

\n", + "

2443 rows × 5 columns

\n", "" ], "text/plain": [ - " start_pos stop_pos sequence\n", - "0 1 8 WULGNKV\n", - "1 1 9 WULGNKVI\n", - "2 1 10 WULGNKVIM\n", - "3 1 11 WULGNKVIMP\n", - "4 1 12 WULGNKVIMPY\n", - "... ... ... ...\n", - "54935 9987 9995 CESHBWDD\n", - "54936 9987 9996 CESHBWDDX\n", - "54937 9988 9995 ESHBWDD\n", - "54938 9988 9996 ESHBWDDX\n", - "54939 9989 9996 SHBWDDX\n", + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "0 1 9 MGAPLLSP 8 0.239477\n", + "1 145 153 REPRSCAL 8 0.061692\n", + "2 146 154 EPRSCALL 8 0.137313\n", + "3 155 163 MGWIRGRR 8 0.056462\n", + "4 156 164 GWIRGRRS 8 0.001298\n", + "... ... ... ... ... ...\n", + "2438 112 126 KVSQAAAELQQYCM 14 0.243115\n", + "2439 317 331 NGSWQWEDGSILSP 14 0.021114\n", + "2440 79 93 DRYRPQPGDEGPGR 14 0.060635\n", + "2441 113 127 VSQAAAELQQYCMQ 14 0.355900\n", + "2442 190 204 KQRCPVVKSKCREN 14 0.000362\n", "\n", - "[54940 rows x 3 columns]" + "[2443 rows x 5 columns]" ] }, - "execution_count": 41, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "digest_pos_df[\"sequence\"] = digest_pos_df[\n", - " [\"start_pos\",\"stop_pos\"]\n", - "].apply(lambda x: cat_seq[slice(*x)], axis=1)\n", - "digest_pos_df" + "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", + "\n", + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()\n", + "manual_prediction = model.predict(digest_pos_df)\n", + "manual_prediction\n" ] }, { - "cell_type": "code", - "execution_count": 42, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" + "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library massively. It is rather recommended to use more conservative cut-offs. " ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_possequencenAAHLA_prob_pred
17168176EMSEFHNY80.793702
24130138KDALLVGV80.817415
31137145VPAGSNPF80.751329
37170178SEFHNYNL80.940019
67181189KSDFSTRW80.895964
..................
231895109QSAEEAFLLVATAY140.969541
2378329343SPNLLTIIEMQKGD140.756001
2382519LLSPGWGAGAAGRR140.733784
2408110124TLKVSQAAAELQQY140.891976
2419620LSPGWGAGAAGRRW140.842583
\n", + "

148 rows × 5 columns

\n", + "
" + ], "text/plain": [ - "'idxes RAM = 3.25833 Mb, seq RAM = 0.43968, ratio = 7.41063'" + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "17 168 176 EMSEFHNY 8 0.793702\n", + "24 130 138 KDALLVGV 8 0.817415\n", + "31 137 145 VPAGSNPF 8 0.751329\n", + "37 170 178 SEFHNYNL 8 0.940019\n", + "67 181 189 KSDFSTRW 8 0.895964\n", + "... ... ... ... ... ...\n", + "2318 95 109 QSAEEAFLLVATAY 14 0.969541\n", + "2378 329 343 SPNLLTIIEMQKGD 14 0.756001\n", + "2382 5 19 LLSPGWGAGAAGRR 14 0.733784\n", + "2408 110 124 TLKVSQAAAELQQY 14 0.891976\n", + "2419 6 20 LSPGWGAGAAGRRW 14 0.842583\n", + "\n", + "[148 rows x 5 columns]" ] }, - "execution_count": 43, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "f\"idxes RAM = {RAM_use_seqs:.5f} Mb, seq RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + "manual_prediction[manual_prediction['HLA_prob_pred'] > 0.7]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To save the RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This will save about 8 times of the RAM for HLA-I peptides (length from 8 to 14). For a very large protein sequence database, there will be millions of unspecific peptides, so working with strings sometimes is not feasible due to the requirements of extremely large RAM." + "As described above, directly using the sequences for classification can be memory intense for large lists of sequences. Thereby, the manual concatenation, unspecific digestion, predicition and filtering is only suggested for small sets of proteins or integration of selected sequences (e.g mutations, nuORFs etc.). This can be circumvented by directly predicting and filtering from a fasta using model.predict_from_proteins(). This executes the concatenation, unspecific digestion, predicition and filtering automatically in batches. Thereby the whole process can be done more efficient and be performed without a specialized computation infrastructure." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 49, "metadata": {}, - "source": [ - "## Transfer learning for HLA class I prediction with `peptideep.hla`" - ] - }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
016817680.793702EMSEFHNY
113013880.817415KDALLVGV
213714580.751329VPAGSNPF
317017880.940019SEFHNYNL
418118980.895964KSDFSTRW
..................
14395109140.969541QSAEEAFLLVATAY
144329343140.756001SPNLLTIIEMQKGD
145519140.733784LLSPGWGAGAAGRR
146110124140.891976TLKVSQAAAELQQY
147620140.842583LSPGWGAGAAGRRW
\n", + "

148 rows × 5 columns

\n", + "" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 168 176 8 0.793702 EMSEFHNY\n", + "1 130 138 8 0.817415 KDALLVGV\n", + "2 137 145 8 0.751329 VPAGSNPF\n", + "3 170 178 8 0.940019 SEFHNYNL\n", + "4 181 189 8 0.895964 KSDFSTRW\n", + ".. ... ... ... ... ...\n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY\n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD\n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR\n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY\n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW\n", + "\n", + "[148 rows x 5 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(fasta, prob_threshold=0.7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selection of peptide seqeuence candidates with transferlearning\n", + "\n", + "To perform transferlearning we need a list of peptide sequences we expect to be present in our sample. These peptides can be retrived from several different sources like DDA or directDIA search results. It is recommended to use at the very least 1000 sequences for transferlearning. The more sequences available the better the transferlearning step works. The model performance can be assessed after transferlearning and should be assessed before predicition. \n", + "\n", + "First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, a protein dataframe is added, in this example the previousely loaded fasta file. The protein dataframe is used by the Classifier internaly to draw negative training data during model training and testing." + ] + }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", - "\n", "model = HLA1_Binding_Classifier()\n", - "model.load_pretrained_hla_model()" + "model.load_pretrained_hla_model()\n", + "model.load_proteins(fasta)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we load the peptide sequences wee use for transferlearning and split it into a training and testing dataset. This step is very important to assess the model performance after transferlearning. Here, we use the digest_pos_df generated above. As these are no immunopeptides, but a list of unspecifically digested proteins, the model performance will not improve, but the pronciples remain the same. \n", + "@ Feng should we include a example file so that the model is actually improved or just use this? " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1954, 489)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seq_df = digest_pos_df.sample(frac=0.2)\n", + "train_seq_df = digest_pos_df.drop(index=test_seq_df.index)\n", + "len(train_seq_df), len(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we train the model using the training sequence dataframe. In this example we use 10 training epochs, in a real experiment more should be used. Good starting points are 40 epochs for a training dataset of around 10000 sequences or 100 epochs for a training dataset of around 1000 sequences. For a real experiment the warmup_epochs can be increased to 10. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-18 10:24:25> Training with fixed sequence length: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=1, lr=2e-05, loss=1.4192258289882116\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=2, lr=4e-05, loss=1.0882413131850106\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=3, lr=6e-05, loss=0.8716121912002563\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=4, lr=8e-05, loss=0.7767811502729144\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=5, lr=0.0001, loss=0.7206867933273315\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=6, lr=0.0001, loss=0.7072907941681998\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013800655092511\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6962822931153434\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6965692894799369\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6948717491967338\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + ] + } + ], + "source": [ + "\n", + "model.train(train_seq_df,\n", + " epoch=10, warmup_epoch=5, \n", + " verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can assess the model performance after transferlearning using the model.test() function on the training and testing data. This can also be done before transferlearning to assess how well the model fits the available data already. The test assesses the precision, recall and fals positive rate of the model at different probability cut offs. As a rule of thumb a false postitve rate above 7% (@FENG adjust in case lower/higher) is not recomendable because the peptide list gets disproportionally larger, leading to lower IDs during the search. In case of a high false postitive rate, the probability cut off at which the peptides are predicted should be increased. " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.5074420.5583420.541965
10.60.5161290.0163770.015353
20.7NaN0.0000000.000000
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.507442 0.558342 0.541965\n", + "1 0.6 0.516129 0.016377 0.015353\n", + "2 0.7 NaN 0.000000 0.000000\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(train_seq_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.478070.4458080.486708
10.60.625000.0204500.012270
20.71.000000.0020450.000000
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.47807 0.445808 0.486708\n", + "1 0.6 0.62500 0.020450 0.012270\n", + "2 0.7 1.00000 0.002045 0.000000\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After transferlearning and testing the new model, peptides can be predicted as with the pretrained model. " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 1.24it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
0268012680980.715877SEFHNYNL
\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 26801 26809 8 0.715877 SEFHNYNL" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(digest_pos_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spectral library prediciton" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -579,7 +1582,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/nbs_tests/hla/hla_class1.ipynb b/nbs_tests/hla/hla_class1.ipynb index d0fa0eb3..f4bcd7ae 100644 --- a/nbs_tests/hla/hla_class1.ipynb +++ b/nbs_tests/hla/hla_class1.ipynb @@ -33,10 +33,11 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + "2024-07-02 17:16:12> Downloading https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/hla_model.zip ...\n", + "2024-07-02 17:16:14> The pretrained models had been downloaded in C:\\Users\\wahle/peptdeep\\pretrained_models\\hla_model.zip\n" ] } ], @@ -78,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 14.32it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 7.46it/s]\n" ] }, { @@ -321,7 +322,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.8" } }, "nbformat": 4, From 3286f7bf411544173a0a1956d9cb97d4c8dcae43 Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Fri, 19 Jul 2024 14:46:22 +0200 Subject: [PATCH 05/10] Finished library prediction --- .../tutorial_immunopeptidomics.ipynb | 2081 ++++++++++++++++- 1 file changed, 1981 insertions(+), 100 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 1ee12f5e..f8f27618 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -41,6 +41,14 @@ "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n", + "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n" + ] } ], "source": [ @@ -60,7 +68,7 @@ "source": [ "The unspecific digestion workflow uses the longest common prefix (LCP) algorithm, which is based on suffix array data structure, has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. 
Here we used `pydivsufsort`, a Python wrapper of a high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", "\n", - "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indeces of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", + "This means, the digestion is performed on a single sequence of strings and retrives both the peptide sequence as well as the start and stop indices of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", "\n", "1. concatenation of protein sequences into a single sequence\n", "2. unspecific digestion\n", @@ -78,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -97,7 +105,7 @@ "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" ] }, - "execution_count": 39, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -145,6 +153,7 @@ " protein_id\n", " full_name\n", " gene_name\n", + " gene_org\n", " description\n", " sequence\n", " nAA\n", @@ -156,6 +165,7 @@ " A0A024R161\n", " tr|A0A024R161|A0A024R161_HUMAN\n", " DNAJC25-GNG10\n", + " A0A024R161_HUMAN\n", " tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot...\n", " MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...\n", " 153\n", @@ -165,6 +175,7 @@ " A0A024RAP8\n", " tr|A0A024RAP8|A0A024RAP8_HUMAN\n", " KLRC4-KLRK1\n", + " A0A024RAP8_HUMAN\n", " tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso...\n", " MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC...\n", " 216\n", @@ -178,9 +189,9 @@ "tr|A0A024R161|A0A024R161_HUMAN A0A024R161 tr|A0A024R161|A0A024R161_HUMAN \n", "tr|A0A024RAP8|A0A024RAP8_HUMAN A0A024RAP8 tr|A0A024RAP8|A0A024RAP8_HUMAN \n", "\n", - " gene_name \\\n", - "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 \n", - "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 \n", + " gene_name gene_org \\\n", + "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 A0A024RAP8_HUMAN \n", "\n", " description \\\n", "tr|A0A024R161|A0A024R161_HUMAN tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot... 
\n", @@ -202,13 +213,14 @@ ], "source": [ "from peptdeep.hla.hla_utils import load_prot_df\n", - "fasta = load_prot_df(r\"D:\\Software\\FASTA\\Human\\example.fasta\")\n", + "fasta_path = \"D:/Software/FASTA/Human/example.fasta\"\n", + "fasta = load_prot_df(fasta_path)\n", "fasta" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -217,7 +229,7 @@ "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" ] }, - "execution_count": 41, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -241,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -347,7 +359,7 @@ "[2443 rows x 2 columns]" ] }, - "execution_count": 42, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -385,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -503,7 +515,7 @@ "[2443 rows x 3 columns]" ] }, - "execution_count": 44, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -526,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -535,7 +547,7 @@ "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" ] }, - "execution_count": 46, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -565,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -707,7 +719,7 @@ "[2443 rows x 5 columns]" ] }, - "execution_count": 47, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -725,12 +737,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library massively. It is rather recommended to use more conservative cut-offs. " + "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in a immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library. It is rather recommended to use more conservative cut-offs. 
" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -872,7 +884,7 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 48, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -890,21 +902,14 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/1 [00:00 Training with fixed sequence length: 0\n" + "2024-07-19 14:16:34> Training with fixed sequence length: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1138,14 +1144,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.4192258289882116\n" + "[Training] Epoch=1, lr=2e-05, loss=1.403803927557809\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1153,14 +1159,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=2, lr=4e-05, loss=1.0882413131850106\n" + "[Training] Epoch=2, lr=4e-05, loss=1.0939611451966422\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. 
Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1168,14 +1174,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8716121912002563\n" + "[Training] Epoch=3, lr=6e-05, loss=0.8742348296301705\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1183,14 +1189,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7767811502729144\n" + "[Training] Epoch=4, lr=8e-05, loss=0.7860026274408612\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1198,14 +1204,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7206867933273315\n" + "[Training] Epoch=5, lr=0.0001, loss=0.7296201757022313\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1213,14 +1219,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.7072907941681998\n" + "[Training] Epoch=6, lr=0.0001, loss=0.7098635860851833\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1228,14 +1234,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013800655092511\n" + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7049905742917743\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. 
During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1243,14 +1249,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6962822931153434\n" + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6990227273532322\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] }, @@ -1258,15 +1264,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6965692894799369\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6948717491967338\n" + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6956126008714948\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6955537881170001\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\wahle\\Anaconda3\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:149: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", + "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. 
During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" ] } @@ -1287,7 +1293,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1321,16 +1327,16 @@ " \n", " 0\n", " 0.5\n", - " 0.507442\n", - " 0.558342\n", - " 0.541965\n", + " 0.504579\n", + " 0.563971\n", + " 0.553736\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.516129\n", - " 0.016377\n", - " 0.015353\n", + " 0.488889\n", + " 0.011259\n", + " 0.011771\n", " \n", " \n", " 2\n", @@ -1359,14 +1365,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.507442 0.558342 0.541965\n", - "1 0.6 0.516129 0.016377 0.015353\n", + "0 0.5 0.504579 0.563971 0.553736\n", + "1 0.6 0.488889 0.011259 0.011771\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 53, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1377,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1411,22 +1417,22 @@ " \n", " 0\n", " 0.5\n", - " 0.47807\n", - " 0.445808\n", - " 0.486708\n", + " 0.484288\n", + " 0.535787\n", + " 0.570552\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.62500\n", - " 0.020450\n", - " 0.012270\n", + " 0.285714\n", + " 0.004090\n", + " 0.010225\n", " \n", " \n", " 2\n", " 0.7\n", - " 1.00000\n", - " 0.002045\n", + " NaN\n", + " 0.000000\n", " 0.000000\n", " \n", " \n", @@ -1449,14 +1455,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.47807 0.445808 0.486708\n", - "1 0.6 0.62500 0.020450 0.012270\n", - "2 0.7 1.00000 0.002045 0.000000\n", + "0 0.5 0.484288 0.535787 0.570552\n", + "1 0.6 0.285714 0.004090 0.010225\n", + "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 54, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1474,14 +1480,14 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.24it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n" ] }, { @@ -1515,55 +1521,1930 @@ " \n", " \n", " 0\n", - " 26801\n", - " 26809\n", + " 170\n", + " 178\n", " 8\n", - " 0.715877\n", + " 0.673504\n", " SEFHNYNL\n", " \n", + " \n", + " 1\n", + " 181\n", + " 189\n", + " 8\n", + " 0.617312\n", + " KSDFSTRW\n", + " \n", + " \n", + " 2\n", + " 309\n", + " 317\n", + " 8\n", + " 0.615951\n", + " MGLVHIPT\n", + " \n", + " \n", + " 3\n", + " 299\n", + " 307\n", + " 8\n", + " 0.601598\n", + " LLKLVKSY\n", + " \n", + " \n", + " 4\n", + " 346\n", + " 354\n", + " 8\n", + " 0.609713\n", + " YASSFKGY\n", + " \n", + " \n", + " 5\n", + " 344\n", + " 352\n", + " 8\n", + " 0.635873\n", + " ALYASSFK\n", + " \n", + " \n", + " 6\n", + " 294\n", + " 303\n", + " 9\n", + " 0.600454\n", + " KEDQDLLKL\n", + " \n", + " \n", + " 7\n", + " 298\n", + " 307\n", + " 9\n", + " 0.628539\n", + " DLLKLVKSY\n", + " \n", + " \n", + " 8\n", + " 74\n", + " 83\n", + " 9\n", + " 0.602105\n", + " RRYHPDRYR\n", + " \n", + " \n", + " 9\n", + " 344\n", + " 354\n", + " 10\n", + " 0.625569\n", + " 
ALYASSFKGY\n", + " \n", + " \n", + " 10\n", + " 232\n", + " 242\n", + " 10\n", + " 0.607737\n", + " FLNSLFNQEV\n", + " \n", + " \n", + " 11\n", + " 353\n", + " 363\n", + " 10\n", + " 0.610844\n", + " YIENCSTPNT\n", + " \n", + " \n", + " 12\n", + " 53\n", + " 63\n", + " 10\n", + " 0.608182\n", + " VLGVSRSAGK\n", + " \n", + " \n", + " 13\n", + " 298\n", + " 309\n", + " 11\n", + " 0.608567\n", + " DLLKLVKSYHW\n", + " \n", + " \n", + " 14\n", + " 351\n", + " 362\n", + " 11\n", + " 0.607036\n", + " KGYIENCSTPN\n", + " \n", + " \n", + " 15\n", + " 52\n", + " 63\n", + " 11\n", + " 0.635592\n", + " EVLGVSRSAGK\n", + " \n", + " \n", + " 16\n", + " 130\n", + " 142\n", + " 12\n", + " 0.601588\n", + " KDALLVGVPAGS\n", + " \n", + " \n", + " 17\n", + " 351\n", + " 363\n", + " 12\n", + " 0.632752\n", + " KGYIENCSTPNT\n", + " \n", + " \n", + " 18\n", + " 86\n", + " 99\n", + " 13\n", + " 0.608231\n", + " GDEGPGRTPQSAE\n", + " \n", + " \n", + " 19\n", + " 141\n", + " 154\n", + " 13\n", + " 0.603257\n", + " SNPFREPRSCALL\n", + " \n", + " \n", + " 20\n", + " 32\n", + " 45\n", + " 13\n", + " 0.608740\n", + " LVRPAGALVEGLY\n", + " \n", + " \n", + " 21\n", + " 130\n", + " 143\n", + " 13\n", + " 0.620658\n", + " KDALLVGVPAGSN\n", + " \n", + " \n", + " 22\n", + " 185\n", + " 199\n", + " 14\n", + " 0.625906\n", + " STRWQKQRCPVVKS\n", + " \n", + " \n", + " 23\n", + " 60\n", + " 74\n", + " 14\n", + " 0.684522\n", + " AGKAEIARAYRQLA\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 26801 26809 8 0.715877 SEFHNYNL" + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 170 178 8 0.673504 SEFHNYNL\n", + "1 181 189 8 0.617312 KSDFSTRW\n", + "2 309 317 8 0.615951 MGLVHIPT\n", + "3 299 307 8 0.601598 LLKLVKSY\n", + "4 346 354 8 0.609713 YASSFKGY\n", + "5 344 352 8 0.635873 ALYASSFK\n", + "6 294 303 9 0.600454 KEDQDLLKL\n", + "7 298 307 9 0.628539 DLLKLVKSY\n", + "8 74 83 9 0.602105 RRYHPDRYR\n", + "9 344 354 10 0.625569 ALYASSFKGY\n", + "10 232 242 10 0.607737 FLNSLFNQEV\n", + "11 353 363 10 0.610844 YIENCSTPNT\n", + "12 53 63 10 0.608182 VLGVSRSAGK\n", + "13 298 309 11 0.608567 DLLKLVKSYHW\n", + "14 351 362 11 0.607036 KGYIENCSTPN\n", + "15 52 63 11 0.635592 EVLGVSRSAGK\n", + "16 130 142 12 0.601588 KDALLVGVPAGS\n", + "17 351 363 12 0.632752 KGYIENCSTPNT\n", + "18 86 99 13 0.608231 GDEGPGRTPQSAE\n", + "19 141 154 13 0.603257 SNPFREPRSCALL\n", + "20 32 45 13 0.608740 LVRPAGALVEGLY\n", + "21 130 143 13 0.620658 KDALLVGVPAGSN\n", + "22 185 199 14 0.625906 STRWQKQRCPVVKS\n", + "23 60 74 14 0.684522 AGKAEIARAYRQLA" ] }, - "execution_count": 55, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.predict_from_proteins(digest_pos_df)" + "model.predict_from_proteins(fasta, prob_threshold=0.6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spectral library prediciton" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the spectral library for the filtered peptide list can be predicted using PredictSpecLibFasta. First, one needs to select the models for rt/ccs/ms2 prediction using the ModelManager. One can select from a set of pretrained models or load externally trained models. Here we load the 'HLA' model (at the moment this still loads the generic model, but in the futer this is supposed to be replaced by an HLA specfic internal model). 
" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from peptdeep.spec_lib.predict_lib import ModelManager\n", + "from peptdeep.protein.fasta import PredictSpecLibFasta\n", + "\n", + "model_mgr = ModelManager()\n", + "model_mgr.load_installed_models(model_type='HLA')" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectral library prediciton" + "In the next step, the PredictSpecLibFasta is initialized using the preloaded model. The presettings here are selected for the prediction of tryptic libraries so some parameters need to be adjusted, in particular precursor_charge_min, precursor_charge_max. By default Carbamidomethylation is set as a fixed modification (fix_mod) and Acetylation and Oxidation are set as variable modifications (var_mod). Those can be removed by adding an empty list as shown for the variable modifications. \n", + "\n", + "Of note, PredictSpecLibFasta can also be used to predict a library from a fasta file. Therfore one can also set the protease (default trypsin) and the minimum and maximum peptide length (7 to 35). Wee dont need to change those parameters here, as we wont make use of the digestion functions but rather provide a already digested sequence table. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "speclib = PredictSpecLibFasta(model_manager=model_mgr,\n", + " precursor_charge_min=1,\n", + " precursor_charge_max=3,\n", + " fix_mods=[])" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins the peptide sequences could originate from can be infered using prot_infer (demonstrated below). " + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_cterm
016817680.793702EMSEFHNY00000FalseFalse
113013880.817415KDALLVGV11111FalseFalse
213714580.751329VPAGSNPF22222FalseFalse
317017880.940019SEFHNYNL33333FalseFalse
418118980.895964KSDFSTRW44444FalseFalse
.......................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalse
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalse
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalse
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalse
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalse
\n", + "

148 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + "[148 rows x 12 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequences['protein_id'] = [str(i) for i in range(len(sequences))]\n", + "sequences['protein_idxes'] = sequences.protein_id.astype(\"U\")\n", + "sequences['full_name'] = sequences['protein_id'] \n", + "sequences['gene_org'] = sequences['protein_id'] \n", + "sequences['gene_name'] = sequences['protein_id']\n", + "sequences[\"is_prot_nterm\"] = False\n", + "sequences[\"is_prot_cterm\"] = False\n", + "sequences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sequence dataframe contains all the relevant information to be passed to the protein_df and the precursor_df." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idnAAfull_namegene_orggene_name
0EMSEFHNY08000
1KDALLVGV18111
2VPAGSNPF28222
3SEFHNYNL38333
4KSDFSTRW48444
.....................
143QSAEEAFLLVATAY14314143143143
144SPNLLTIIEMQKGD14414144144144
145LLSPGWGAGAAGRR14514145145145
146TLKVSQAAAELQQY14614146146146
147LSPGWGAGAAGRRW14714147147147
\n", + "

148 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_id nAA full_name gene_org gene_name\n", + "0 EMSEFHNY 0 8 0 0 0\n", + "1 KDALLVGV 1 8 1 1 1\n", + "2 VPAGSNPF 2 8 2 2 2\n", + "3 SEFHNYNL 3 8 3 3 3\n", + "4 KSDFSTRW 4 8 4 4 4\n", + ".. ... ... ... ... ... ...\n", + "143 QSAEEAFLLVATAY 143 14 143 143 143\n", + "144 SPNLLTIIEMQKGD 144 14 144 144 144\n", + "145 LLSPGWGAGAAGRR 145 14 145 145 145\n", + "146 TLKVSQAAAELQQY 146 14 146 146 146\n", + "147 LSPGWGAGAAGRRW 147 14 147 147 147\n", + "\n", + "[148 rows x 6 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.protein_df = sequences[[\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']].copy()\n", + "speclib.protein_df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df = sequences[[\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']].copy()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, the modifications and charges can be added to the peptide dataframe using add_modifications and add_charge. This creates a unique entry for every combination of charge and modification for all the sequences in the precursor dataframe. " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sitescharge
0EMSEFHNY016817680.793702FalseFalseOxidation@M21
1EMSEFHNY016817680.793702FalseFalseOxidation@M22
2EMSEFHNY016817680.793702FalseFalseOxidation@M23
3EMSEFHNY016817680.793702FalseFalse1
4EMSEFHNY016817680.793702FalseFalse2
....................................
493TLKVSQAAAELQQY146110124140.891976FalseFalse2
494TLKVSQAAAELQQY146110124140.891976FalseFalse3
495LSPGWGAGAAGRRW147620140.842583FalseFalse1
496LSPGWGAGAAGRRW147620140.842583FalseFalse2
497LSPGWGAGAAGRRW147620140.842583FalseFalse3
\n", + "

498 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 EMSEFHNY 0 168 176 8 0.793702 \n", + ".. ... ... ... ... ... ... \n", + "493 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "494 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "495 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "496 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "497 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites charge \n", + "0 False False Oxidation@M 2 1 \n", + "1 False False Oxidation@M 2 2 \n", + "2 False False Oxidation@M 2 3 \n", + "3 False False 1 \n", + "4 False False 2 \n", + ".. ... ... ... ... ... \n", + "493 False False 2 \n", + "494 False False 3 \n", + "495 False False 1 \n", + "496 False False 2 \n", + "497 False False 3 \n", + "\n", + "[498 rows x 11 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.add_modifications()\n", + "speclib.add_charge()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now ccs, rt and ms2 can be predicted for each entry" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-19 14:19:49> Predicting RT ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 65.96it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting mobility ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 70.12it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:49> Predicting MS2 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 23.54it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-19 14:19:50> End predicting RT/IM/MS2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "speclib.predict_all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "iRTs can be added using translate_rt_to_irt_pred. This is not neccessary for search engines like DIA-NN or AlphaDIA but required for Spectronaut." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predict RT for 11 iRT precursors.\n", + "Linear regression of `rt_pred` to `irt`:\n", + " R_square R slope intercept test_num\n", + "0 0.99007 0.995022 152.235621 -39.23216 11\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sites...precursor_mzrt_predrt_norm_predccs_predmobility_prednceinstrumentfrag_start_idxfrag_stop_idxirt_pred
0EMSEFHNY016817680.793702FalseFalseOxidation@M2...1072.4040370.1896500.189650254.1959231.25314030.0Lumos07-10.360729
1EMSEFHNY016817680.793702FalseFalseOxidation@M2...536.7056570.1896500.189650337.3285830.83149430.0Lumos714-10.360729
2EMSEFHNY016817680.793702FalseFalse...1056.4091230.2892610.289261255.1037601.25737330.0Lumos14214.803679
3EMSEFHNY016817680.793702FalseFalse...528.7082000.2892610.289261337.4446410.83162130.0Lumos21284.803679
4KDALLVGV113013880.817415FalseFalse...814.5032800.4337910.433791256.6152341.26000130.0Lumos283526.806266
..................................................................
395TLKVSQAAAELQQY146110124140.891976FalseFalse...775.4146620.4895450.489545429.3608701.06251430.0Lumos3810382335.294021
396TLKVSQAAAELQQY146110124140.891976FalseFalse...517.2788670.4895450.489545463.2311100.76422530.0Lumos3823383635.294021
397LSPGWGAGAAGRRW147620140.842583FalseFalse...1441.7447420.3777430.377743289.2009891.43037830.0Lumos3836384918.273781
398LSPGWGAGAAGRRW147620140.842583FalseFalse...721.3760090.3777430.377743404.6336671.00065930.0Lumos3849386218.273781
399LSPGWGAGAAGRRW147620140.842583FalseFalse...481.2530980.3777430.377743476.6556400.78585130.0Lumos3862387518.273781
\n", + "

400 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 KDALLVGV 1 130 138 8 0.817415 \n", + ".. ... ... ... ... ... ... \n", + "395 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "396 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "397 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "398 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "399 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites ... precursor_mz \\\n", + "0 False False Oxidation@M 2 ... 1072.404037 \n", + "1 False False Oxidation@M 2 ... 536.705657 \n", + "2 False False ... 1056.409123 \n", + "3 False False ... 528.708200 \n", + "4 False False ... 814.503280 \n", + ".. ... ... ... ... ... ... \n", + "395 False False ... 775.414662 \n", + "396 False False ... 517.278867 \n", + "397 False False ... 1441.744742 \n", + "398 False False ... 721.376009 \n", + "399 False False ... 481.253098 \n", + "\n", + " rt_pred rt_norm_pred ccs_pred mobility_pred nce instrument \\\n", + "0 0.189650 0.189650 254.195923 1.253140 30.0 Lumos \n", + "1 0.189650 0.189650 337.328583 0.831494 30.0 Lumos \n", + "2 0.289261 0.289261 255.103760 1.257373 30.0 Lumos \n", + "3 0.289261 0.289261 337.444641 0.831621 30.0 Lumos \n", + "4 0.433791 0.433791 256.615234 1.260001 30.0 Lumos \n", + ".. ... ... ... ... ... ... \n", + "395 0.489545 0.489545 429.360870 1.062514 30.0 Lumos \n", + "396 0.489545 0.489545 463.231110 0.764225 30.0 Lumos \n", + "397 0.377743 0.377743 289.200989 1.430378 30.0 Lumos \n", + "398 0.377743 0.377743 404.633667 1.000659 30.0 Lumos \n", + "399 0.377743 0.377743 476.655640 0.785851 30.0 Lumos \n", + "\n", + " frag_start_idx frag_stop_idx irt_pred \n", + "0 0 7 -10.360729 \n", + "1 7 14 -10.360729 \n", + "2 14 21 4.803679 \n", + "3 21 28 4.803679 \n", + "4 28 35 26.806266 \n", + ".. ... ... ... \n", + "395 3810 3823 35.294021 \n", + "396 3823 3836 35.294021 \n", + "397 3836 3849 18.273781 \n", + "398 3849 3862 18.273781 \n", + "399 3862 3875 18.273781 \n", + "\n", + "[400 rows x 21 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.translate_rt_to_irt_pred()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the predicted library can be exported in an hdf format (AlphaDIA) or translated to a tsv. The tsv translation can be very time consuming. Before the spectral library can be translated, the gene and protein column need to be mapped from the protein_df into the precursor_df. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", + "\n", + "speclib.save_hdf(hdf_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.50s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translation finished, it will take several minutes to export the rest precursors to the tsv file...\n" + ] + } + ], + "source": [ + "from peptdeep.spec_lib.translate import translate_to_tsv\n", + "speclib.append_protein_name()\n", + "translate_to_tsv(speclib=speclib, \n", + " tsv = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. matching peptides back to proteins" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The peptide sequnces can be matched back to proteins using ProteinInfer, requiring a 'sequence' column. ProteinInfer can be provided with any number of fasta files and lists all the proteins the peptide sequence appears in. This can be done with the sequence output of any search engine or before the library is generated. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from alphabase.??? import ProteinInfer\n", + "infer = ProteinInfer(fasta_path)\n", + "infer_df = infer.infer_peptides(sequences.sequence.values)\n", + "infer_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequences_infered = pd.merge(sequences, infer_df, how = 'left', on ='sequence')" + ] } ], "metadata": { @@ -1582,7 +3463,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.12" } }, "nbformat": 4, From 503ca255a5489baabad86d1e97c5b655a1d5cd1f Mon Sep 17 00:00:00 2001 From: Maria Wahle Date: Mon, 22 Jul 2024 09:23:37 +0200 Subject: [PATCH 06/10] include protein annotation --- .../tutorial_immunopeptidomics.ipynb | 822 ++++++++++++++---- 1 file changed, 648 insertions(+), 174 deletions(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index f8f27618..58be219d 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -126,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -206,7 +206,7 @@ "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " ] }, - "execution_count": 40, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -909,7 +909,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.23it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" ] }, { @@ -1129,7 +1129,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:16:34> Training with fixed sequence length: 0\n" + "2024-07-22 09:21:38> Training with fixed sequence length: 0\n" ] }, { @@ -1144,7 +1144,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.403803927557809\n" + "[Training] Epoch=1, lr=2e-05, loss=1.415909733091082\n" ] }, { @@ -1159,7 +1159,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=2, 
lr=4e-05, loss=1.0939611451966422\n" + "[Training] Epoch=2, lr=4e-05, loss=1.0947138496807642\n" ] }, { @@ -1174,7 +1174,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8742348296301705\n" + "[Training] Epoch=3, lr=6e-05, loss=0.8823633790016174\n" ] }, { @@ -1189,7 +1189,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7860026274408612\n" + "[Training] Epoch=4, lr=8e-05, loss=0.7819523641041347\n" ] }, { @@ -1204,7 +1204,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7296201757022313\n" + "[Training] Epoch=5, lr=0.0001, loss=0.7255220583506993\n" ] }, { @@ -1219,7 +1219,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.7098635860851833\n" + "[Training] Epoch=6, lr=0.0001, loss=0.705090846334185\n" ] }, { @@ -1234,7 +1234,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7049905742917743\n" + "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013667055538723\n" ] }, { @@ -1249,7 +1249,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6990227273532322\n" + "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6968921593257359\n" ] }, { @@ -1264,8 +1264,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6956126008714948\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6955537881170001\n" + "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6968518495559692\n", + "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6932548114231655\n" ] }, { @@ -1327,15 +1327,15 @@ " \n", " 0\n", " 0.5\n", - " 0.504579\n", - " 0.563971\n", - " 0.553736\n", + " 0.496400\n", + " 0.599795\n", + " 0.608495\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.488889\n", - " 0.011259\n", + " 0.622951\n", + " 0.019447\n", " 0.011771\n", " \n", " \n", @@ -1365,8 +1365,8 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.504579 0.563971 0.553736\n", - "1 0.6 0.488889 0.011259 0.011771\n", + "0 0.5 0.496400 0.599795 0.608495\n", + "1 0.6 0.622951 0.019447 0.011771\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" @@ -1417,16 +1417,16 @@ " \n", " 0\n", " 0.5\n", - " 0.484288\n", + " 0.480159\n", + " 0.494888\n", " 0.535787\n", - " 0.570552\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.285714\n", - " 0.004090\n", - " 0.010225\n", + " 0.461538\n", + " 0.012270\n", + " 0.014315\n", " \n", " \n", " 2\n", @@ -1455,8 +1455,8 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.484288 0.535787 0.570552\n", - "1 0.6 0.285714 0.004090 0.010225\n", + "0 0.5 0.480159 0.494888 0.535787\n", + "1 0.6 0.461538 0.012270 0.014315\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" @@ -1480,14 +1480,14 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" ] }, { @@ -1521,195 +1521,355 @@ " \n", " \n", " 0\n", - " 170\n", - " 178\n", + " 143\n", + " 151\n", " 8\n", - " 0.673504\n", - " SEFHNYNL\n", + " 0.606630\n", + " PFREPRSC\n", " \n", " \n", " 1\n", - " 181\n", - " 189\n", + 
" 170\n", + " 178\n", " 8\n", - " 0.617312\n", - " KSDFSTRW\n", + " 0.697908\n", + " SEFHNYNL\n", " \n", " \n", " 2\n", - " 309\n", - " 317\n", + " 62\n", + " 70\n", " 8\n", - " 0.615951\n", - " MGLVHIPT\n", + " 0.602259\n", + " KAEIARAY\n", " \n", " \n", " 3\n", + " 87\n", + " 95\n", + " 8\n", + " 0.611214\n", + " DEGPGRTP\n", + " \n", + " \n", + " 4\n", " 299\n", " 307\n", " 8\n", - " 0.601598\n", + " 0.611188\n", " LLKLVKSY\n", " \n", " \n", - " 4\n", + " 5\n", " 346\n", " 354\n", " 8\n", - " 0.609713\n", + " 0.620160\n", " YASSFKGY\n", " \n", " \n", - " 5\n", + " 6\n", " 344\n", " 352\n", " 8\n", - " 0.635873\n", + " 0.601700\n", " ALYASSFK\n", " \n", " \n", - " 6\n", + " 7\n", + " 223\n", + " 231\n", + " 8\n", + " 0.605099\n", + " IMVTIWSA\n", + " \n", + " \n", + " 8\n", + " 258\n", + " 266\n", + " 8\n", + " 0.618778\n", + " ICYKNNCY\n", + " \n", + " \n", + " 9\n", + " 363\n", + " 371\n", + " 8\n", + " 0.602542\n", + " YICMQRTV\n", + " \n", + " \n", + " 10\n", + " 17\n", + " 25\n", + " 8\n", + " 0.605605\n", + " RRWWMLLA\n", + " \n", + " \n", + " 11\n", " 294\n", " 303\n", " 9\n", - " 0.600454\n", + " 0.605901\n", " KEDQDLLKL\n", " \n", " \n", - " 7\n", + " 12\n", " 298\n", " 307\n", " 9\n", - " 0.628539\n", + " 0.635218\n", " DLLKLVKSY\n", " \n", " \n", - " 8\n", - " 74\n", - " 83\n", + " 13\n", + " 235\n", + " 244\n", " 9\n", - " 0.602105\n", - " RRYHPDRYR\n", + " 0.610781\n", + " SLFNQEVQI\n", " \n", " \n", - " 9\n", + " 14\n", + " 221\n", + " 230\n", + " 9\n", + " 0.622273\n", + " FIIMVTIWS\n", + " \n", + " \n", + " 15\n", + " 222\n", + " 231\n", + " 9\n", + " 0.612193\n", + " IIMVTIWSA\n", + " \n", + " \n", + " 16\n", + " 22\n", + " 31\n", + " 9\n", + " 0.604106\n", + " LLAPLLPAL\n", + " \n", + " \n", + " 17\n", + " 257\n", + " 266\n", + " 9\n", + " 0.620672\n", + " WICYKNNCY\n", + " \n", + " \n", + " 18\n", + " 267\n", + " 276\n", + " 9\n", + " 0.614074\n", + " FFDESKNWY\n", + " \n", + " \n", + " 19\n", + " 17\n", + " 26\n", + " 9\n", + " 0.604426\n", + " RRWWMLLAP\n", + " \n", + " \n", + " 20\n", + " 327\n", + " 336\n", + " 9\n", + " 0.614008\n", + " ILSPNLLTI\n", + " \n", + " \n", + " 21\n", + " 255\n", + " 265\n", + " 10\n", + " 0.601901\n", + " KNWICYKNNC\n", + " \n", + " \n", + " 22\n", " 344\n", " 354\n", " 10\n", - " 0.625569\n", + " 0.630664\n", " ALYASSFKGY\n", " \n", " \n", - " 10\n", + " 23\n", " 232\n", " 242\n", " 10\n", - " 0.607737\n", + " 0.634032\n", " FLNSLFNQEV\n", " \n", " \n", - " 11\n", + " 24\n", + " 221\n", + " 231\n", + " 10\n", + " 0.632162\n", + " FIIMVTIWSA\n", + " \n", + " \n", + " 25\n", + " 222\n", + " 232\n", + " 10\n", + " 0.606705\n", + " IIMVTIWSAV\n", + " \n", + " \n", + " 26\n", " 353\n", " 363\n", " 10\n", - " 0.610844\n", + " 0.611286\n", " YIENCSTPNT\n", " \n", " \n", - " 12\n", - " 53\n", - " 63\n", + " 27\n", + " 205\n", + " 215\n", " 10\n", - " 0.608182\n", - " VLGVSRSAGK\n", + " 0.606018\n", + " SPFFFCCFIA\n", " \n", " \n", - " 13\n", - " 298\n", - " 309\n", + " 28\n", + " 195\n", + " 206\n", " 11\n", - " 0.608567\n", - " DLLKLVKSYHW\n", + " 0.607188\n", + " VVKSKCRENAS\n", " \n", " \n", - " 14\n", - " 351\n", - " 362\n", + " 29\n", + " 221\n", + " 232\n", " 11\n", - " 0.607036\n", - " KGYIENCSTPN\n", + " 0.616940\n", + " FIIMVTIWSAV\n", " \n", " \n", - " 15\n", - " 52\n", - " 63\n", + " 30\n", + " 298\n", + " 309\n", " 11\n", - " 0.635592\n", - " EVLGVSRSAGK\n", + " 0.600725\n", + " DLLKLVKSYHW\n", " \n", " \n", - " 16\n", - " 130\n", - " 142\n", + " 31\n", + " 353\n", + " 364\n", + " 11\n", + " 0.618278\n", + " YIENCSTPNTY\n", + " 
\n", + " \n", + " 32\n", + " 42\n", + " 54\n", " 12\n", - " 0.601588\n", - " KDALLVGVPAGS\n", + " 0.606224\n", + " GLYCGTRDCYEV\n", " \n", " \n", - " 17\n", + " 33\n", " 351\n", " 363\n", " 12\n", - " 0.632752\n", + " 0.633097\n", " KGYIENCSTPNT\n", " \n", " \n", - " 18\n", - " 86\n", - " 99\n", + " 34\n", + " 200\n", + " 212\n", + " 12\n", + " 0.608198\n", + " CRENASPFFFCC\n", + " \n", + " \n", + " 35\n", + " 224\n", + " 236\n", + " 12\n", + " 0.606180\n", + " MVTIWSAVFLNS\n", + " \n", + " \n", + " 36\n", + " 195\n", + " 207\n", + " 12\n", + " 0.612207\n", + " VVKSKCRENASP\n", + " \n", + " \n", + " 37\n", + " 166\n", + " 179\n", " 13\n", - " 0.608231\n", - " GDEGPGRTPQSAE\n", + " 0.628934\n", + " SWEMSEFHNYNLD\n", " \n", " \n", - " 19\n", - " 141\n", - " 154\n", + " 38\n", + " 351\n", + " 364\n", " 13\n", - " 0.603257\n", - " SNPFREPRSCALL\n", + " 0.604953\n", + " KGYIENCSTPNTY\n", " \n", " \n", - " 20\n", - " 32\n", - " 45\n", + " 39\n", + " 35\n", + " 48\n", " 13\n", - " 0.608740\n", - " LVRPAGALVEGLY\n", + " 0.601324\n", + " PAGALVEGLYCGT\n", " \n", " \n", - " 21\n", + " 40\n", " 130\n", " 143\n", " 13\n", - " 0.620658\n", + " 0.603384\n", " KDALLVGVPAGSN\n", " \n", " \n", - " 22\n", + " 41\n", + " 333\n", + " 347\n", + " 14\n", + " 0.601238\n", + " LTIIEMQKGDCALY\n", + " \n", + " \n", + " 42\n", " 185\n", " 199\n", " 14\n", - " 0.625906\n", + " 0.610031\n", " STRWQKQRCPVVKS\n", " \n", " \n", - " 23\n", - " 60\n", - " 74\n", + " 43\n", + " 117\n", + " 131\n", " 14\n", - " 0.684522\n", - " AGKAEIARAYRQLA\n", + " 0.600326\n", + " AAELQQYCMQNACK\n", " \n", " \n", "\n", @@ -1717,33 +1877,53 @@ ], "text/plain": [ " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 170 178 8 0.673504 SEFHNYNL\n", - "1 181 189 8 0.617312 KSDFSTRW\n", - "2 309 317 8 0.615951 MGLVHIPT\n", - "3 299 307 8 0.601598 LLKLVKSY\n", - "4 346 354 8 0.609713 YASSFKGY\n", - "5 344 352 8 0.635873 ALYASSFK\n", - "6 294 303 9 0.600454 KEDQDLLKL\n", - "7 298 307 9 0.628539 DLLKLVKSY\n", - "8 74 83 9 0.602105 RRYHPDRYR\n", - "9 344 354 10 0.625569 ALYASSFKGY\n", - "10 232 242 10 0.607737 FLNSLFNQEV\n", - "11 353 363 10 0.610844 YIENCSTPNT\n", - "12 53 63 10 0.608182 VLGVSRSAGK\n", - "13 298 309 11 0.608567 DLLKLVKSYHW\n", - "14 351 362 11 0.607036 KGYIENCSTPN\n", - "15 52 63 11 0.635592 EVLGVSRSAGK\n", - "16 130 142 12 0.601588 KDALLVGVPAGS\n", - "17 351 363 12 0.632752 KGYIENCSTPNT\n", - "18 86 99 13 0.608231 GDEGPGRTPQSAE\n", - "19 141 154 13 0.603257 SNPFREPRSCALL\n", - "20 32 45 13 0.608740 LVRPAGALVEGLY\n", - "21 130 143 13 0.620658 KDALLVGVPAGSN\n", - "22 185 199 14 0.625906 STRWQKQRCPVVKS\n", - "23 60 74 14 0.684522 AGKAEIARAYRQLA" + "0 143 151 8 0.606630 PFREPRSC\n", + "1 170 178 8 0.697908 SEFHNYNL\n", + "2 62 70 8 0.602259 KAEIARAY\n", + "3 87 95 8 0.611214 DEGPGRTP\n", + "4 299 307 8 0.611188 LLKLVKSY\n", + "5 346 354 8 0.620160 YASSFKGY\n", + "6 344 352 8 0.601700 ALYASSFK\n", + "7 223 231 8 0.605099 IMVTIWSA\n", + "8 258 266 8 0.618778 ICYKNNCY\n", + "9 363 371 8 0.602542 YICMQRTV\n", + "10 17 25 8 0.605605 RRWWMLLA\n", + "11 294 303 9 0.605901 KEDQDLLKL\n", + "12 298 307 9 0.635218 DLLKLVKSY\n", + "13 235 244 9 0.610781 SLFNQEVQI\n", + "14 221 230 9 0.622273 FIIMVTIWS\n", + "15 222 231 9 0.612193 IIMVTIWSA\n", + "16 22 31 9 0.604106 LLAPLLPAL\n", + "17 257 266 9 0.620672 WICYKNNCY\n", + "18 267 276 9 0.614074 FFDESKNWY\n", + "19 17 26 9 0.604426 RRWWMLLAP\n", + "20 327 336 9 0.614008 ILSPNLLTI\n", + "21 255 265 10 0.601901 KNWICYKNNC\n", + "22 344 354 10 0.630664 ALYASSFKGY\n", + "23 232 242 10 0.634032 
FLNSLFNQEV\n", + "24 221 231 10 0.632162 FIIMVTIWSA\n", + "25 222 232 10 0.606705 IIMVTIWSAV\n", + "26 353 363 10 0.611286 YIENCSTPNT\n", + "27 205 215 10 0.606018 SPFFFCCFIA\n", + "28 195 206 11 0.607188 VVKSKCRENAS\n", + "29 221 232 11 0.616940 FIIMVTIWSAV\n", + "30 298 309 11 0.600725 DLLKLVKSYHW\n", + "31 353 364 11 0.618278 YIENCSTPNTY\n", + "32 42 54 12 0.606224 GLYCGTRDCYEV\n", + "33 351 363 12 0.633097 KGYIENCSTPNT\n", + "34 200 212 12 0.608198 CRENASPFFFCC\n", + "35 224 236 12 0.606180 MVTIWSAVFLNS\n", + "36 195 207 12 0.612207 VVKSKCRENASP\n", + "37 166 179 13 0.628934 SWEMSEFHNYNLD\n", + "38 351 364 13 0.604953 KGYIENCSTPNTY\n", + "39 35 48 13 0.601324 PAGALVEGLYCGT\n", + "40 130 143 13 0.603384 KDALLVGVPAGSN\n", + "41 333 347 14 0.601238 LTIIEMQKGDCALY\n", + "42 185 199 14 0.610031 STRWQKQRCPVVKS\n", + "43 117 131 14 0.600326 AAELQQYCMQNACK" ] }, - "execution_count": 39, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1768,7 +1948,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1790,7 +1970,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1809,7 +1989,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -2048,7 +2228,7 @@ "[148 rows x 12 columns]" ] }, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2073,7 +2253,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -2227,7 +2407,7 @@ "[148 rows x 6 columns]" ] }, - "execution_count": 25, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2239,7 +2419,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -2430,7 +2610,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 26, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2442,7 +2622,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -2633,7 +2813,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 27, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2651,7 +2831,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -2878,7 +3058,7 @@ "[498 rows x 11 columns]" ] }, - "execution_count": 28, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2898,29 +3078,29 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting RT/IM/MS2 for 400 precursors ...\n", - "2024-07-19 14:19:49> Predicting RT ...\n" + "2024-07-22 09:22:23> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-22 09:22:23> Predicting RT ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 7/7 [00:00<00:00, 65.96it/s]" + "100%|██████████| 7/7 [00:00<00:00, 69.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting mobility ...\n" + "2024-07-22 09:22:23> Predicting mobility ...\n" ] }, { @@ -2928,14 +3108,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 70.12it/s]" + "100%|██████████| 7/7 
[00:00<00:00, 72.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:49> Predicting MS2 ...\n" + "2024-07-22 09:22:23> Predicting MS2 ...\n" ] }, { @@ -2943,14 +3123,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 23.54it/s]" + "100%|██████████| 7/7 [00:00<00:00, 22.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-19 14:19:50> End predicting RT/IM/MS2\n" + "2024-07-22 09:22:24> End predicting RT/IM/MS2\n" ] }, { @@ -2974,7 +3154,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -3357,7 +3537,7 @@ "[400 rows x 21 columns]" ] }, - "execution_count": 30, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -3375,7 +3555,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -3386,14 +3566,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.50s/it]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.51s/it]\n" ] }, { @@ -3422,19 +3602,315 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The peptide sequnces can be matched back to proteins using ProteinInfer, requiring a 'sequence' column. ProteinInfer can be provided with any number of fasta files and lists all the proteins the peptide sequence appears in. This can be done with the sequence output of any search engine or before the library is generated. " + "The peptide sequnces can be matched back to proteins using annotate_precursor_df, requiring a 'sequence' column and a protein_df like the previously loaded fasta file. This can be done with the sequence output of any search engine or before the library is generated. 
" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_ctermgenesproteinscardinality
016817680.793702EMSEFHNY00000FalseFalseA0A024RAP8_HUMANA0A024RAP81
113013880.817415KDALLVGV11111FalseFalseA0A024R161_HUMANA0A024R1611
213714580.751329VPAGSNPF22222FalseFalseA0A024R161_HUMANA0A024R1611
317017880.940019SEFHNYNL33333FalseFalseA0A024RAP8_HUMANA0A024RAP81
418118980.895964KSDFSTRW44444FalseFalseA0A024RAP8_HUMANA0A024RAP81
................................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalseA0A024R161_HUMANA0A024R1611
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalseA0A024RAP8_HUMANA0A024RAP81
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalseA0A024R161_HUMANA0A024R1611
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalseA0A024R161_HUMANA0A024R1611
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalseA0A024R161_HUMANA0A024R1611
\n", + "

148 rows × 15 columns

\n", + "" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \\\n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + " genes proteins cardinality \n", + "0 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "1 A0A024R161_HUMAN A0A024R161 1 \n", + "2 A0A024R161_HUMAN A0A024R161 1 \n", + "3 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "4 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + ".. ... ... ... \n", + "143 A0A024R161_HUMAN A0A024R161 1 \n", + "144 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "145 A0A024R161_HUMAN A0A024R161 1 \n", + "146 A0A024R161_HUMAN A0A024R161 1 \n", + "147 A0A024R161_HUMAN A0A024R161 1 \n", + "\n", + "[148 rows x 15 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# from alphabase.??? import ProteinInfer\n", - "infer = ProteinInfer(fasta_path)\n", - "infer_df = infer.infer_peptides(sequences.sequence.values)\n", - "infer_df" + "from alphabase.protein.fasta import annotate_precursor_df\n", + "inferred_sequences = annotate_precursor_df(sequences, fasta)\n", + "inferred_sequences" ] }, { @@ -3442,9 +3918,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "sequences_infered = pd.merge(sequences, infer_df, how = 'left', on ='sequence')" - ] + "source": [] } ], "metadata": { From dc0326ccc03ee191a8001c1e23e611ef2a802e97 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 10:08:15 +0200 Subject: [PATCH 07/10] #183 ruff reformat --- peptdeep/hla/hla_class1.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py index 67f5e575..f093d53b 100644 --- a/peptdeep/hla/hla_class1.py +++ b/peptdeep/hla/hla_class1.py @@ -380,19 +380,17 @@ def predict_from_proteins( peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) return peptide_df - def _concat_neg_df(self, precursor_df, column_to_train='HLA'): + def _concat_neg_df(self, precursor_df, column_to_train="HLA"): precursor_df = append_nAA_column_if_missing(precursor_df) precursor_df[column_to_train] = 1 df_list = [precursor_df] - for nAA, group_df in precursor_df.groupby('nAA'): + for nAA, group_df in precursor_df.groupby("nAA"): rnd_seqs = get_random_sequences( - self.protein_df, - n=len(group_df), - pep_len = nAA + self.protein_df, n=len(group_df), pep_len=nAA + ) + df_list.append( + pd.DataFrame({"sequence": rnd_seqs, "nAA": nAA, column_to_train: 0}) ) - df_list.append(pd.DataFrame( - {'sequence':rnd_seqs,'nAA':nAA,column_to_train:0} - )) return pd.concat(df_list).reset_index(drop=True) def 
test(self, precursor_df): @@ -402,17 +400,21 @@ def test(self, precursor_df): precision_list = [] recall_list = [] fp_list = [] - for prob in [0.5,0.6,0.7,0.8, 0.9]: + for prob in [0.5, 0.6, 0.7, 0.8, 0.9]: prob_list.append(prob) - precision_list.append(df[df.HLA_prob_pred>prob].HLA.mean()) - recall_list.append(df[df.HLA_prob_pred>prob].HLA.sum()/len(df)*2) - fp_list.append(1-(1-df[df.HLA_prob_pred prob].HLA.mean()) + recall_list.append(df[df.HLA_prob_pred > prob].HLA.sum() / len(df) * 2) + fp_list.append( + 1 - (1 - df[df.HLA_prob_pred < prob].HLA).sum() / len(df) * 2 + ) + return pd.DataFrame( + dict( + HLA_prob_pred=prob_list, + precision=precision_list, + recall=recall_list, + false_positive=fp_list, + ) + ) def _download_pretrained_hla_model(self): download_models(url=self._model_url, target_path=self._model_zip) From d500554a685d14d9d4608c0dfb9dded2450c3488 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 14:27:46 +0200 Subject: [PATCH 08/10] #183 hla tutorial move fasta to cur folder --- docs/tutorials/example.fasta | 9 + .../tutorial_immunopeptidomics.ipynb | 987 +++++++----------- peptdeep/model/model_interface.py | 8 +- 3 files changed, 412 insertions(+), 592 deletions(-) create mode 100644 docs/tutorials/example.fasta diff --git a/docs/tutorials/example.fasta b/docs/tutorials/example.fasta new file mode 100644 index 00000000..5619e28a --- /dev/null +++ b/docs/tutorials/example.fasta @@ -0,0 +1,9 @@ +>tr|A0A024R161|A0A024R161_HUMAN Guanine nucleotide-binding protein subunit gamma OS=Homo sapiens GN=DNAJC25-GNG10 PE=3 SV=1 +MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSA +GKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAEL +QQYCMQNACKDALLVGVPAGSNPFREPRSCALL +>tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, isoform CRA_b OS=Homo sapiens GN=KLRC4-KLRK1 PE=4 SV=1 +MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIA +VAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNW +YESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLT +IIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index 58be219d..c7d54dde 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -41,20 +41,21 @@ "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n", - "WARNING: Ignoring invalid distribution -lpharaw (c:\\users\\wahle\\.conda\\envs\\feng\\lib\\site-packages)\n" - ] } ], "source": [ "%pip install -q pydivsufsort" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -84,51 +85,18 @@ "The protein sequences are concatenated into a single sequence. The sequences are seperated by a sentinel character, in this case '$', so that no peptides across proteins are formed. 
Note that the first and last sentinel characters are crutial as well.\n" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def concat_sequences_for_nonspecific_digestion(seq_list, sep=\"$\"):\n", - " return sep + sep.join(seq_list) + sep" - ] - }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'$MABCDEKFGHIJKLMNOPQRST$FGHIJKLMNOPQR$'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "prot_seq_list = [\"MABCDEKFGHIJKLMNOPQRST\",\"FGHIJKLMNOPQR\"]\n", - "cat_prot = concat_sequences_for_nonspecific_digestion(prot_seq_list, sep=\"$\")\n", - "cat_prot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The same can be done directly from a fasta: \n", - "@ Feng do you have an example fasta somwhere? " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + }, { "data": { "text/html": [ @@ -206,21 +174,21 @@ "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from peptdeep.hla.hla_utils import load_prot_df\n", - "fasta_path = \"D:/Software/FASTA/Human/example.fasta\"\n", - "fasta = load_prot_df(fasta_path)\n", - "fasta" + "fasta_path = \"example.fasta\"\n", + "protein_df = load_prot_df(fasta_path)\n", + "protein_df" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -229,15 +197,15 @@ "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from peptdeep.hla.hla_utils import cat_proteins\n", - "cat_fasta = cat_proteins(fasta['sequence'])\n", - "cat_fasta" + "cat_sequence = cat_proteins(protein_df[\"sequence\"])\n", + "cat_sequence" ] }, { @@ -253,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -359,7 +327,7 @@ "[2443 rows x 2 columns]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -370,7 +338,7 @@ "import sys\n", "\n", "start_idxes, stop_idxes = get_substring_indices(\n", - " cat_fasta, min_len=8, max_len=14, stop_char=\"$\"\n", + " cat_sequence, min_len=8, max_len=14, stop_char=\"$\"\n", ")\n", "digest_pos_df = pd.DataFrame({\n", " \"start_pos\": start_idxes,\n", @@ -381,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -397,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -515,7 +483,7 @@ "[2443 rows x 3 columns]" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -523,13 +491,13 @@ "source": [ "digest_pos_df[\"sequence\"] = digest_pos_df[\n", " 
[\"start_pos\",\"stop_pos\"]\n", - "].apply(lambda x: cat_fasta[slice(*x)], axis=1)\n", + "].apply(lambda x: cat_sequence[slice(*x)], axis=1)\n", "digest_pos_df" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -538,16 +506,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'seq RAM = 0.16621 Mb, idxes RAM = 0.01969, ratio = 8.44230'" + "'seq RAM = 0.16623 Mb, idxes RAM = 0.01971, ratio = 8.43475'" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -560,7 +528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Selection of peptide sequences used for library prediction\n", + "## 2. Selection of peptide sequences used for library prediction\n", "The digest_prot_df contains all unspecifically digested peptide sequences between 7 and 14 aa generatable from the concatenated protein sequences. This list is reduced using a HLA1_Binding_Classifier from peptdeep.hla.hla_class1. Two different model architectures are available, an LSTM model (HLA_Class_I_LSTM) and a BERT model (HLA_Class_I_BERT). A pretrained model is only available for the LSTM model architecture.\n", "The HLA1_Binding_Classifer can be used with a pretrained model, tuned with existing peptide data or trained from scratch. Training of a new model should be considered carefully and will not be covered in this tutorial.\n", " " @@ -570,14 +538,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2 Selection of peptide seqeuence candidates without transferlearning\n", + "### 2.1 Selection of peptide seqeuence candidates without transferlearning\n", "\n", "Selection of peptide sequences for library predicition using the pretrained model can be done in a few steps. First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, we can use any kind of dataframe containing peptide sequences to predict how likely there are HLA peptides, the only requirement beeing that the column containing the peptides is called 'sequence'.\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -679,7 +647,7 @@ " 93\n", " DRYRPQPGDEGPGR\n", " 14\n", - " 0.060635\n", + " 0.060634\n", " \n", " \n", " 2441\n", @@ -712,14 +680,14 @@ "... ... ... ... ... 
...\n", "2438 112 126 KVSQAAAELQQYCM 14 0.243115\n", "2439 317 331 NGSWQWEDGSILSP 14 0.021114\n", - "2440 79 93 DRYRPQPGDEGPGR 14 0.060635\n", + "2440 79 93 DRYRPQPGDEGPGR 14 0.060634\n", "2441 113 127 VSQAAAELQQYCMQ 14 0.355900\n", "2442 190 204 KQRCPVVKSKCREN 14 0.000362\n", "\n", "[2443 rows x 5 columns]" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -742,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -884,7 +852,7 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -902,14 +870,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.27s/it]\n" ] }, { @@ -1051,14 +1019,14 @@ "[148 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sequences = model.predict_from_proteins(fasta, prob_threshold=0.7)\n", - "sequences" + "sequence_df = model.predict_from_proteins(protein_df, prob_threshold=0.7)\n", + "sequence_df" ] }, { @@ -1074,13 +1042,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "model = HLA1_Binding_Classifier()\n", "model.load_pretrained_hla_model()\n", - "model.load_proteins(fasta)" + "model.load_proteins(fasta_path)" ] }, { @@ -1093,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1102,7 +1070,7 @@ "(1954, 489)" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1122,163 +1090,28 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:21:38> Training with fixed sequence length: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=1, lr=2e-05, loss=1.415909733091082\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=2, lr=4e-05, loss=1.0947138496807642\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=3, lr=6e-05, loss=0.8823633790016174\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=4, lr=8e-05, loss=0.7819523641041347\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=5, lr=0.0001, loss=0.7255220583506993\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=6, lr=0.0001, loss=0.705090846334185\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=7, lr=9.045084971874738e-05, loss=0.7013667055538723\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=8, lr=6.545084971874738e-05, loss=0.6968921593257359\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training] Epoch=9, lr=3.4549150281252636e-05, loss=0.6968518495559692\n", - "[Training] Epoch=10, lr=9.549150281252633e-06, loss=0.6932548114231655\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\wahle\\.conda\\envs\\feng\\lib\\site-packages\\torch\\optim\\lr_scheduler.py:156: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. 
Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.\n", - " warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)\n" + "2024-07-23 14:22:06> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, lr=4e-05, loss=1.39779794216156\n", + "[Training] Epoch=2, lr=6e-05, loss=1.0070140702383858\n", + "[Training] Epoch=3, lr=8e-05, loss=0.7982760497501918\n", + "[Training] Epoch=4, lr=0.0001, loss=0.7397338407380241\n", + "[Training] Epoch=5, lr=0.0001, loss=0.7099559647696358\n", + "[Training] Epoch=6, lr=9.045084971874738e-05, loss=0.7016251683235168\n", + "[Training] Epoch=7, lr=6.545084971874738e-05, loss=0.6965694086892265\n", + "[Training] Epoch=8, lr=3.4549150281252636e-05, loss=0.697939566203526\n", + "[Training] Epoch=9, lr=9.549150281252633e-06, loss=0.6959438664572579\n", + "[Training] Epoch=10, lr=1.0000000000000002e-14, loss=0.6928229417119708\n" ] } ], "source": [ - "\n", "model.train(train_seq_df,\n", " epoch=10, warmup_epoch=5, \n", " verbose=True)" @@ -1293,7 +1126,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1327,23 +1160,23 @@ " \n", " 0\n", " 0.5\n", - " 0.496400\n", - " 0.599795\n", - " 0.608495\n", + " 0.511434\n", + " 0.595189\n", + " 0.568577\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.622951\n", - " 0.019447\n", - " 0.011771\n", + " 0.416667\n", + " 0.017912\n", + " 0.025077\n", " \n", " \n", " 2\n", " 0.7\n", - " NaN\n", - " 0.000000\n", - " 0.000000\n", + " 0.333333\n", + " 0.000512\n", + " 0.001024\n", " \n", " \n", " 3\n", @@ -1365,14 +1198,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.496400 0.599795 0.608495\n", - "1 0.6 0.622951 0.019447 0.011771\n", - "2 0.7 NaN 0.000000 0.000000\n", + "0 0.5 0.511434 0.595189 0.568577\n", + "1 0.6 0.416667 0.017912 0.025077\n", + "2 0.7 0.333333 0.000512 0.001024\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1383,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1417,16 +1250,16 @@ " \n", " 0\n", " 0.5\n", - " 0.480159\n", - " 0.494888\n", - " 0.535787\n", + " 0.450192\n", + " 0.480573\n", + " 0.586912\n", " \n", " \n", " 1\n", " 0.6\n", - " 0.461538\n", - " 0.012270\n", - " 0.014315\n", + " 0.470588\n", + " 0.016360\n", + " 0.018405\n", " \n", " \n", " 2\n", @@ -1455,14 +1288,14 @@ ], "text/plain": [ " HLA_prob_pred precision recall false_positive\n", - "0 0.5 0.480159 0.494888 0.535787\n", - "1 0.6 0.461538 0.012270 0.014315\n", + "0 0.5 0.450192 0.480573 0.586912\n", + "1 0.6 0.470588 0.016360 0.018405\n", "2 0.7 NaN 0.000000 0.000000\n", "3 0.8 NaN 0.000000 0.000000\n", "4 0.9 NaN 0.000000 0.000000" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1480,14 +1313,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.20it/s]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.32s/it]\n" ] }, { @@ -1521,422 +1354,413 @@ " \n", " \n", " 0\n", - " 143\n", - " 151\n", - " 8\n", - " 0.606630\n", - " PFREPRSC\n", - " \n", - " \n", - " 1\n", " 170\n", " 178\n", " 8\n", - " 0.697908\n", + " 0.711809\n", " SEFHNYNL\n", " \n", " \n", - " 2\n", + " 1\n", 
" 62\n", " 70\n", " 8\n", - " 0.602259\n", + " 0.627015\n", " KAEIARAY\n", " \n", " \n", - " 3\n", - " 87\n", - " 95\n", + " 2\n", + " 106\n", + " 114\n", " 8\n", - " 0.611214\n", - " DEGPGRTP\n", + " 0.628822\n", + " TAYETLKV\n", " \n", " \n", - " 4\n", + " 3\n", " 299\n", " 307\n", " 8\n", - " 0.611188\n", + " 0.605544\n", " LLKLVKSY\n", " \n", " \n", - " 5\n", + " 4\n", " 346\n", " 354\n", " 8\n", - " 0.620160\n", + " 0.646759\n", " YASSFKGY\n", " \n", " \n", - " 6\n", - " 344\n", - " 352\n", - " 8\n", - " 0.601700\n", - " ALYASSFK\n", - " \n", - " \n", - " 7\n", - " 223\n", - " 231\n", - " 8\n", - " 0.605099\n", - " IMVTIWSA\n", - " \n", - " \n", - " 8\n", + " 5\n", " 258\n", " 266\n", " 8\n", - " 0.618778\n", + " 0.624555\n", " ICYKNNCY\n", " \n", " \n", - " 9\n", - " 363\n", - " 371\n", - " 8\n", - " 0.602542\n", - " YICMQRTV\n", - " \n", - " \n", - " 10\n", - " 17\n", - " 25\n", - " 8\n", - " 0.605605\n", - " RRWWMLLA\n", - " \n", - " \n", - " 11\n", + " 6\n", " 294\n", " 303\n", " 9\n", - " 0.605901\n", + " 0.610476\n", " KEDQDLLKL\n", " \n", " \n", - " 12\n", + " 7\n", " 298\n", " 307\n", " 9\n", - " 0.635218\n", + " 0.645020\n", " DLLKLVKSY\n", " \n", " \n", - " 13\n", + " 8\n", " 235\n", " 244\n", " 9\n", - " 0.610781\n", + " 0.629079\n", " SLFNQEVQI\n", " \n", " \n", - " 14\n", - " 221\n", - " 230\n", - " 9\n", - " 0.622273\n", - " FIIMVTIWS\n", - " \n", - " \n", - " 15\n", - " 222\n", - " 231\n", - " 9\n", - " 0.612193\n", - " IIMVTIWSA\n", - " \n", - " \n", - " 16\n", - " 22\n", - " 31\n", - " 9\n", - " 0.604106\n", - " LLAPLLPAL\n", - " \n", - " \n", - " 17\n", + " 9\n", " 257\n", " 266\n", " 9\n", - " 0.620672\n", + " 0.623247\n", " WICYKNNCY\n", " \n", " \n", - " 18\n", + " 10\n", " 267\n", " 276\n", " 9\n", - " 0.614074\n", + " 0.611738\n", " FFDESKNWY\n", " \n", " \n", - " 19\n", + " 11\n", " 17\n", " 26\n", " 9\n", - " 0.604426\n", + " 0.605875\n", " RRWWMLLAP\n", " \n", " \n", - " 20\n", + " 12\n", " 327\n", " 336\n", " 9\n", - " 0.614008\n", + " 0.616737\n", " ILSPNLLTI\n", " \n", " \n", - " 21\n", - " 255\n", - " 265\n", - " 10\n", - " 0.601901\n", - " KNWICYKNNC\n", + " 13\n", + " 74\n", + " 83\n", + " 9\n", + " 0.611590\n", + " RRYHPDRYR\n", " \n", " \n", - " 22\n", + " 14\n", " 344\n", " 354\n", " 10\n", - " 0.630664\n", + " 0.662783\n", " ALYASSFKGY\n", " \n", " \n", - " 23\n", + " 15\n", " 232\n", " 242\n", " 10\n", - " 0.634032\n", + " 0.651600\n", " FLNSLFNQEV\n", " \n", " \n", - " 24\n", + " 16\n", " 221\n", " 231\n", " 10\n", - " 0.632162\n", + " 0.617175\n", " FIIMVTIWSA\n", " \n", " \n", - " 25\n", + " 17\n", " 222\n", " 232\n", " 10\n", - " 0.606705\n", + " 0.600623\n", " IIMVTIWSAV\n", " \n", " \n", - " 26\n", - " 353\n", - " 363\n", - " 10\n", - " 0.611286\n", - " YIENCSTPNT\n", - " \n", - " \n", - " 27\n", - " 205\n", - " 215\n", + " 18\n", + " 74\n", + " 84\n", " 10\n", - " 0.606018\n", - " SPFFFCCFIA\n", + " 0.614895\n", + " RRYHPDRYRP\n", " \n", " \n", - " 28\n", - " 195\n", - " 206\n", - " 11\n", - " 0.607188\n", - " VVKSKCRENAS\n", - " \n", - " \n", - " 29\n", + " 19\n", " 221\n", " 232\n", " 11\n", - " 0.616940\n", + " 0.608950\n", " FIIMVTIWSAV\n", " \n", " \n", - " 30\n", - " 298\n", - " 309\n", - " 11\n", - " 0.600725\n", - " DLLKLVKSYHW\n", - " \n", - " \n", - " 31\n", + " 20\n", " 353\n", " 364\n", " 11\n", - " 0.618278\n", + " 0.613787\n", " YIENCSTPNTY\n", " \n", " \n", - " 32\n", + " 21\n", + " 74\n", + " 85\n", + " 11\n", + " 0.605368\n", + " RRYHPDRYRPQ\n", + " \n", + " \n", + " 22\n", + " 112\n", + " 124\n", + " 12\n", + " 0.612270\n", 
+ " KVSQAAAELQQY\n", + " \n", + " \n", + " 23\n", " 42\n", " 54\n", " 12\n", - " 0.606224\n", + " 0.607715\n", " GLYCGTRDCYEV\n", " \n", " \n", - " 33\n", + " 24\n", " 351\n", " 363\n", " 12\n", - " 0.633097\n", + " 0.616891\n", " KGYIENCSTPNT\n", " \n", " \n", - " 34\n", - " 200\n", - " 212\n", + " 25\n", + " 74\n", + " 86\n", " 12\n", - " 0.608198\n", - " CRENASPFFFCC\n", + " 0.602210\n", + " RRYHPDRYRPQP\n", " \n", " \n", - " 35\n", - " 224\n", - " 236\n", - " 12\n", - " 0.606180\n", - " MVTIWSAVFLNS\n", + " 26\n", + " 86\n", + " 99\n", + " 13\n", + " 0.644656\n", + " GDEGPGRTPQSAE\n", " \n", " \n", - " 36\n", - " 195\n", - " 207\n", - " 12\n", - " 0.612207\n", - " VVKSKCRENASP\n", + " 27\n", + " 351\n", + " 364\n", + " 13\n", + " 0.603497\n", + " KGYIENCSTPNTY\n", " \n", " \n", - " 37\n", - " 166\n", - " 179\n", + " 28\n", + " 73\n", + " 86\n", " 13\n", - " 0.628934\n", - " SWEMSEFHNYNLD\n", + " 0.622453\n", + " ARRYHPDRYRPQP\n", " \n", " \n", - " 38\n", - " 351\n", - " 364\n", + " 29\n", + " 74\n", + " 87\n", " 13\n", - " 0.604953\n", - " KGYIENCSTPNTY\n", + " 0.611441\n", + " RRYHPDRYRPQPG\n", " \n", " \n", - " 39\n", - " 35\n", - " 48\n", + " 30\n", + " 334\n", + " 347\n", " 13\n", - " 0.601324\n", - " PAGALVEGLYCGT\n", + " 0.604354\n", + " TIIEMQKGDCALY\n", " \n", " \n", - " 40\n", + " 31\n", + " 141\n", + " 154\n", + " 13\n", + " 0.601309\n", + " SNPFREPRSCALL\n", + " \n", + " \n", + " 32\n", + " 32\n", + " 45\n", + " 13\n", + " 0.622797\n", + " LVRPAGALVEGLY\n", + " \n", + " \n", + " 33\n", " 130\n", " 143\n", " 13\n", - " 0.603384\n", + " 0.604786\n", " KDALLVGVPAGSN\n", " \n", " \n", - " 41\n", + " 34\n", " 333\n", " 347\n", " 14\n", - " 0.601238\n", + " 0.613545\n", " LTIIEMQKGDCALY\n", " \n", " \n", - " 42\n", - " 185\n", - " 199\n", + " 35\n", + " 60\n", + " 74\n", + " 14\n", + " 0.607648\n", + " AGKAEIARAYRQLA\n", + " \n", + " \n", + " 36\n", + " 85\n", + " 99\n", + " 14\n", + " 0.606241\n", + " PGDEGPGRTPQSAE\n", + " \n", + " \n", + " 37\n", + " 229\n", + " 243\n", + " 14\n", + " 0.606759\n", + " SAVFLNSLFNQEVQ\n", + " \n", + " \n", + " 38\n", + " 86\n", + " 100\n", + " 14\n", + " 0.622891\n", + " GDEGPGRTPQSAEE\n", + " \n", + " \n", + " 39\n", + " 167\n", + " 181\n", " 14\n", - " 0.610031\n", - " STRWQKQRCPVVKS\n", + " 0.611953\n", + " WEMSEFHNYNLDLK\n", " \n", " \n", - " 43\n", + " 40\n", " 117\n", " 131\n", " 14\n", - " 0.600326\n", + " 0.619257\n", " AAELQQYCMQNACK\n", " \n", + " \n", + " 41\n", + " 73\n", + " 87\n", + " 14\n", + " 0.608767\n", + " ARRYHPDRYRPQPG\n", + " \n", + " \n", + " 42\n", + " 329\n", + " 343\n", + " 14\n", + " 0.600299\n", + " SPNLLTIIEMQKGD\n", + " \n", " \n", "\n", "" ], "text/plain": [ " start_pos stop_pos nAA HLA_prob_pred sequence\n", - "0 143 151 8 0.606630 PFREPRSC\n", - "1 170 178 8 0.697908 SEFHNYNL\n", - "2 62 70 8 0.602259 KAEIARAY\n", - "3 87 95 8 0.611214 DEGPGRTP\n", - "4 299 307 8 0.611188 LLKLVKSY\n", - "5 346 354 8 0.620160 YASSFKGY\n", - "6 344 352 8 0.601700 ALYASSFK\n", - "7 223 231 8 0.605099 IMVTIWSA\n", - "8 258 266 8 0.618778 ICYKNNCY\n", - "9 363 371 8 0.602542 YICMQRTV\n", - "10 17 25 8 0.605605 RRWWMLLA\n", - "11 294 303 9 0.605901 KEDQDLLKL\n", - "12 298 307 9 0.635218 DLLKLVKSY\n", - "13 235 244 9 0.610781 SLFNQEVQI\n", - "14 221 230 9 0.622273 FIIMVTIWS\n", - "15 222 231 9 0.612193 IIMVTIWSA\n", - "16 22 31 9 0.604106 LLAPLLPAL\n", - "17 257 266 9 0.620672 WICYKNNCY\n", - "18 267 276 9 0.614074 FFDESKNWY\n", - "19 17 26 9 0.604426 RRWWMLLAP\n", - "20 327 336 9 0.614008 ILSPNLLTI\n", - "21 255 265 10 0.601901 
KNWICYKNNC\n", - "22 344 354 10 0.630664 ALYASSFKGY\n", - "23 232 242 10 0.634032 FLNSLFNQEV\n", - "24 221 231 10 0.632162 FIIMVTIWSA\n", - "25 222 232 10 0.606705 IIMVTIWSAV\n", - "26 353 363 10 0.611286 YIENCSTPNT\n", - "27 205 215 10 0.606018 SPFFFCCFIA\n", - "28 195 206 11 0.607188 VVKSKCRENAS\n", - "29 221 232 11 0.616940 FIIMVTIWSAV\n", - "30 298 309 11 0.600725 DLLKLVKSYHW\n", - "31 353 364 11 0.618278 YIENCSTPNTY\n", - "32 42 54 12 0.606224 GLYCGTRDCYEV\n", - "33 351 363 12 0.633097 KGYIENCSTPNT\n", - "34 200 212 12 0.608198 CRENASPFFFCC\n", - "35 224 236 12 0.606180 MVTIWSAVFLNS\n", - "36 195 207 12 0.612207 VVKSKCRENASP\n", - "37 166 179 13 0.628934 SWEMSEFHNYNLD\n", - "38 351 364 13 0.604953 KGYIENCSTPNTY\n", - "39 35 48 13 0.601324 PAGALVEGLYCGT\n", - "40 130 143 13 0.603384 KDALLVGVPAGSN\n", - "41 333 347 14 0.601238 LTIIEMQKGDCALY\n", - "42 185 199 14 0.610031 STRWQKQRCPVVKS\n", - "43 117 131 14 0.600326 AAELQQYCMQNACK" + "0 170 178 8 0.711809 SEFHNYNL\n", + "1 62 70 8 0.627015 KAEIARAY\n", + "2 106 114 8 0.628822 TAYETLKV\n", + "3 299 307 8 0.605544 LLKLVKSY\n", + "4 346 354 8 0.646759 YASSFKGY\n", + "5 258 266 8 0.624555 ICYKNNCY\n", + "6 294 303 9 0.610476 KEDQDLLKL\n", + "7 298 307 9 0.645020 DLLKLVKSY\n", + "8 235 244 9 0.629079 SLFNQEVQI\n", + "9 257 266 9 0.623247 WICYKNNCY\n", + "10 267 276 9 0.611738 FFDESKNWY\n", + "11 17 26 9 0.605875 RRWWMLLAP\n", + "12 327 336 9 0.616737 ILSPNLLTI\n", + "13 74 83 9 0.611590 RRYHPDRYR\n", + "14 344 354 10 0.662783 ALYASSFKGY\n", + "15 232 242 10 0.651600 FLNSLFNQEV\n", + "16 221 231 10 0.617175 FIIMVTIWSA\n", + "17 222 232 10 0.600623 IIMVTIWSAV\n", + "18 74 84 10 0.614895 RRYHPDRYRP\n", + "19 221 232 11 0.608950 FIIMVTIWSAV\n", + "20 353 364 11 0.613787 YIENCSTPNTY\n", + "21 74 85 11 0.605368 RRYHPDRYRPQ\n", + "22 112 124 12 0.612270 KVSQAAAELQQY\n", + "23 42 54 12 0.607715 GLYCGTRDCYEV\n", + "24 351 363 12 0.616891 KGYIENCSTPNT\n", + "25 74 86 12 0.602210 RRYHPDRYRPQP\n", + "26 86 99 13 0.644656 GDEGPGRTPQSAE\n", + "27 351 364 13 0.603497 KGYIENCSTPNTY\n", + "28 73 86 13 0.622453 ARRYHPDRYRPQP\n", + "29 74 87 13 0.611441 RRYHPDRYRPQPG\n", + "30 334 347 13 0.604354 TIIEMQKGDCALY\n", + "31 141 154 13 0.601309 SNPFREPRSCALL\n", + "32 32 45 13 0.622797 LVRPAGALVEGLY\n", + "33 130 143 13 0.604786 KDALLVGVPAGSN\n", + "34 333 347 14 0.613545 LTIIEMQKGDCALY\n", + "35 60 74 14 0.607648 AGKAEIARAYRQLA\n", + "36 85 99 14 0.606241 PGDEGPGRTPQSAE\n", + "37 229 243 14 0.606759 SAVFLNSLFNQEVQ\n", + "38 86 100 14 0.622891 GDEGPGRTPQSAEE\n", + "39 167 181 14 0.611953 WEMSEFHNYNLDLK\n", + "40 117 131 14 0.619257 AAELQQYCMQNACK\n", + "41 73 87 14 0.608767 ARRYHPDRYRPQPG\n", + "42 329 343 14 0.600299 SPNLLTIIEMQKGD" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.predict_from_proteins(fasta, prob_threshold=0.6)" + "model.predict_from_proteins(fasta_path, prob_threshold=0.6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectral library prediciton" + "## 3. 
Spectral library prediciton" ] }, { @@ -1948,7 +1772,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1970,7 +1794,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1989,7 +1813,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -2228,20 +2052,20 @@ "[148 rows x 12 columns]" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sequences['protein_id'] = [str(i) for i in range(len(sequences))]\n", - "sequences['protein_idxes'] = sequences.protein_id.astype(\"U\")\n", - "sequences['full_name'] = sequences['protein_id'] \n", - "sequences['gene_org'] = sequences['protein_id'] \n", - "sequences['gene_name'] = sequences['protein_id']\n", - "sequences[\"is_prot_nterm\"] = False\n", - "sequences[\"is_prot_cterm\"] = False\n", - "sequences" + "sequence_df['protein_id'] = [str(i) for i in range(len(sequence_df))]\n", + "sequence_df['protein_idxes'] = sequence_df.protein_id.astype(\"U\")\n", + "sequence_df['full_name'] = sequence_df['protein_id'] \n", + "sequence_df['gene_org'] = sequence_df['protein_id'] \n", + "sequence_df['gene_name'] = sequence_df['protein_id']\n", + "sequence_df[\"is_prot_nterm\"] = False\n", + "sequence_df[\"is_prot_cterm\"] = False\n", + "sequence_df" ] }, { @@ -2253,7 +2077,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -2407,19 +2231,21 @@ "[148 rows x 6 columns]" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "speclib.protein_df = sequences[[\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']].copy()\n", + "speclib.protein_df = sequence_df[\n", + " [\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']\n", + "].copy()\n", "speclib.protein_df" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -2610,19 +2436,22 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "speclib.precursor_df = sequences[[\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']].copy()\n", + "speclib.precursor_df = sequence_df[\n", + " [\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\n", + " \"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']\n", + "].copy()\n", "speclib.precursor_df" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -2813,7 +2642,7 @@ "[148 rows x 8 columns]" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2831,7 +2660,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -3058,7 +2887,7 @@ "[498 rows x 11 columns]" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -3078,29 +2907,29 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting RT/IM/MS2 for 400 precursors ...\n", - "2024-07-22 09:22:23> 
Predicting RT ...\n" + "2024-07-23 14:22:43> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-23 14:22:43> Predicting RT ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 7/7 [00:00<00:00, 69.31it/s]" + "100%|██████████| 7/7 [00:00<00:00, 27.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting mobility ...\n" + "2024-07-23 14:22:43> Predicting mobility ...\n" ] }, { @@ -3108,14 +2937,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 72.89it/s]" + "100%|██████████| 7/7 [00:00<00:00, 50.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:23> Predicting MS2 ...\n" + "2024-07-23 14:22:44> Predicting MS2 ...\n" ] }, { @@ -3123,14 +2952,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 7/7 [00:00<00:00, 22.52it/s]" + "100%|██████████| 7/7 [00:00<00:00, 23.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-07-22 09:22:24> End predicting RT/IM/MS2\n" + "2024-07-23 14:22:44> End predicting RT/IM/MS2\n" ] }, { @@ -3154,7 +2983,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -3164,7 +2993,7 @@ "Predict RT for 11 iRT precursors.\n", "Linear regression of `rt_pred` to `irt`:\n", " R_square R slope intercept test_num\n", - "0 0.99007 0.995022 152.235621 -39.23216 11\n" + "0 0.99007 0.995022 152.235639 -39.232164 11\n" ] }, { @@ -3228,13 +3057,13 @@ " 1072.404037\n", " 0.189650\n", " 0.189650\n", - " 254.195923\n", + " 254.195892\n", " 1.253140\n", " 30.0\n", " Lumos\n", " 0\n", " 7\n", - " -10.360729\n", + " -10.360738\n", " \n", " \n", " 1\n", @@ -3258,7 +3087,7 @@ " Lumos\n", " 7\n", " 14\n", - " -10.360729\n", + " -10.360738\n", " \n", " \n", " 2\n", @@ -3276,13 +3105,13 @@ " 1056.409123\n", " 0.289261\n", " 0.289261\n", - " 255.103760\n", + " 255.103699\n", " 1.257373\n", " 30.0\n", " Lumos\n", " 14\n", " 21\n", - " 4.803679\n", + " 4.803681\n", " \n", " \n", " 3\n", @@ -3306,7 +3135,7 @@ " Lumos\n", " 21\n", " 28\n", - " 4.803679\n", + " 4.803681\n", " \n", " \n", " 4\n", @@ -3324,13 +3153,13 @@ " 814.503280\n", " 0.433791\n", " 0.433791\n", - " 256.615234\n", + " 256.615204\n", " 1.260001\n", " 30.0\n", " Lumos\n", " 28\n", " 35\n", - " 26.806266\n", + " 26.806270\n", " \n", " \n", " ...\n", @@ -3372,13 +3201,13 @@ " 775.414662\n", " 0.489545\n", " 0.489545\n", - " 429.360870\n", + " 429.360901\n", " 1.062514\n", " 30.0\n", " Lumos\n", " 3810\n", " 3823\n", - " 35.294021\n", + " 35.294030\n", " \n", " \n", " 396\n", @@ -3396,13 +3225,13 @@ " 517.278867\n", " 0.489545\n", " 0.489545\n", - " 463.231110\n", + " 463.231049\n", " 0.764225\n", " 30.0\n", " Lumos\n", " 3823\n", " 3836\n", - " 35.294021\n", + " 35.294030\n", " \n", " \n", " 397\n", @@ -3426,7 +3255,7 @@ " Lumos\n", " 3836\n", " 3849\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", " 398\n", @@ -3444,13 +3273,13 @@ " 721.376009\n", " 0.377743\n", " 0.377743\n", - " 404.633667\n", + " 404.633698\n", " 1.000659\n", " 30.0\n", " Lumos\n", " 3849\n", " 3862\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", " 399\n", @@ -3468,13 +3297,13 @@ " 481.253098\n", " 0.377743\n", " 0.377743\n", - " 476.655640\n", + " 476.655701\n", " 0.785851\n", " 30.0\n", " Lumos\n", " 3862\n", " 3875\n", - " 18.273781\n", + " 18.273780\n", " \n", " \n", "\n", @@ -3509,35 +3338,35 @@ "399 False False ... 
481.253098 \n", "\n", " rt_pred rt_norm_pred ccs_pred mobility_pred nce instrument \\\n", - "0 0.189650 0.189650 254.195923 1.253140 30.0 Lumos \n", + "0 0.189650 0.189650 254.195892 1.253140 30.0 Lumos \n", "1 0.189650 0.189650 337.328583 0.831494 30.0 Lumos \n", - "2 0.289261 0.289261 255.103760 1.257373 30.0 Lumos \n", + "2 0.289261 0.289261 255.103699 1.257373 30.0 Lumos \n", "3 0.289261 0.289261 337.444641 0.831621 30.0 Lumos \n", - "4 0.433791 0.433791 256.615234 1.260001 30.0 Lumos \n", + "4 0.433791 0.433791 256.615204 1.260001 30.0 Lumos \n", ".. ... ... ... ... ... ... \n", - "395 0.489545 0.489545 429.360870 1.062514 30.0 Lumos \n", - "396 0.489545 0.489545 463.231110 0.764225 30.0 Lumos \n", + "395 0.489545 0.489545 429.360901 1.062514 30.0 Lumos \n", + "396 0.489545 0.489545 463.231049 0.764225 30.0 Lumos \n", "397 0.377743 0.377743 289.200989 1.430378 30.0 Lumos \n", - "398 0.377743 0.377743 404.633667 1.000659 30.0 Lumos \n", - "399 0.377743 0.377743 476.655640 0.785851 30.0 Lumos \n", + "398 0.377743 0.377743 404.633698 1.000659 30.0 Lumos \n", + "399 0.377743 0.377743 476.655701 0.785851 30.0 Lumos \n", "\n", " frag_start_idx frag_stop_idx irt_pred \n", - "0 0 7 -10.360729 \n", - "1 7 14 -10.360729 \n", - "2 14 21 4.803679 \n", - "3 21 28 4.803679 \n", - "4 28 35 26.806266 \n", + "0 0 7 -10.360738 \n", + "1 7 14 -10.360738 \n", + "2 14 21 4.803681 \n", + "3 21 28 4.803681 \n", + "4 28 35 26.806270 \n", ".. ... ... ... \n", - "395 3810 3823 35.294021 \n", - "396 3823 3836 35.294021 \n", - "397 3836 3849 18.273781 \n", - "398 3849 3862 18.273781 \n", - "399 3862 3875 18.273781 \n", + "395 3810 3823 35.294030 \n", + "396 3823 3836 35.294030 \n", + "397 3836 3849 18.273780 \n", + "398 3849 3862 18.273780 \n", + "399 3862 3875 18.273780 \n", "\n", "[400 rows x 21 columns]" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -3555,47 +3384,31 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", - "\n", - "speclib.save_hdf(hdf_path)" + "# hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", + "# tsv_path = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\"\n", + "# speclib.save_hdf(hdf_path) # save as hdf speclib" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.51s/it]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Translation finished, it will take several minutes to export the rest precursors to the tsv file...\n" - ] - } - ], + "outputs": [], "source": [ "from peptdeep.spec_lib.translate import translate_to_tsv\n", "speclib.append_protein_name()\n", - "translate_to_tsv(speclib=speclib, \n", - " tsv = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\")" + "# translate_to_tsv(speclib=speclib, tsv = tsv_path) # save as tsv speclib" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 4. matching peptides back to proteins" + "## 4. 
Matching peptides back to proteins" ] }, { @@ -3607,14 +3420,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 2/2 [00:00 float: """ From a2cff7337bc8ac101fb44ab46e1ceaeacdc3ce8a Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 14:40:36 +0200 Subject: [PATCH 09/10] some types --- docs/tutorials/tutorial_immunopeptidomics.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb index c7d54dde..eb536a8a 100644 --- a/docs/tutorials/tutorial_immunopeptidomics.ipynb +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -1808,7 +1808,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins the peptide sequences could originate from can be infered using prot_infer (demonstrated below). " + "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, proteins of the peptide sequences may originate from can be infered using `alphabase.protein.fasta.annotate_precursor_df()` (demonstrated below)." ] }, { From 7d903815157089597c9d0a8a7fccafb15dbdea95 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 23 Jul 2024 23:29:20 +0200 Subject: [PATCH 10/10] #183 to make tests work --- docs/notebooks.rst | 2 +- nbs_tests/mass_spec/mass_calibration.ipynb | 37 +- nbs_tests/mass_spec/match.ipynb | 13 + nbs_tests/mass_spec/ms_reader.ipynb | 9 + nbs_tests/model/ccs.ipynb | 2 +- nbs_tests/model/featurize.ipynb | 4 +- nbs_tests/model/ms2.ipynb | 2 +- nbs_tests/model/rt.ipynb | 2 +- nbs_tests/pipeline_api.ipynb | 9 + nbs_tests/protein/fasta.ipynb | 349 ++++++++++-------- .../maxquant_frag_reader.ipynb | 71 +++- nbs_tests/spec_lib/library_factory.ipynb | 9 + nbs_tests/spec_lib/predict_lib.ipynb | 9 + nbs_tests/spec_lib/test_translate_tsv.ipynb | 2 +- peptdeep/protein/fasta.py | 2 +- tests/run_tests.sh | 8 +- 16 files changed, 345 insertions(+), 185 deletions(-) diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 03646eb5..701040d6 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -6,10 +6,10 @@ Tutorials and notebooks about how to use AlphaPeptDeep .. 
toctree:: :maxdepth: 1 + tutorials/tutorial_immunopeptidomics nbs/tutorial_models_from_scratch nbs/tutorial_speclib_from_fasta nbs/alphapeptdeep_hdf_to_tsv nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model - nbs/tutorials/tutorial_immunopeptidomics diff --git a/nbs_tests/mass_spec/mass_calibration.ipynb b/nbs_tests/mass_spec/mass_calibration.ipynb index 6ee0cbe6..16140e9f 100644 --- a/nbs_tests/mass_spec/mass_calibration.ipynb +++ b/nbs_tests/mass_spec/mass_calibration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,16 +18,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.mass_spec.mass_calibration import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -112,7 +129,7 @@ "7 0.0 1.0" ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -146,6 +163,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/match.ipynb b/nbs_tests/mass_spec/match.ipynb index 3a029140..cce9bee6 100644 --- a/nbs_tests/mass_spec/match.ipynb +++ b/nbs_tests/mass_spec/match.ipynb @@ -16,6 +16,15 @@ "# Match" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -377,6 +386,10 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/ms_reader.ipynb b/nbs_tests/mass_spec/ms_reader.ipynb index 4ea3bd81..7064c883 100644 --- a/nbs_tests/mass_spec/ms_reader.ipynb +++ b/nbs_tests/mass_spec/ms_reader.ipynb @@ -16,6 +16,15 @@ "# MS Reader" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/nbs_tests/model/ccs.ipynb b/nbs_tests/model/ccs.ipynb index a9f808fa..4e84ebb5 100644 --- a/nbs_tests/model/ccs.ipynb +++ b/nbs_tests/model/ccs.ipynb @@ -327,7 +327,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'charge': [2]*repeat,\n", diff --git a/nbs_tests/model/featurize.ipynb b/nbs_tests/model/featurize.ipynb index 7eafdd2b..8adb958b 100644 --- 
a/nbs_tests/model/featurize.ipynb +++ b/nbs_tests/model/featurize.ipynb @@ -66,10 +66,10 @@ "outputs": [], "source": [ "#| hide\n", - "x = parse_mod_feature(5, ['Acetyl@Protein N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", + "x = parse_mod_feature(5, ['Acetyl@Protein_N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", "assert x.shape == (7, mod_feature_size)\n", "assert np.all(x[1,:]==MOD_TO_FEATURE['Oxidation@M'])\n", - "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein N-term'])\n", + "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein_N-term'])\n", "assert np.all(x[-1,:]==MOD_TO_FEATURE['Phospho@S'])\n", "assert np.all(x[(2,3,4,5),:]==0)" ] diff --git a/nbs_tests/model/ms2.ipynb b/nbs_tests/model/ms2.ipynb index 9fe774e6..c820dfba 100644 --- a/nbs_tests/model/ms2.ipynb +++ b/nbs_tests/model/ms2.ipynb @@ -396,7 +396,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'nce': [20]*repeat,\n", diff --git a/nbs_tests/model/rt.ipynb b/nbs_tests/model/rt.ipynb index 9bf8803e..ed952b40 100644 --- a/nbs_tests/model/rt.ipynb +++ b/nbs_tests/model/rt.ipynb @@ -135,7 +135,7 @@ "def create_test_dataframe_with_identical_rows(nrows = 10):\n", " precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*nrows,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", " 'mod_sites': ['0;4;8']*nrows,\n", " 'nAA': [11]*nrows,\n", " 'rt_norm': [0.6]*nrows\n", diff --git a/nbs_tests/pipeline_api.ipynb b/nbs_tests/pipeline_api.ipynb index 4d81db6f..a5678902 100644 --- a/nbs_tests/pipeline_api.ipynb +++ b/nbs_tests/pipeline_api.ipynb @@ -37,6 +37,15 @@ "The refined models will be saved in the path pointed by \"PEPTDEEP_HOME\" in `peptdeep.settings.global_settings`." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/protein/fasta.ipynb b/nbs_tests/protein/fasta.ipynb index 1256ad16..3c8af3a0 100644 --- a/nbs_tests/protein/fasta.ipynb +++ b/nbs_tests/protein/fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -35,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -197,7 +206,7 @@ "8 False 20 " ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -276,7 +285,7 @@ "1 yy gene FGHIJKLMNOPQR" ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -287,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -469,7 +478,7 @@ "8 False 20 xx " ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -584,7 +593,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -597,7 +606,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -675,7 +684,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -714,7 +723,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -727,7 +736,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -766,7 +775,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -779,7 +788,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -844,7 +853,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -857,7 +866,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -922,7 +931,7 @@ " 2\n", " 
True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -935,7 +944,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -948,7 +957,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -961,7 +970,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1009,36 +1018,36 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M \n", "7 True \n", "8 False Carbamidomethyl@C \n", - "9 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "9 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "10 False Oxidation@M;Carbamidomethyl@C \n", "11 False Carbamidomethyl@C \n", - "12 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "12 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 True Oxidation@M \n", "15 True \n", - "16 True Acetyl@Protein N-term;Oxidation@M \n", - "17 True Acetyl@Protein N-term \n", + "16 True Acetyl@Protein_N-term;Oxidation@M \n", + "17 True Acetyl@Protein_N-term \n", "18 True Oxidation@M \n", "19 True \n", "20 False Oxidation@M;Carbamidomethyl@C \n", "21 False Carbamidomethyl@C \n", - "22 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "23 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "22 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "23 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "24 False Oxidation@M;Carbamidomethyl@C \n", "25 False Oxidation@M;Carbamidomethyl@C \n", "26 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "27 False Carbamidomethyl@C \n", - "28 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "29 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "28 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "29 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... 
\n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1075,7 +1084,7 @@ "31 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1103,14 +1112,14 @@ " else:\n", " assert 'Carbamidomethyl@C' not in mods\n", " # test Acetyl@Protein N-term\n", - " if 'Acetyl@Protein N-term' in mods:\n", + " if 'Acetyl@Protein_N-term' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", " assert '0' in sites\n", " if '0' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", - " assert 'Acetyl@Protein N-term' in mods\n", + " assert 'Acetyl@Protein_N-term' in mods\n", " if not _lib.precursor_df.is_prot_nterm[i]:\n", - " assert 'Acetyl@Protein N-term' not in mods\n", + " assert 'Acetyl@Protein_N-term' not in mods\n", " # test Oxidation@M\n", " if 'Oxidation@M' in mods:\n", " assert 'M' in seq\n", @@ -1133,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1203,7 +1212,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1216,7 +1225,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1346,7 +1355,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -1385,7 +1394,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -1398,7 +1407,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -1437,7 +1446,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -1450,7 +1459,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -1567,7 +1576,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -1580,7 +1589,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -1645,7 +1654,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -1658,7 +1667,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -1671,7 +1680,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -1684,7 +1693,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 
Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1740,8 +1749,8 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M;Phospho@S \n", @@ -1751,15 +1760,15 @@ "10 True Phospho@T \n", "11 True \n", "12 False Carbamidomethyl@C \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 False Oxidation@M;Carbamidomethyl@C \n", "15 False Carbamidomethyl@C \n", - "16 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "17 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "16 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "17 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "18 True Oxidation@M \n", "19 True \n", - "20 True Acetyl@Protein N-term;Oxidation@M \n", - "21 True Acetyl@Protein N-term \n", + "20 True Acetyl@Protein_N-term;Oxidation@M \n", + "21 True Acetyl@Protein_N-term \n", "22 True Oxidation@M;Phospho@S \n", "23 True Oxidation@M;Phospho@T \n", "24 True Oxidation@M \n", @@ -1768,16 +1777,16 @@ "27 True \n", "28 False Oxidation@M;Carbamidomethyl@C \n", "29 False Carbamidomethyl@C \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "32 False Oxidation@M;Carbamidomethyl@C \n", "33 False Oxidation@M;Carbamidomethyl@C \n", "34 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "35 False Carbamidomethyl@C \n", - "36 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "37 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "38 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "39 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "36 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "37 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "38 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... 
\n", + "39 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1822,7 +1831,7 @@ "39 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1836,7 +1845,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1909,7 +1918,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1923,7 +1932,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1965,7 +1974,7 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...\n", + " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t...\n", " 4;0;7;13\n", " 20\n", " xx\n", @@ -1979,7 +1988,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4;7;13\n", " 20\n", " xx\n", @@ -1993,7 +2002,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4;7;13\n", " 20\n", " xx\n", @@ -2007,7 +2016,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4;7;13\n", " 20\n", " xx\n", @@ -2021,7 +2030,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth...\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...\n", " 0;4;7;13\n", " 20\n", " xx\n", @@ -2050,15 +2059,15 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", ".. ... ... \n", - "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t... \n", - "116 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "117 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "118 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "119 False Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth... \n", + "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t... \n", + "116 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "117 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "118 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", + "119 False Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 
\n", "\n", " mod_sites nAA proteins genes labeling_channel \n", "0 1;4 7 xx none \n", @@ -2076,7 +2085,7 @@ "[120 rows x 11 columns]" ] }, - "execution_count": null, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -2085,15 +2094,15 @@ "#| hide\n", "_lib.add_peptide_labeling({\n", " 'none': [], # not labelled for reference\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.precursor_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -2102,7 +2111,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -2229,7 +2238,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;2\n", " 8\n", " 0\n", @@ -2325,7 +2334,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 8\n", " 1\n", @@ -2397,7 +2406,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;3\n", " 9\n", " 0\n", @@ -2421,7 +2430,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 9\n", " 0\n", @@ -2493,7 +2502,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;8;6\n", " 9\n", " 1\n", @@ -2517,7 +2526,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 9\n", " 1\n", @@ -2637,7 +2646,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2661,7 +2670,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2685,7 +2694,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2709,7 +2718,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2791,8 +2800,8 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 402.555023\n", - " 0.994806\n", + " 402.554993\n", + " 0.994805\n", " 30.0\n", " Lumos\n", " 220\n", @@ -2815,7 +2824,7 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 482.206787\n", + " 482.206757\n", " 0.794435\n", " 30.0\n", " Lumos\n", @@ -2829,7 +2838,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2839,7 +2848,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 414.260437\n", + " 414.260406\n", " 1.024166\n", " 30.0\n", " Lumos\n", @@ -2853,7 +2862,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2863,7 +2872,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 470.269653\n", + " 470.269684\n", " 0.775096\n", " 30.0\n", " Lumos\n", @@ -2877,7 +2886,7 @@ " 1\n", " True\n", " True\n", - 
" Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2901,7 +2910,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2911,7 +2920,7 @@ " 0\n", " 0.462864\n", " 0.462864\n", - " 469.226685\n", + " 469.226715\n", " 0.773290\n", " 30.0\n", " Lumos\n", @@ -3162,35 +3171,35 @@ "0 Oxidation@M 2 8 0 \n", "1 8 0 \n", "2 Carbamidomethyl@C 2 8 0 \n", - "3 Acetyl@Protein N-term;Carbamidomethyl@C 0;2 8 0 \n", + "3 Acetyl@Protein_N-term;Carbamidomethyl@C 0;2 8 0 \n", "4 Oxidation@M 6 8 1 \n", "5 8 1 \n", "6 Carbamidomethyl@C 6 8 1 \n", - "7 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 8 1 \n", + "7 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 8 1 \n", "8 Oxidation@M;Carbamidomethyl@C 1;3 9 0 \n", "9 Carbamidomethyl@C 3 9 0 \n", - "10 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", - "11 Acetyl@Protein N-term;Carbamidomethyl@C 0;3 9 0 \n", + "10 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", + "11 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 9 0 \n", "12 Oxidation@M;Carbamidomethyl@C 8;6 9 1 \n", "13 Carbamidomethyl@C 6 9 1 \n", - "14 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", - "15 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 9 1 \n", + "14 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", + "15 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 9 1 \n", "16 Oxidation@M 7 11 0 \n", "17 Oxidation@M 7 11 0 \n", "18 11 0 \n", "19 11 0 \n", - "20 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "21 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "22 Acetyl@Protein N-term 0 11 0 \n", - "23 Acetyl@Protein N-term 0 11 0 \n", + "20 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "21 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "22 Acetyl@Protein_N-term 0 11 0 \n", + "23 Acetyl@Protein_N-term 0 11 0 \n", "24 Oxidation@M 4 11 1 \n", "25 Oxidation@M 4 11 1 \n", "26 11 1 \n", "27 11 1 \n", - "28 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "29 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "30 Acetyl@Protein N-term 0 11 1 \n", - "31 Acetyl@Protein N-term 0 11 1 \n", + "28 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "29 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "30 Acetyl@Protein_N-term 0 11 1 \n", + "31 Acetyl@Protein_N-term 0 11 1 \n", "32 Oxidation@M 6 13 1 \n", "33 Oxidation@M 6 13 1 \n", "34 13 1 \n", @@ -3269,12 +3278,12 @@ "23 468.311920 0.771782 30.0 Lumos 190 200 \n", "24 400.909912 0.990859 30.0 Lumos 200 210 \n", "25 478.989624 0.789230 30.0 Lumos 210 220 \n", - "26 402.555023 0.994806 30.0 Lumos 220 230 \n", - "27 482.206787 0.794435 30.0 Lumos 230 240 \n", - "28 414.260437 1.024166 30.0 Lumos 240 250 \n", - "29 470.269653 0.775096 30.0 Lumos 250 260 \n", + "26 402.554993 0.994805 30.0 Lumos 220 230 \n", + "27 482.206757 0.794435 30.0 Lumos 230 240 \n", + "28 414.260406 1.024166 30.0 Lumos 240 250 \n", + "29 470.269684 0.775096 30.0 Lumos 250 260 \n", "30 417.726074 1.032617 30.0 Lumos 260 270 \n", - "31 469.226685 0.773290 30.0 Lumos 270 280 \n", + "31 469.226715 0.773290 30.0 Lumos 270 280 \n", "32 421.076538 1.041983 30.0 Lumos 280 292 \n", "33 490.627533 0.809400 30.0 Lumos 292 304 \n", "34 423.214233 1.047176 30.0 Lumos 304 316 \n", @@ -3287,7 +3296,7 @@ "[40 rows x 26 columns]" ] }, - "execution_count": null, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -3326,7 +3335,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, 
"outputs": [ { @@ -3381,7 +3390,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term\n", + " Oxidation@M;Dimethyl@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3391,7 +3400,7 @@ " 0\n", " 0.242660\n", " 0.242660\n", - " 345.390839\n", + " 345.390869\n", " 0.850135\n", " 30.0\n", " Lumos\n", @@ -3405,7 +3414,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 0\n", @@ -3429,7 +3438,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3453,7 +3462,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 6;0\n", " 8\n", " 1\n", @@ -3463,7 +3472,7 @@ " 2\n", " 0.040846\n", " 0.040846\n", - " 319.400330\n", + " 319.400391\n", " 0.786163\n", " 30.0\n", " Lumos\n", @@ -3477,7 +3486,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 1\n", @@ -3525,7 +3534,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl@Any N-term;Dimethyl@K\n", + " Dimethyl@Any_N-term;Dimethyl@K\n", " 0;8\n", " 13\n", " 1\n", @@ -3535,8 +3544,8 @@ " 0\n", " 0.620949\n", " 0.620949\n", - " 430.461273\n", - " 1.065108\n", + " 430.461243\n", + " 1.065107\n", " 30.0\n", " Lumos\n", " 692\n", @@ -3549,7 +3558,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3559,7 +3568,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 482.796692\n", + " 482.796661\n", " 0.796481\n", " 30.0\n", " Lumos\n", @@ -3573,7 +3582,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3583,7 +3592,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 428.150757\n", + " 428.150787\n", " 1.059489\n", " 30.0\n", " Lumos\n", @@ -3597,7 +3606,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3621,7 +3630,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3658,17 +3667,17 @@ "79 FGHIKLMNPQRST 0 2 False True \n", "\n", " mods mod_sites nAA decoy \\\n", - "0 Oxidation@M;Dimethyl@Any N-term 2;0 8 0 \n", - "1 Dimethyl:2H(6)13C(2)@Any N-term 0 8 0 \n", - "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 2;0 8 0 \n", - "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 6;0 8 1 \n", - "4 Dimethyl:2H(6)13C(2)@Any N-term 0 8 1 \n", + "0 Oxidation@M;Dimethyl@Any_N-term 2;0 8 0 \n", + "1 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 0 \n", + "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 2;0 8 0 \n", + "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 6;0 8 1 \n", + "4 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 1 \n", ".. ... ... ... ... \n", - "75 Dimethyl@Any N-term;Dimethyl@K 0;8 13 1 \n", - "76 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "77 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "78 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", - "79 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 
0;5 13 0 \n", + "75 Dimethyl@Any_N-term;Dimethyl@K 0;8 13 1 \n", + "76 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "77 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "78 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "79 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", "\n", " charge ... i_5 mono_isotope_idx rt_pred rt_norm_pred \\\n", "0 2 ... 0.001352 0 0.242660 0.242660 \n", @@ -3684,22 +3693,22 @@ "79 3 ... 0.058123 2 0.206957 0.206957 \n", "\n", " ccs_pred mobility_pred nce instrument frag_start_idx frag_stop_idx \n", - "0 345.390839 0.850135 30.0 Lumos 0 7 \n", + "0 345.390869 0.850135 30.0 Lumos 0 7 \n", "1 313.133270 0.770554 30.0 Lumos 7 14 \n", "2 314.302277 0.773615 30.0 Lumos 14 21 \n", - "3 319.400330 0.786163 30.0 Lumos 21 28 \n", + "3 319.400391 0.786163 30.0 Lumos 21 28 \n", "4 320.333069 0.788271 30.0 Lumos 28 35 \n", ".. ... ... ... ... ... ... \n", - "75 430.461273 1.065108 30.0 Lumos 692 704 \n", - "76 482.796692 0.796481 30.0 Lumos 704 716 \n", - "77 428.150757 1.059489 30.0 Lumos 716 728 \n", + "75 430.461243 1.065107 30.0 Lumos 692 704 \n", + "76 482.796661 0.796481 30.0 Lumos 704 716 \n", + "77 428.150787 1.059489 30.0 Lumos 716 728 \n", "78 412.858307 1.021552 30.0 Lumos 728 740 \n", "79 478.660187 0.789583 30.0 Lumos 740 752 \n", "\n", "[80 rows x 27 columns]" ] }, - "execution_count": null, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3707,8 +3716,8 @@ "source": [ "_lib.import_and_process_protein_dict(protein_dict)\n", "_lib.add_peptide_labeling({\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.predict_all()\n", "assert (_lib.precursor_df.decoy==1).any()\n", @@ -3732,6 +3741,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb index 80dda52d..a43bcad3 100644 --- a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb +++ b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -27,16 +27,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.psm_frag_reader.maxquant_frag_reader import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -48,9 +56,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " lambda x: parse_phos_probs(x[0], x[1], prob), axis=1\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.34720501 0.54503546 0.14126802 0.17500845 0.1020231\n", + " 0.04637072 0. 0. 0.01899846 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0.02471942 0.41737406 0.67116171 1. 0.37160414 0.59517672\n", + " 0.54813229 0. 0.0606665 0.03838788 0.03735192]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.04495926 0.0213509 0.02114326 0.01335259 0.\n", + " 0. 0. 0. 0. 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0.51698907 0.87869409 0.14043304 0.1052603\n", + " 0.19786873 0. 0. 0. 0. 0.\n", + " 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0. 0.54449196 0.2230503 0.\n", + " 0.30967216 0. 0. 0. 0. 0.\n", + " 0. 
]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAcetyl@Protein N-term\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m0\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 20\u001b[0m seq \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAAAGPSNSSSGTSTPR\u001b[39m\u001b[38;5;124m'\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], "source": [ "#| hide\n", "mq_str = '''Raw file\tScan number\tScan index\tSequence\tLength\tMissed cleavages\tModifications\tModified sequence\tPhospho (STY) Probabilities\tPhospho (STY) Score Diffs\tAcetyl (Protein N-term)\tPhospho (STY)\tProteins\tGene Names\tProtein Names\tCharge\tFragmentation\tMass analyzer\tType\tScan event number\tIsotope index\tm/z\tMass\tMass Error [ppm]\tSimple Mass Error [ppm]\tRetention time\tPEP\tScore\tDelta score\tScore diff\tLocalization prob\tCombinatorics\tPIF\tFraction of total spectrum\tBase peak fraction\tPrecursor Full ScanNumber\tPrecursor Intensity\tPrecursor Apex Fraction\tPrecursor Apex Offset\tPrecursor Apex Offset Time\tDiagnostic peak Phospho (STY) Y\tMatches\tIntensities\tMass Deviations [Da]\tMass Deviations [ppm]\tMasses\tNumber of Matches\tIntensity coverage\tPeak coverage\tNeutral loss level\tETD identification type\tReverse\tAll scores\tAll sequences\tAll modified sequences\tid\tProtein group IDs\tPeptide ID\tMod. 
peptide ID\tEvidence ID\tPhospho (STY) site IDs\n", @@ -69,7 +114,7 @@ "assert 'frag_stop_idx' in mq_reader.psm_df.columns\n", "assert mq_reader.psm_df.mods.values[0] == ''\n", "assert mq_reader.psm_df.mod_sites.values[0] == ''\n", - "assert mq_reader.psm_df.mods.values[1] == 'Acetyl@Protein N-term'\n", + "assert mq_reader.psm_df.mods.values[1] in ('Acetyl@Protein_N-term', 'Acetyl@Protein N-term')\n", "assert mq_reader.psm_df.mod_sites.values[1] == '0'\n", "seq = 'AAAGPSNSSSGTSTPR'\n", "frag_types = raw_df[raw_df['Sequence']==seq]['Matches'].values[0].split(';')\n", @@ -496,6 +541,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/spec_lib/library_factory.ipynb b/nbs_tests/spec_lib/library_factory.ipynb index 9e96cffe..bcf70221 100644 --- a/nbs_tests/spec_lib/library_factory.ipynb +++ b/nbs_tests/spec_lib/library_factory.ipynb @@ -23,6 +23,15 @@ "Factory classes to predict libraries from different sources (input file format)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/predict_lib.ipynb b/nbs_tests/spec_lib/predict_lib.ipynb index 7fa38264..55faffad 100644 --- a/nbs_tests/spec_lib/predict_lib.ipynb +++ b/nbs_tests/spec_lib/predict_lib.ipynb @@ -33,6 +33,15 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/test_translate_tsv.ipynb b/nbs_tests/spec_lib/test_translate_tsv.ipynb index 514e2c9b..b9658a39 100644 --- a/nbs_tests/spec_lib/test_translate_tsv.ipynb +++ b/nbs_tests/spec_lib/test_translate_tsv.ipynb @@ -138,7 +138,7 @@ "charged_frag_types = ['b_z1','y_z1','y_modloss_z1']\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['ASGHCEWMKYR']*repeat+['ASGHCEWMAAR'],\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", " 'mod_sites': ['0;4;8']*repeat+[''],\n", " 'nAA': 11,\n", " 'NCE': 20,\n", diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index 160a2c72..ecf92fb8 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -21,7 +21,7 @@ def __init__( precursor_charge_max: int = 4, precursor_mz_min: float = 400.0, precursor_mz_max: float = 1800.0, - var_mods: list = ["Acetyl@Protein N-term", "Oxidation@M"], + var_mods: list = ["Acetyl@Protein_N-term", "Oxidation@M"], min_var_mod_num: int = 0, max_var_mod_num: int = 2, fix_mods: list = ["Carbamidomethyl@C"], diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 6edc016c..c383dfc9 100644 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -1,2 +1,6 @@ -INCLUDED_NBS=$(find ../nbs_tests -name "*.ipynb") -python -m pytest --nbmake $(echo $INCLUDED_NBS) +TEST_NBS=$(find ../nbs_tests -name "*.ipynb") +TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb") + +ALL_NBS=$(echo $TEST_NBS$'\n'$TUTORIAL_NBS) + +python -m pytest --nbmake $(echo 
$ALL_NBS)