wilhelm-lab · finnkap · May 24, 2024 · May 24, 2024 · May 24, 2024 · May 31, 2024
diff --git a/baseline_model/Prosit_baseline_model.keras b/baseline_model/Prosit_baseline_model.keras
diff --git a/notebooks/Example_automatic_refinement_transfer_learning.ipynb b/notebooks/Example_automatic_refinement_transfer_learning.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = '-1'\n",
+    "os.environ['HF_HOME'] = '/cmnfs/proj/bmpc_dlomix/datasets'\n",
+    "os.environ['HF_DATASETS_CACHE'] = '/cmnfs/proj/bmpc_dlomix/datasets/hf_cache'\n",
+    "\n",
+    "num_proc = 16\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = str(num_proc)\n",
+    "os.environ[\"TF_NUM_INTRAOP_THREADS\"] = str(num_proc)\n",
+    "os.environ[\"TF_NUM_INTEROP_THREADS\"] = str(num_proc)\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "tf.config.threading.set_inter_op_parallelism_threads(num_proc)\n",
+    "tf.config.threading.set_intra_op_parallelism_threads(num_proc)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dlomix.data import load_processed_dataset\n",
+    "\n",
+    "dataset = load_processed_dataset('/cmnfs/proj/bmpc_dlomix/datasets/processed/ptm_baseline_small_cleaned_bs1024')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dlomix.models import PrositIntensityPredictor\n",
+    "from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance\n",
+    "\n",
+    "model = tf.keras.models.load_model('/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/7ef3360f-2349-46c0-a905-01187d4899e2.keras')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dlomix.refinement_transfer_learning.automatic_rl_tl import AutomaticRlTlTraining, AutomaticRlTlTrainingConfig\n",
+    "\n",
+    "config = AutomaticRlTlTrainingConfig(\n",
+    "    dataset=dataset,\n",
+    "    baseline_model=model,\n",
+    "    use_wandb=True\n",
+    ")\n",
+    "\n",
+    "trainer = AutomaticRlTlTraining(config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "new_model = trainer.train()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
     packages=setuptools.find_packages(where="src"),
     package_dir={"": "src"},
     include_package_data=True,
-    package_data={"": ["data/processing/pickled_feature_dicts/*"]},
+    package_data={"": ["data/processing/pickled_feature_dicts/*", "prosit_baseline_model.txt", "refinement_transfer_learning/user_report.ipynb"]},
     install_requires=[
         "datasets",
         "fpdf",
@@ -45,6 +45,10 @@
         "wandb": [
             "wandb >= 0.15",
         ],
+        "rltl-report": [
+            "nbconvert",
+            "ipykernel"
+        ]
     },
     classifiers=[
         "Programming Language :: Python :: 3",

diff --git a/src/dlomix/data/charge_state.py b/src/dlomix/data/charge_state.py
@@ -42,6 +42,8 @@ def __init__(
         sequence_column: str = "modified_sequence",
         label_column: str = "most_abundant_charge_by_count",
         val_ratio: float = 0.2,
+        test_ratio: float = 0.2,
+        advanced_splitting: bool = False,
         max_seq_len: Union[int, str] = 30,
         dataset_type: str = "tf",
         batch_size: int = 256,
@@ -59,6 +61,8 @@ def __init__(
         auto_cleanup_cache: bool = True,
         num_proc: Optional[int] = None,
         batch_processing_size: int = 1000,
+        inference_only: bool = False,
+        ion_types: Optional[List[str]] = None,
     ):
         super().__init__(
             data_source,
@@ -68,6 +72,8 @@ def __init__(
             sequence_column,
             label_column,
             val_ratio,
+            test_ratio,
+            advanced_splitting,
             max_seq_len,
             dataset_type,
             batch_size,
@@ -85,4 +91,6 @@ def __init__(
             auto_cleanup_cache,
             num_proc,
             batch_processing_size,
+            inference_only,
+            ion_types,
         )