Merge pull request #13 from wilhelm-lab/ms2ai-integration
Ms2ai integration - Retention Time only
omsh authored Oct 29, 2022
2 parents bf71b1d + 0c857ae commit 1a72430
Showing 15 changed files with 251 additions and 107 deletions.
12 changes: 11 additions & 1 deletion .gitignore
@@ -149,4 +149,14 @@ notebooks/wandb
/notebooks/data*

# local to do file if exists :)
todo.txt
todo.txt

.DS_Store

# model checkpoints in the run scripts directory
run_scripts/checkpoint*
run_scripts/*.index
run_scripts/*.data-*

# testing metadata
metadata.parquet
2 changes: 1 addition & 1 deletion dlomix/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.0.2dev1"
__version__ = "0.0.3"

META_DATA = {
"author": "Omar Shouman",
9 changes: 6 additions & 3 deletions dlomix/constants.py
@@ -1,3 +1,5 @@
DEFAULT_PARQUET_ENGINE = "pyarrow"

retention_time_pipeline_parameters = {
"model_params": {"seq_length": 30},
"data_params": {
@@ -8,10 +10,11 @@
"trained_model_stats": [0.0, 1.0],
}

retention_time_pipeline_parameters.update(
retention_time_pipeline_parameters.update(
{
"trained_model_url":
"https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop" + retention_time_pipeline_parameters['trained_model_path'].strip("..") + retention_time_pipeline_parameters['trained_model_zipfile_name']
"trained_model_url": "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop"
+ retention_time_pipeline_parameters["trained_model_path"].strip("..")
+ retention_time_pipeline_parameters["trained_model_zipfile_name"]
}
)

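Aside from black-style formatting, the update() call is unchanged: it builds trained_model_url by concatenating the GitHub raw-content base URL with the model path and zip file name already present in the dictionary. A minimal standalone sketch of that concatenation, using hypothetical values for the two keys (their real values sit in the collapsed part of the file):

# Sketch only: the "trained_model_path" and "trained_model_zipfile_name" values are assumptions.
params = {
    "trained_model_path": "../pretrained_models/",   # hypothetical
    "trained_model_zipfile_name": "rt_model.zip",    # hypothetical
}

# str.strip("..") removes leading/trailing '.' characters, so "../x/" becomes "/x/"
params["trained_model_url"] = (
    "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop"
    + params["trained_model_path"].strip("..")
    + params["trained_model_zipfile_name"]
)

print(params["trained_model_url"])
# https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/pretrained_models/rt_model.zip
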
60 changes: 35 additions & 25 deletions dlomix/data/IntensityDataset.py
@@ -107,7 +107,6 @@ def __init__(
self.precursor_charge = None
self.intensities = None


self.features_df = None
self.example_id = None

@@ -174,13 +173,17 @@ def load_data(self, data):

def _read_data(self):
if isinstance(self.data_source, tuple):
tuple_size_is_three_or_four = len(self.data_source) == 3 or len(self.data_source) == 4
tuple_size_is_three_or_four = (
len(self.data_source) == 3 or len(self.data_source) == 4
)
if tuple_size_is_three_or_four:
tuple_elements_are_ndarray = all([isinstance(x, np.ndarray) for x in self.data_source])
tuple_elements_are_ndarray = all(
[isinstance(x, np.ndarray) for x in self.data_source]
)
if tuple_elements_are_ndarray:
self.sequences = self.data_source[0]
self.collision_energy = self.data_source[1]
self.precursor_charge = self.data_source[2]
self.precursor_charge = self.data_source[2]
if len(self.data_source) == 4:
self.intensities = self.data_source[3]
self.no_intensities = False
@@ -208,14 +211,13 @@ def _read_data(self):
self.precursor_charge = df[self.precursor_charge_col]
self.intensities = df[self.intensities_col]


# parse strings into lists, for precursor charge and intensities
if isinstance(self.precursor_charge.iloc[0], str):
self.precursor_charge = self.precursor_charge.apply(eval)

if isinstance(self.intensities.iloc[0], str):
self.intensities = self.intensities.apply(eval)

# get numpy arrays with .values() for all inputs and intensities

self.sequences = self.sequences.values
@@ -225,8 +227,12 @@

print(type(self.precursor_charge))
print(self.precursor_charge)
self.precursor_charge = convert_nested_list_to_numpy_array(self.precursor_charge.values, dtype=np.float64)
self.intensities = convert_nested_list_to_numpy_array(self.intensities.values)
self.precursor_charge = convert_nested_list_to_numpy_array(
self.precursor_charge.values, dtype=np.float64
)
self.intensities = convert_nested_list_to_numpy_array(
self.intensities.values
)

self.features_df = df[self.feature_cols]
else:
@@ -237,24 +243,25 @@

# give the index of the element as an ID for later reference if needed
self.example_id = list(range(len(self.sequences)))





def _validate_remove_long_sequences(self) -> None:
"""
Validate if all sequences are shorter than the padding length, otherwise drop them.
"""
assert self.sequences.shape[0] > 0, "No sequences in the provided data."


# check if count of examples matches for all provided inputs
lengths = [len(self.sequences), len(self.collision_energy), len(self.precursor_charge)]
lengths = [
len(self.sequences),
len(self.collision_energy),
len(self.precursor_charge),
]
if not self.no_intensities:
lengths = lengths + [len(self.intensities)]

assert np.all(lengths == np.array(lengths[0])), "Count of examples does not match for sequences and targets."

assert np.all(
lengths == np.array(lengths[0])
), "Count of examples does not match for sequences and targets."

limit = self.seq_length
vectorized_len = np.vectorize(lambda x: len(x))
@@ -288,20 +295,19 @@ def _build_tf_dataset(self):
self.sequences[self.indicies_dict[split]],
self.collision_energy[self.indicies_dict[split]],
self.precursor_charge[self.indicies_dict[split]],
self.intensities[self.indicies_dict[split]]
self.intensities[self.indicies_dict[split]],
)
)

def _preprocess_tf_dataset(self):
# ToDo: convert input to dict and assume this as the general case --> abstract out in parent class



for split in self.tf_dataset.keys():
self.tf_dataset[split] = (
self.tf_dataset[split]
.map(
IntensityDataset._convert_inputs_to_dict,
num_parallel_calls=tf.data.AUTOTUNE,
IntensityDataset._convert_inputs_to_dict,
num_parallel_calls=tf.data.AUTOTUNE,
)
.map(
lambda i, t: self._split_sequence(i, t),
@@ -357,10 +363,10 @@ def _normalize_target(self, seq, target):
)
return seq, target

def _split_sequence(self, inputs, target):
def _split_sequence(self, inputs, target):

inputs["sequence"] = tf.strings.bytes_split(inputs["sequence"])

return inputs, target

"""
@@ -369,7 +375,11 @@ def _split_sequence(self, inputs, target):

@staticmethod
def _convert_inputs_to_dict(seq, collision, precursor, target):
inputs_dict = {"sequence": seq, "collision_energy": collision, "precursor_charge": precursor}
inputs_dict = {
"sequence": seq,
"collision_energy": collision,
"precursor_charge": precursor,
}
return inputs_dict, target

def _generate_single_counts(self, inputs, target):
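
In the pipeline above, each (sequence, collision_energy, precursor_charge, intensities) element is first mapped into a dict of named inputs and the sequence string is then split into characters. A self-contained sketch of those two map steps with toy tensors (the values are illustrative, not real dataset entries):

import tensorflow as tf

# Toy single example (illustrative values only).
example = (
    tf.constant("ACDEK"),                          # sequence
    tf.constant(0.25),                             # collision energy
    tf.constant([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]),   # one-hot precursor charge
    tf.constant([0.1, 0.0, 0.7]),                  # target intensities
)
ds = tf.data.Dataset.from_tensors(example)

def convert_inputs_to_dict(seq, collision, precursor, target):
    # mirrors IntensityDataset._convert_inputs_to_dict above
    inputs = {"sequence": seq, "collision_energy": collision, "precursor_charge": precursor}
    return inputs, target

def split_sequence(inputs, target):
    # mirrors IntensityDataset._split_sequence: sequence string -> per-character tensor
    inputs["sequence"] = tf.strings.bytes_split(inputs["sequence"])
    return inputs, target

ds = ds.map(convert_inputs_to_dict, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.map(split_sequence, num_parallel_calls=tf.data.AUTOTUNE)

for inputs, target in ds:
    print(inputs["sequence"].numpy())  # [b'A' b'C' b'D' b'E' b'K']
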
100 changes: 97 additions & 3 deletions dlomix/data/RetentionTimeDataset.py
@@ -1,6 +1,8 @@
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from dlomix.constants import DEFAULT_PARQUET_ENGINE

"""
TODO: check if it is better to abstract out a generic class for TF dataset wrapper, including:
@@ -49,6 +51,9 @@ class RetentionTimeDataset:
BATCHES_TO_PREFETCH = tf.data.AUTOTUNE

SAMPLE_RUN_N = 100
METADATA_KEY = "metadata"
PARAMS_KEY = "parameters"
TARGET_NAME_KEY = "target_column_key"

# TODO: For test dataset --> examples with longer sequences --> do not drop, add NaN for prediction

@@ -168,6 +173,9 @@ def load_data(self, data):
"""

def _read_data(self):
if isinstance(self.data_source, dict):
self._update_data_loading_for_json_format()

if isinstance(self.data_source, tuple):
tuple_size_is_two = len(self.data_source) == 2
if tuple_size_is_two:
@@ -187,8 +195,13 @@ def _read_data(self):
self.targets = np.zeros(self.sequences.shape[0])
self._data_mean, self._data_std = 0, 1

elif isinstance(self.data_source, str):
df = pd.read_csv(self.data_source)
elif isinstance(self.data_source, (str, dict)):
if isinstance(self.data_source, dict):
# a dict is passed via the json
df = pd.DataFrame(self.data_source)
else:
# a string path is passed via the json or as a constructor argument
df = self._resolve_string_data_path()

# used only for testing with a smaller sample from a csv file
if self.sample_run:
@@ -209,12 +222,47 @@
else:
raise ValueError(
"Data source has to be either a tuple of two numpy arrays, a single numpy array, "
"or a string path to a csv file."
"or a string with a path to a csv/parquet/json file."
)

# give the index of the element as an ID for later reference if needed
self.example_id = list(range(len(self.sequences)))

def _update_data_loading_for_json_format(self):
json_dict = self.data_source

self.data_source = json_dict.get(RetentionTimeDataset.METADATA_KEY, "")
self.target_col = json_dict.get(RetentionTimeDataset.PARAMS_KEY, {}).get(
RetentionTimeDataset.TARGET_NAME_KEY, self.target_col
)
# ToDo: make dynamic based on parameters
self.sequence_col = "modified_sequence"

def _resolve_string_data_path(self):
is_json_file = self.data_source.endswith(".json")

if is_json_file:
json_dict = read_json_file(self.data_source)
self._update_data_loading_for_json_format(json_dict)

is_parquet_url = ".parquet" in self.data_source and self.data_source.startswith(
"http"
)
is_parquet_file = self.data_source.endswith(".parquet")
is_csv_file = self.data_source.endswith(".csv")

if is_parquet_url or is_parquet_file:
df = read_parquet_file_pandas(self.data_source, DEFAULT_PARQUET_ENGINE)
return df
elif is_csv_file:
df = pd.read_csv(self.data_source)
return df
else:
raise ValueError(
"Invalid data source provided as a string, please provide a path to a csv, parquet, or "
"or a json file."
)

def _validate_remove_long_sequences(self) -> None:
"""
Validate if all sequences are shorter than the padding length, otherwise drop them.
@@ -409,3 +457,49 @@ def data_mean(self, value):
@data_std.setter
def data_std(self, value):
self._data_std = value


# to go to reader classes or reader utils


def read_parquet_file_pandas(filepath, parquet_engine):
try:
df = pd.read_parquet(filepath, engine=parquet_engine)
except ImportError:
raise ImportError(
"Parquet engine is missing, please install fastparquet using pip or conda."
)
return df


def read_json_file(filepath):
with open(filepath, "r") as j:
json_dict = json.loads(j.read())
return json_dict


if __name__ == "__main__":
test_data_dict = {
"metadata": {
"linear rt": [1, 2, 3],
"modified_sequence": ["ABC", "ABC", "ABC"],
},
"annotations": {},
"parameters": {"target_column_key": "linear rt"},
}

pd.DataFrame(test_data_dict["metadata"]).to_parquet("metadata.parquet")

test_data_dict_file = {
"metadata": "metadata.parquet",
"annotations": {},
"parameters": {"target_column_key": "linear rt"},
}

rtdataset = RetentionTimeDataset(data_source=test_data_dict, seq_length=20)
print(rtdataset.sequences)
print(rtdataset.targets)

rtdataset = RetentionTimeDataset(data_source=test_data_dict_file, seq_length=20)
print(rtdataset.sequences)
print(rtdataset.targets)
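
With these additions, data_source for RetentionTimeDataset can be a numpy tuple, a csv/parquet path, or an MS2AI-style dict/json whose "metadata" entry carries the examples and whose "parameters" entry names the target column (the __main__ block above exercises the dict variants). A minimal sketch of the json-file variant, with hypothetical file names; the constructor call is left commented out since it depends on the rest of the class:

import json
import pandas as pd

# Hypothetical files; key names follow METADATA_KEY, PARAMS_KEY and TARGET_NAME_KEY above.
pd.DataFrame(
    {"modified_sequence": ["ABC", "ABCD"], "linear rt": [10.2, 11.5]}
).to_parquet("metadata.parquet")

config = {
    "metadata": "metadata.parquet",                      # where the examples live
    "annotations": {},
    "parameters": {"target_column_key": "linear rt"},    # which column is the target
}
with open("dataset_config.json", "w") as f:
    json.dump(config, f)

# Intended usage (sketch):
# rtdataset = RetentionTimeDataset(data_source="dataset_config.json", seq_length=20)
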
2 changes: 1 addition & 1 deletion dlomix/data/__init__.py
@@ -1,4 +1,4 @@
from .RetentionTimeDataset import *
from .IntensityDataset import *

__all__ = ['RetentionTimeDataset', 'IntensityDataset']
__all__ = ["RetentionTimeDataset", "IntensityDataset"]
28 changes: 14 additions & 14 deletions dlomix/layers/attention.py
@@ -4,22 +4,22 @@


class DecoderAttentionLayer(tf.keras.layers.Layer):
def __init__(self, time_steps):
super(DecoderAttentionLayer, self).__init__()
self.time_steps = time_steps
def __init__(self, time_steps):
super(DecoderAttentionLayer, self).__init__()
self.time_steps = time_steps

def build(self, input_shape):
self.permute = tf.keras.layers.Permute((2, 1))
self.dense = tf.keras.layers.Dense(self.time_steps, activation='softmax')
self.multiply = tf.keras.layers.Multiply()
def build(self, input_shape):
self.permute = tf.keras.layers.Permute((2, 1))
self.dense = tf.keras.layers.Dense(self.time_steps, activation="softmax")
self.multiply = tf.keras.layers.Multiply()

def call(self, inputs):
x = self.permute(inputs)
x = self.dense(x)
x = self.permute(x)
x = self.multiply([inputs, x])
return x

def call(self, inputs):
x = self.permute(inputs)
x = self.dense(x)
x = self.permute(x)
x = self.multiply([inputs, x])
return x


class AttentionLayer(tf.keras.layers.Layer):
def __init__(
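
The attention layer changes are purely black-style reformatting; behaviour is identical. For orientation, a minimal sketch of DecoderAttentionLayer applied to a (batch, time_steps, features) tensor, with illustrative shapes:

import tensorflow as tf
from dlomix.layers.attention import DecoderAttentionLayer

x = tf.random.normal((2, 30, 16))   # 2 examples, 30 time steps, 16 features (illustrative)

layer = DecoderAttentionLayer(time_steps=30)
weighted = layer(x)

# The layer permutes to (batch, features, time_steps), applies a softmax Dense over
# the time axis, permutes back, and multiplies element-wise with the input,
# so the output keeps the input shape.
print(weighted.shape)  # (2, 30, 16)
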
2 changes: 1 addition & 1 deletion dlomix/losses/__init__.py
@@ -1,3 +1,3 @@
from .intensity import masked_spectral_distance

__all__ =[masked_spectral_distance]
__all__ = [masked_spectral_distance]
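
Only the spacing around __all__ changed here; masked_spectral_distance itself is untouched. A typical compile-time usage, sketched with a placeholder Keras model (the model definition is hypothetical):

import tensorflow as tf
from dlomix.losses import masked_spectral_distance

# Placeholder model predicting an intensity vector; input and output sizes are hypothetical.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(64,)),
    tf.keras.layers.Dense(174),
])
model.compile(optimizer="adam", loss=masked_spectral_distance)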