Merge pull request #13 from wilhelm-lab/ms2ai-integration
Ms2ai integration - Retention Time only
omsh authored Oct 29, 2022
2 parents bf71b1d + 0c857ae commit 1a72430
Showing 15 changed files with 251 additions and 107 deletions.
12 changes: 11 additions & 1 deletion .gitignore
@@ -149,4 +149,14 @@ notebooks/wandb
/notebooks/data*

# local to do file if exists :)
todo.txt
todo.txt

.DS_Store

# model checkpoints in the run scripts directory
run_scripts/checkpoint*
run_scripts/*.index
run_scripts/*.data-*

# testing metadata
metadata.parquet
2 changes: 1 addition & 1 deletion dlomix/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.0.2dev1"
__version__ = "0.0.3"

META_DATA = {
"author": "Omar Shouman",
9 changes: 6 additions & 3 deletions dlomix/constants.py
@@ -1,3 +1,5 @@
DEFAULT_PARQUET_ENGINE = "pyarrow"

retention_time_pipeline_parameters = {
"model_params": {"seq_length": 30},
"data_params": {
@@ -8,10 +10,11 @@
"trained_model_stats": [0.0, 1.0],
}

retention_time_pipeline_parameters.update(
retention_time_pipeline_parameters.update(
{
"trained_model_url":
"https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop" + retention_time_pipeline_parameters['trained_model_path'].strip("..") + retention_time_pipeline_parameters['trained_model_zipfile_name']
"trained_model_url": "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop"
+ retention_time_pipeline_parameters["trained_model_path"].strip("..")
+ retention_time_pipeline_parameters["trained_model_zipfile_name"]
}
)

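Aside from black-style formatting, the update() call is unchanged: it builds trained_model_url by concatenating the GitHub raw-content base URL with the model path and zip file name already present in the dictionary. A minimal standalone sketch of that concatenation, using hypothetical values for the two keys (their real values sit in the collapsed part of the file):

# Sketch only: the "trained_model_path" and "trained_model_zipfile_name" values are assumptions.
params = {
    "trained_model_path": "../pretrained_models/",   # hypothetical
    "trained_model_zipfile_name": "rt_model.zip",    # hypothetical
}

# str.strip("..") removes leading/trailing '.' characters, so "../x/" becomes "/x/"
params["trained_model_url"] = (
    "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop"
    + params["trained_model_path"].strip("..")
    + params["trained_model_zipfile_name"]
)

print(params["trained_model_url"])
# https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/pretrained_models/rt_model.zip
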
60 changes: 35 additions & 25 deletions dlomix/data/IntensityDataset.py
@@ -107,7 +107,6 @@ def __init__(
self.precursor_charge = None
self.intensities = None


self.features_df = None
self.example_id = None

@@ -174,13 +173,17 @@ def load_data(self, data):

def _read_data(self):
if isinstance(self.data_source, tuple):
tuple_size_is_three_or_four = len(self.data_source) == 3 or len(self.data_source) == 4
tuple_size_is_three_or_four = (
len(self.data_source) == 3 or len(self.data_source) == 4
)
if tuple_size_is_three_or_four:
tuple_elements_are_ndarray = all([isinstance(x, np.ndarray) for x in self.data_source])
tuple_elements_are_ndarray = all(
[isinstance(x, np.ndarray) for x in self.data_source]
)
if tuple_elements_are_ndarray:
self.sequences = self.data_source[0]
self.collision_energy = self.data_source[1]
self.precursor_charge = self.data_source[2]
self.precursor_charge = self.data_source[2]
if len(self.data_source) == 4:
self.intensities = self.data_source[3]
self.no_intensities = False
@@ -208,14 +211,13 @@ def _read_data(self):
self.precursor_charge = df[self.precursor_charge_col]
self.intensities = df[self.intensities_col]


# parse strings into lists, for precursor charge and intensities
if isinstance(self.precursor_charge.iloc[0], str):
self.precursor_charge = self.precursor_charge.apply(eval)

if isinstance(self.intensities.iloc[0], str):
self.intensities = self.intensities.apply(eval)

# get numpy arrays with .values() for all inputs and intensities

self.sequences = self.sequences.values
@@ -225,8 +227,12 @@

print(type(self.precursor_charge))
print(self.precursor_charge)
self.precursor_charge = convert_nested_list_to_numpy_array(self.precursor_charge.values, dtype=np.float64)
self.intensities = convert_nested_list_to_numpy_array(self.intensities.values)
self.precursor_charge = convert_nested_list_to_numpy_array(
self.precursor_charge.values, dtype=np.float64
)
self.intensities = convert_nested_list_to_numpy_array(
self.intensities.values
)

self.features_df = df[self.feature_cols]
else:
@@ -237,24 +243,25 @@

# give the index of the element as an ID for later reference if needed
self.example_id = list(range(len(self.sequences)))





def _validate_remove_long_sequences(self) -> None:
"""
Validate if all sequences are shorter than the padding length, otherwise drop them.
"""
assert self.sequences.shape[0] > 0, "No sequences in the provided data."


# check if count of examples matches for all provided inputs
lengths = [len(self.sequences), len(self.collision_energy), len(self.precursor_charge)]
lengths = [
len(self.sequences),
len(self.collision_energy),
len(self.precursor_charge),
]
if not self.no_intensities:
lengths = lengths + [len(self.intensities)]

assert np.all(lengths == np.array(lengths[0])), "Count of examples does not match for sequences and targets."

assert np.all(
lengths == np.array(lengths[0])
), "Count of examples does not match for sequences and targets."

limit = self.seq_length
vectorized_len = np.vectorize(lambda x: len(x))
@@ -288,20 +295,19 @@ def _build_tf_dataset(self):
self.sequences[self.indicies_dict[split]],
self.collision_energy[self.indicies_dict[split]],
self.precursor_charge[self.indicies_dict[split]],
self.intensities[self.indicies_dict[split]]
self.intensities[self.indicies_dict[split]],
)
)

def _preprocess_tf_dataset(self):
# ToDo: convert input to dict and assume this as the general case --> abstract out in parent class



for split in self.tf_dataset.keys():
self.tf_dataset[split] = (
self.tf_dataset[split]
.map(
IntensityDataset._convert_inputs_to_dict,
num_parallel_calls=tf.data.AUTOTUNE,
IntensityDataset._convert_inputs_to_dict,
num_parallel_calls=tf.data.AUTOTUNE,
)
.map(
lambda i, t: self._split_sequence(i, t),
@@ -357,10 +363,10 @@ def _normalize_target(self, seq, target):
)
return seq, target

def _split_sequence(self, inputs, target):
def _split_sequence(self, inputs, target):

inputs["sequence"] = tf.strings.bytes_split(inputs["sequence"])

return inputs, target

"""
@@ -369,7 +375,11 @@ def _split_sequence(self, inputs, target):

@staticmethod
def _convert_inputs_to_dict(seq, collision, precursor, target):
inputs_dict = {"sequence": seq, "collision_energy": collision, "precursor_charge": precursor}
inputs_dict = {
"sequence": seq,
"collision_energy": collision,
"precursor_charge": precursor,
}
return inputs_dict, target

def _generate_single_counts(self, inputs, target):
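
In the pipeline above, each (sequence, collision_energy, precursor_charge, intensities) element is first mapped into a dict of named inputs and the sequence string is then split into characters. A self-contained sketch of those two map steps with toy tensors (the values are illustrative, not real dataset entries):

import tensorflow as tf

# Toy single example (illustrative values only).
example = (
    tf.constant("ACDEK"),                          # sequence
    tf.constant(0.25),                             # collision energy
    tf.constant([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]),   # one-hot precursor charge
    tf.constant([0.1, 0.0, 0.7]),                  # target intensities
)
ds = tf.data.Dataset.from_tensors(example)

def convert_inputs_to_dict(seq, collision, precursor, target):
    # mirrors IntensityDataset._convert_inputs_to_dict above
    inputs = {"sequence": seq, "collision_energy": collision, "precursor_charge": precursor}
    return inputs, target

def split_sequence(inputs, target):
    # mirrors IntensityDataset._split_sequence: sequence string -> per-character tensor
    inputs["sequence"] = tf.strings.bytes_split(inputs["sequence"])
    return inputs, target

ds = ds.map(convert_inputs_to_dict, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.map(split_sequence, num_parallel_calls=tf.data.AUTOTUNE)

for inputs, target in ds:
    print(inputs["sequence"].numpy())  # [b'A' b'C' b'D' b'E' b'K']
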
100 changes: 97 additions & 3 deletions dlomix/data/RetentionTimeDataset.py
@@ -1,6 +1,8 @@
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from dlomix.constants import DEFAULT_PARQUET_ENGINE

"""
TODO: check if it is better to abstract out a generic class for TF dataset wrapper, including:
@@ -49,6 +51,9 @@ class RetentionTimeDataset:
BATCHES_TO_PREFETCH = tf.data.AUTOTUNE

SAMPLE_RUN_N = 100
METADATA_KEY = "metadata"
PARAMS_KEY = "parameters"
TARGET_NAME_KEY = "target_column_key"

# TODO: For test dataset --> examples with longer sequences --> do not drop, add NaN for prediction

@@ -168,6 +173,9 @@ def load_data(self, data):
"""

def _read_data(self):
if isinstance(self.data_source, dict):
self._update_data_loading_for_json_format()

if isinstance(self.data_source, tuple):
tuple_size_is_two = len(self.data_source) == 2
if tuple_size_is_two:
@@ -187,8 +195,13 @@ def _read_data(self):
self.targets = np.zeros(self.sequences.shape[0])
self._data_mean, self._data_std = 0, 1

elif isinstance(self.data_source, str):
df = pd.read_csv(self.data_source)
elif isinstance(self.data_source, (str, dict)):
if isinstance(self.data_source, dict):
# a dict is passed via the json
df = pd.DataFrame(self.data_source)
else:
# a string path is passed via the json or as a constructor argument
df = self._resolve_string_data_path()

# used only for testing with a smaller sample from a csv file
if self.sample_run:
@@ -209,12 +222,47 @@
else:
raise ValueError(
"Data source has to be either a tuple of two numpy arrays, a single numpy array, "
"or a string path to a csv file."
"or a string with a path to a csv/parquet/json file."
)

# give the index of the element as an ID for later reference if needed
self.example_id = list(range(len(self.sequences)))

def _update_data_loading_for_json_format(self):
json_dict = self.data_source

self.data_source = json_dict.get(RetentionTimeDataset.METADATA_KEY, "")
self.target_col = json_dict.get(RetentionTimeDataset.PARAMS_KEY, {}).get(
RetentionTimeDataset.TARGET_NAME_KEY, self.target_col
)
# ToDo: make dynamic based on parameters
self.sequence_col = "modified_sequence"

def _resolve_string_data_path(self):
is_json_file = self.data_source.endswith(".json")

if is_json_file:
json_dict = read_json_file(self.data_source)
self._update_data_loading_for_json_format(json_dict)

is_parquet_url = ".parquet" in self.data_source and self.data_source.startswith(
"http"
)
is_parquet_file = self.data_source.endswith(".parquet")
is_csv_file = self.data_source.endswith(".csv")

if is_parquet_url or is_parquet_file:
df = read_parquet_file_pandas(self.data_source, DEFAULT_PARQUET_ENGINE)
return df
elif is_csv_file:
df = pd.read_csv(self.data_source)
return df
else:
raise ValueError(
"Invalid data source provided as a string, please provide a path to a csv, parquet, or "
"or a json file."
)

def _validate_remove_long_sequences(self) -> None:
"""
Validate if all sequences are shorter than the padding length, otherwise drop them.
@@ -409,3 +457,49 @@ def data_mean(self, value):
@data_std.setter
def data_std(self, value):
self._data_std = value


# to go to reader classes or reader utils


def read_parquet_file_pandas(filepath, parquet_engine):
try:
df = pd.read_parquet(filepath, engine=parquet_engine)
except ImportError:
raise ImportError(
"Parquet engine is missing, please install fastparquet using pip or conda."
)
return df


def read_json_file(filepath):
with open(filepath, "r") as j:
json_dict = json.loads(j.read())
return json_dict


if __name__ == "__main__":
test_data_dict = {
"metadata": {
"linear rt": [1, 2, 3],
"modified_sequence": ["ABC", "ABC", "ABC"],
},
"annotations": {},
"parameters": {"target_column_key": "linear rt"},
}

pd.DataFrame(test_data_dict["metadata"]).to_parquet("metadata.parquet")

test_data_dict_file = {
"metadata": "metadata.parquet",
"annotations": {},
"parameters": {"target_column_key": "linear rt"},
}

rtdataset = RetentionTimeDataset(data_source=test_data_dict, seq_length=20)
print(rtdataset.sequences)
print(rtdataset.targets)

rtdataset = RetentionTimeDataset(data_source=test_data_dict_file, seq_length=20)
print(rtdataset.sequences)
print(rtdataset.targets)
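
With these additions, data_source for RetentionTimeDataset can be a numpy tuple, a csv/parquet path, or an MS2AI-style dict/json whose "metadata" entry carries the examples and whose "parameters" entry names the target column (the __main__ block above exercises the dict variants). A minimal sketch of the json-file variant, with hypothetical file names; the constructor call is left commented out since it depends on the rest of the class:

import json
import pandas as pd

# Hypothetical files; key names follow METADATA_KEY, PARAMS_KEY and TARGET_NAME_KEY above.
pd.DataFrame(
    {"modified_sequence": ["ABC", "ABCD"], "linear rt": [10.2, 11.5]}
).to_parquet("metadata.parquet")

config = {
    "metadata": "metadata.parquet",                      # where the examples live
    "annotations": {},
    "parameters": {"target_column_key": "linear rt"},    # which column is the target
}
with open("dataset_config.json", "w") as f:
    json.dump(config, f)

# Intended usage (sketch):
# rtdataset = RetentionTimeDataset(data_source="dataset_config.json", seq_length=20)
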
2 changes: 1 addition & 1 deletion dlomix/data/__init__.py
@@ -1,4 +1,4 @@
from .RetentionTimeDataset import *
from .IntensityDataset import *

__all__ = ['RetentionTimeDataset', 'IntensityDataset']
__all__ = ["RetentionTimeDataset", "IntensityDataset"]
28 changes: 14 additions & 14 deletions dlomix/layers/attention.py
@@ -4,22 +4,22 @@


class DecoderAttentionLayer(tf.keras.layers.Layer):
def __init__(self, time_steps):
super(DecoderAttentionLayer, self).__init__()
self.time_steps = time_steps
def __init__(self, time_steps):
super(DecoderAttentionLayer, self).__init__()
self.time_steps = time_steps

def build(self, input_shape):
self.permute = tf.keras.layers.Permute((2, 1))
self.dense = tf.keras.layers.Dense(self.time_steps, activation='softmax')
self.multiply = tf.keras.layers.Multiply()
def build(self, input_shape):
self.permute = tf.keras.layers.Permute((2, 1))
self.dense = tf.keras.layers.Dense(self.time_steps, activation="softmax")
self.multiply = tf.keras.layers.Multiply()

def call(self, inputs):
x = self.permute(inputs)
x = self.dense(x)
x = self.permute(x)
x = self.multiply([inputs, x])
return x

def call(self, inputs):
x = self.permute(inputs)
x = self.dense(x)
x = self.permute(x)
x = self.multiply([inputs, x])
return x


class AttentionLayer(tf.keras.layers.Layer):
def __init__(
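
The attention layer changes are purely black-style reformatting; behaviour is identical. For orientation, a minimal sketch of DecoderAttentionLayer applied to a (batch, time_steps, features) tensor, with illustrative shapes:

import tensorflow as tf
from dlomix.layers.attention import DecoderAttentionLayer

x = tf.random.normal((2, 30, 16))   # 2 examples, 30 time steps, 16 features (illustrative)

layer = DecoderAttentionLayer(time_steps=30)
weighted = layer(x)

# The layer permutes to (batch, features, time_steps), applies a softmax Dense over
# the time axis, permutes back, and multiplies element-wise with the input,
# so the output keeps the input shape.
print(weighted.shape)  # (2, 30, 16)
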
2 changes: 1 addition & 1 deletion dlomix/losses/__init__.py
@@ -1,3 +1,3 @@
from .intensity import masked_spectral_distance

__all__ =[masked_spectral_distance]
__all__ = [masked_spectral_distance]
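
Only the spacing around __all__ changed here; masked_spectral_distance itself is untouched. A typical compile-time usage, sketched with a placeholder Keras model (the model definition is hypothetical):

import tensorflow as tf
from dlomix.losses import masked_spectral_distance

# Placeholder model predicting an intensity vector; input and output sizes are hypothetical.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(64,)),
    tf.keras.layers.Dense(174),
])
model.compile(optimizer="adam", loss=masked_spectral_distance)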