Skip to content

Commit

Permalink
Feature/intensity tutorial (#18)
Browse files Browse the repository at this point in the history
* Added Pearson correlation to losses

* fix Pearson correlation loss name

* added seaborn to setup.py

* post-processing intensity temp utils

* added intensity report and postprocessing functions

* minor fixes

* refactored report

* bumped up version to v0.0.4

* minor fix spectral angle function

* fixed the decoder param for layer size

---------

Co-authored-by: WassimG <wassim.gabriel@gmail.com>
  • Loading branch information
omsh and WassimG authored Feb 25, 2023
1 parent 995861f commit d256a27
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 13 deletions.
2 changes: 1 addition & 1 deletion dlomix/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.0.3"
__version__ = "0.0.4"

META_DATA = {
"author": "Omar Shouman",
Expand Down
8 changes: 3 additions & 5 deletions dlomix/data/IntensityDataset.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pandas as pd
import numpy as np
import pandas as pd
import tensorflow as tf
from dlomix.utils import convert_nested_list_to_numpy_array

from dlomix.utils import convert_nested_list_to_numpy_array

# take into consideration whether the pandas dataframe is pickled, and if so call read_pickle instead of read_csv
# allow the possibility of having three different dataset objects: one each for train, val, and test
Expand Down Expand Up @@ -225,8 +225,6 @@ def _read_data(self):
# for concatenation later, we expand dimensions
self.collision_energy = self.collision_energy.values.reshape(-1, 1)

print(type(self.precursor_charge))
print(self.precursor_charge)
self.precursor_charge = convert_nested_list_to_numpy_array(
self.precursor_charge.values, dtype=np.float64
)
Expand Down Expand Up @@ -339,7 +337,7 @@ def get_split_targets(self, split="val"):
+ list(self.indicies_dict.keys())
)

return self.targets[self.indicies_dict[split]]
return self.intensities[self.indicies_dict[split]]

def denormalize_targets(self, targets):
"""Denormalize the given targets (can also be predictions) by multiplying the standard deviation and adding the mean.
Expand Down
4 changes: 2 additions & 2 deletions dlomix/losses/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .intensity import masked_spectral_distance
from .intensity import masked_spectral_distance, masked_pearson_correlation_distance

__all__ = [masked_spectral_distance]
# Public names of the package. ``__all__`` must contain strings: listing the
# function objects themselves makes ``from dlomix.losses import *`` raise a
# TypeError.
__all__ = ["masked_spectral_distance", "masked_pearson_correlation_distance"]
16 changes: 16 additions & 0 deletions dlomix/losses/intensity.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,19 @@ def masked_spectral_distance(y_true, y_pred):
product = K.sum(pred_norm * true_norm, axis=1)
arccos = tf.math.acos(product)
return 2 * arccos / np.pi


def masked_pearson_correlation_distance(y_true, y_pred):
    """Return one minus the Pearson correlation of the masked inputs.

    Entries of ``y_true`` equal to -1 mark masked positions. Both tensors are
    scaled by ``(y_true + 1) / (y_true + 1 + epsilon)``, which is exactly zero
    at those positions, so masked entries are set to zero in the true and the
    predicted values alike.

    The means, the covariance and the standard deviations are reduced over all
    elements (no ``axis`` is passed), so a single scalar is produced for the
    whole input rather than one value per row.

    Args:
        y_true: tensor of target values, with -1 at masked positions.
        y_pred: tensor of predicted values, same shape as ``y_true``.

    Returns:
        Scalar tensor ``1 - r`` where ``r`` is the Pearson correlation of the
        masked tensors. If either masked tensor is constant (zero standard
        deviation) the correlation is undefined; the ratio is then taken as 0,
        so the distance is 1 instead of NaN.
    """
    epsilon = K.epsilon()

    # Masking: multiplying by (true + 1) zeroes the positions whose true value
    # is -1; dividing by (true + 1 + epsilon) undoes the scaling everywhere
    # else without dividing by zero at the masked positions.
    pred_masked = ((y_true + 1) * y_pred) / (y_true + 1 + epsilon)
    true_masked = ((y_true + 1) * y_true) / (y_true + 1 + epsilon)

    true_centered = true_masked - tf.math.reduce_mean(true_masked)
    pred_centered = pred_masked - tf.math.reduce_mean(pred_masked)

    covariance = tf.math.reduce_mean(tf.multiply(true_centered, pred_centered))
    std_product = tf.math.reduce_std(true_centered) * tf.math.reduce_std(
        pred_centered
    )

    # divide_no_nan returns 0 where the denominator is 0 and the exact
    # quotient everywhere else, so well-defined inputs are unaffected.
    return 1 - tf.math.divide_no_nan(covariance, std_product)
3 changes: 1 addition & 2 deletions dlomix/models/prosit.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def _build_decoder(self):
self.decoder = tf.keras.Sequential(
[
tf.keras.layers.GRU(
units=self.recurrent_layers_sizes[1],
units=self.regressor_layer_size,
return_sequences=True,
name="decoder",
),
Expand All @@ -206,7 +206,6 @@ def call(self, inputs, **kwargs):
encoded_meta = self.meta_encoder([collision_energy_in, precursor_charge_in])

x = self.string_lookup(peptides_in)
print("encoded sequence: ", x)
x = self.embedding(x)
x = self.sequence_encoder(x)
x = self.attention(x)
Expand Down
78 changes: 78 additions & 0 deletions dlomix/reports/IntensityReport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from os.path import join

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator

from .postprocessing import normalize_intensity_predictions
from .Report import PDFFile, Report


class IntensityReport(Report):
    """Report generation for Fragment Ion Intensity Prediction tasks."""

    TARGETS_LABEL = "x"
    PREDICTIONS_LABEL = "y"
    DEFAULT_BATCH_SIZE = 600

    def __init__(self, output_path, history, figures_ext="png", batch_size=0):
        """Set up the report and the PDF document it writes to.

        Arguments
        ---------
            output_path: forwarded to the base ``Report`` constructor.
            history: forwarded to the base ``Report`` constructor.
            figures_ext: forwarded to the base ``Report`` constructor.
            batch_size: batch size handed to the intensity post-processing;
                any falsy value (the default 0) selects ``DEFAULT_BATCH_SIZE``.
        """
        super().__init__(output_path, history, figures_ext)

        self.pdf_file = PDFFile("DLOmix - Fragment Ion Intensity Report")
        self.batch_size = batch_size or IntensityReport.DEFAULT_BATCH_SIZE

    def generate_report(self, dataset, predictions):
        """Build all report resources and write ``intensity_Report.pdf``.

        Arguments
        ---------
            dataset: object exposing ``sequences``, ``precursor_charge`` and
                ``intensities``.
            predictions: predicted intensities; must provide ``tolist()``.
        """
        self._init_report_resources()

        results_df = self.generate_intensity_results_df(dataset, predictions)
        self.plot_all_metrics()

        # make custom plots
        self.plot_spectral_angle(results_df)

        self._compile_report_resources_add_pdf_pages()
        pdf_path = join(self._output_path, "intensity_Report.pdf")
        self.pdf_file.output(pdf_path, "F")

    def generate_intensity_results_df(self, dataset, predictions):
        """Collect sequences, predictions, charges and raw intensities in one dataframe.

        Returns a dataframe with the columns ``sequences``,
        ``intensities_pred``, ``precursor_charge_onehot`` and
        ``intensities_raw``, filled in that order.
        """
        column_values = (
            ("sequences", dataset.sequences),
            ("intensities_pred", predictions.tolist()),
            ("precursor_charge_onehot", dataset.precursor_charge.tolist()),
            ("intensities_raw", dataset.intensities.tolist()),
        )

        results_df = pd.DataFrame()
        for column_name, values in column_values:
            results_df[column_name] = values
        return results_df

    def plot_spectral_angle(self, predictions_df):
        """Create the spectral angle violin plot and register it as a report resource.

        Arguments
        ---------
            predictions_df: dataframe with raw intensities, predictions, sequences, precursor_charges
        """
        normalized = normalize_intensity_predictions(predictions_df, self.batch_size)
        axes = sns.violinplot(normalized["spectral_angle"])

        # NOTE(review): the extension is appended without a separator, so
        # ``_figures_ext`` is presumably stored with its leading dot by the
        # base class -- confirm.
        figure_name = "violin_spectral_angle_plot" + self._figures_ext
        save_path = join(self._output_path, figure_name)
        axes.get_figure().savefig(save_path)

        self._add_report_resource(
            "spectral_angle_plot",
            "Spectral angle plot",
            "The following figure shows the spectral angle plot for the test data.",
            save_path,
        )
5 changes: 4 additions & 1 deletion dlomix/reports/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from .IntensityReport import IntensityReport
from .RetentionTimeReport import RetentionTimeReport

__all__ = [RetentionTimeReport]
__all__ = ["RetentionTimeReport",
"IntensityReport",
]
96 changes: 96 additions & 0 deletions dlomix/reports/postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import functools

import numpy as np
import tensorflow as tf

import dlomix.losses as losses


def reshape_dims(array):
    """Reshape a flat ``(n, 174)`` array into ``(n, 29, 2, 1, 3)``.

    The 174 values of every row are split as 29 x 2 x 1 x 3.
    NOTE(review): the axes presumably stand for fragment position, ion type,
    neutral loss and fragment charge -- confirm against the data layout.

    Raises an AssertionError when the second dimension is not 174.
    """
    n_rows, n_values = array.shape
    assert n_values == 174
    nlosses = 1
    target_shape = (n_rows, 30 - 1, 2, nlosses, 3)
    return array.reshape(target_shape)


def reshape_flat(array):
    """Collapse every axis after the first into one, giving a 2-D array.

    The size of the second axis is the product of all trailing dimensions
    (1 when there are none).
    """
    shape = array.shape
    row_size = 1
    for extent in shape[1:]:
        row_size *= extent
    return array.reshape([shape[0], row_size])


def normalize_base_peak(array):
    """Divide every row of a 2-D array by that row's maximum (its base peak).

    Rows whose maximum is exactly 0 are divided by 1 instead, i.e. returned
    unchanged, because dividing by a zero base peak would turn the row into
    NaN/inf values and emit a NumPy runtime warning.

    Args:
        array: 2-D array with one spectrum per row (flat layout).

    Returns:
        A new array of the same shape; the input is not modified.
    """
    # flat
    # ``max`` returns a fresh array, so patching it does not touch the input.
    maxima = array.max(axis=1)
    maxima[maxima == 0] = 1
    return array / maxima[:, np.newaxis]


def mask_outofrange(array, lengths, mask=-1.):
    """Overwrite, in place, the tail of axis 1 of every row with ``mask``.

    For row ``i`` every position from ``lengths[i] - 1`` onwards on axis 1 is
    set to ``mask``. ``array`` must be 5-dimensional; it is modified in place
    and also returned.
    """
    # dim
    n_rows = array.shape[0]
    for row in range(n_rows):
        first_masked = lengths[row] - 1
        array[row, first_masked:, :, :, :] = mask
    return array


def mask_outofcharge(array, charges, mask=-1.):
    """Overwrite, in place, last-axis entries at index ``charges[i]`` and above.

    Only rows whose charge is below 3 are touched: for those, every entry of
    the last axis from index ``charges[i]`` onwards is set to ``mask``.
    ``array`` must be 5-dimensional; it is modified in place and also
    returned.
    """
    # dim
    n_rows = array.shape[0]
    for row in range(n_rows):
        charge = charges[row]
        if charge < 3:
            array[row, :, :, :, charge:] = mask
    return array


def get_spectral_angle(true, pred, batch_size=600):
    """Compute the spectral angle between true and predicted rows, batch by batch.

    Each batch is passed to ``losses.masked_spectral_distance`` inside its own
    TF1-style session; the spectral angle is one minus that distance.

    Args:
        true: array of true intensities, one row per spectrum.
        pred: array of predicted intensities, aligned row-wise with ``true``.
        batch_size: positive number of rows evaluated per session run.

    Returns:
        1-D NumPy array with one spectral angle per row of ``true``, passed
        through ``np.nan_to_num`` (NaN values become 0).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    n = true.shape[0]
    sa = np.zeros([n])

    # Stepping by batch_size covers the trailing partial batch and never
    # produces an empty one, so no session is spent on zero rows.
    for start in range(0, n, batch_size):
        stop = start + batch_size
        # Start from a fresh default graph for every batch.
        tf.compat.v1.reset_default_graph()
        with tf.compat.v1.Session() as s:
            sa_graph = losses.masked_spectral_distance(
                true[start:stop], pred[start:stop]
            )
            sa_b = 1 - s.run(sa_graph)
            sa[start : start + sa_b.shape[0]] = sa_b

    return np.nan_to_num(sa)


def normalize_intensity_predictions(data, batch_size=600):
    """Post-process predicted intensities and, if possible, add spectral angles.

    Steps applied to ``data["intensities_pred"]``:
      1. negative predictions are clipped to 0,
      2. rows are reshaped with ``reshape_dims`` and masked with -1 according
         to sequence length (``mask_outofrange``) and precursor charge
         (``mask_outofcharge``), the charge being the argmax of the one-hot
         encoding plus one,
      3. rows are flattened again and base-peak normalised, after which the
         masked positions are reset to -1.

    Args:
        data: pandas DataFrame (or mapping of pandas Series) with the keys
            ``sequences``, ``intensities_pred`` and ``precursor_charge_onehot``
            and optionally ``intensities_raw``.
        batch_size: batch size forwarded to ``get_spectral_angle``.

    Returns:
        The same ``data`` object, modified in place: ``intensities_pred`` is
        replaced by the normalised predictions (one list per row) and, when
        ``intensities_raw`` is present, ``spectral_angle`` is added.
    """
    for key in ("sequences", "intensities_pred", "precursor_charge_onehot"):
        assert (
            key in data
        ), f"Key {key} is missing in the data provided for post-processing"

    # Convert to a positional array: the masking helper looks lengths up by
    # row position, which on a pandas Series would be a label lookup and
    # break for any non-default index.
    sequence_lengths = data["sequences"].apply(len).to_numpy()
    intensities = np.stack(data["intensities_pred"].to_numpy()).astype(np.float32)
    precursor_charge_onehot = np.stack(data["precursor_charge_onehot"].to_numpy())
    charges = list(precursor_charge_onehot.argmax(axis=1) + 1)

    intensities[intensities < 0] = 0
    intensities = reshape_dims(intensities)
    intensities = mask_outofrange(intensities, sequence_lengths)
    intensities = mask_outofcharge(intensities, charges)
    intensities = reshape_flat(intensities)
    # Remember the masked positions: normalisation rescales the -1 markers.
    m_idx = intensities == -1
    intensities = normalize_base_peak(intensities)
    intensities[m_idx] = -1
    # Store one list per row; a 2-D array is not a valid single DataFrame
    # column.
    data["intensities_pred"] = intensities.tolist()

    if "intensities_raw" in data:
        data["spectral_angle"] = get_spectral_angle(
            np.stack(data["intensities_raw"].to_numpy()).astype(np.float32),
            intensities,
            batch_size=batch_size,
        )
    return data
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
with open("README.md", "r") as fh:
long_description = fh.read()

from dlomix import __version__, META_DATA
from dlomix import META_DATA, __version__

VERSION = __version__

Expand All @@ -24,7 +24,9 @@
'matplotlib',
'scikit-learn',
'tensorflow',
'pyarrow'],
'pyarrow',
'seaborn',
],
extras_require={
"dev": [
"pytest >= 3.7",
Expand Down

0 comments on commit d256a27

Please sign in to comment.