Feature/intensity tutorial (#18)

* Added Pearson correlation to losses
* fix Pearson correlation loss name
* added seaborn to setup.py
* post-processing intensity temp utils
* added intensity report and postprocessing functions
* minor fixes
* refactored report
* bumped up version to v0.0.4
* minor fix spectral angle function
* fixed the decoder param for layer size

---------

Co-authored-by: WassimG <wassim.gabriel@gmail.com>
wilhelm-lab · Feb 25, 2023 · d256a27 · d256a27
1 parent 995861f
commit d256a27
Show file tree

Hide file tree

Showing 9 changed files with 205 additions and 13 deletions.
diff --git a/dlomix/__init__.py b/dlomix/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.3"
+__version__ = "0.0.4"
 
 META_DATA = {
     "author": "Omar Shouman",

diff --git a/dlomix/data/IntensityDataset.py b/dlomix/data/IntensityDataset.py
@@ -1,8 +1,8 @@
-import pandas as pd
 import numpy as np
+import pandas as pd
 import tensorflow as tf
-from dlomix.utils import convert_nested_list_to_numpy_array
 
+from dlomix.utils import convert_nested_list_to_numpy_array
 
 # take into account whether the pandas dataframe is pickled, and if so call read_pickle instead of read_csv
 # allow the possibility of having three different dataset objects: one each for train, val, and test
@@ -225,8 +225,6 @@ def _read_data(self):
             # for concatenation later, we expand dimensions
             self.collision_energy = self.collision_energy.values.reshape(-1, 1)
 
-            print(type(self.precursor_charge))
-            print(self.precursor_charge)
             self.precursor_charge = convert_nested_list_to_numpy_array(
                 self.precursor_charge.values, dtype=np.float64
             )
@@ -339,7 +337,7 @@ def get_split_targets(self, split="val"):
                 + list(self.indicies_dict.keys())
             )
 
-        return self.targets[self.indicies_dict[split]]
+        return self.intensities[self.indicies_dict[split]]
 
     def denormalize_targets(self, targets):
         """Denormalize the given targets (can also be predictions) by multiplying the standard deviation and adding the mean.

diff --git a/dlomix/losses/__init__.py b/dlomix/losses/__init__.py
@@ -1,3 +1,3 @@
-from .intensity import masked_spectral_distance
+from .intensity import masked_spectral_distance, masked_pearson_correlation_distance
 
-__all__ = [masked_spectral_distance]
+# __all__ must contain names as strings; listing the function objects makes
+# `from dlomix.losses import *` fail with a TypeError.
+__all__ = ["masked_spectral_distance", "masked_pearson_correlation_distance"]
diff --git a/dlomix/losses/intensity.py b/dlomix/losses/intensity.py
@@ -28,3 +28,19 @@ def masked_spectral_distance(y_true, y_pred):
     product = K.sum(pred_norm * true_norm, axis=1)
     arccos = tf.math.acos(product)
     return 2 * arccos / np.pi
+
+
+def masked_pearson_correlation_distance(y_true, y_pred):
+    """Return one minus the Pearson correlation of masked targets and predictions.
+
+    Entries whose target is -1 are zeroed in both tensors before the
+    correlation is computed. Means, covariance and standard deviations are
+    reduced over all elements, so the result is a single scalar.
+
+    Arguments
+    ---------
+        y_true: tensor of target intensities, where -1 marks entries to mask
+        y_pred: tensor of predicted intensities (assumed to have the same shape
+            as y_true -- TODO confirm against callers)
+
+    Returns
+    -------
+        scalar tensor ``1 - r``; equals 1 when either masked tensor has zero
+        standard deviation, where the correlation is undefined
+    """
+    epsilon = K.epsilon()
+
+    # Masking: multiplying by (true + 1) zeroes every entry whose target is -1
+    # (peaks that cannot be there); dividing by (true + 1 + epsilon) restores
+    # the scale of the remaining entries without dividing by zero.
+    # NOTE(review): masked entries become 0 but are still counted in the means
+    # and standard deviations below -- confirm this is intended.
+    pred_masked = ((y_true + 1) * y_pred) / (y_true + 1 + epsilon)
+    true_masked = ((y_true + 1) * y_true) / (y_true + 1 + epsilon)
+
+    mx = tf.math.reduce_mean(true_masked)
+    my = tf.math.reduce_mean(pred_masked)
+    xm, ym = true_masked - mx, pred_masked - my
+    r_num = tf.math.reduce_mean(tf.multiply(xm, ym))
+    r_den = tf.math.reduce_std(xm) * tf.math.reduce_std(ym)
+
+    # divide_no_nan yields 0 for a zero denominator, so a constant prediction
+    # or target gives a distance of 1 instead of NaN/inf.
+    return 1 - tf.math.divide_no_nan(r_num, r_den)
diff --git a/dlomix/models/prosit.py b/dlomix/models/prosit.py
@@ -189,7 +189,7 @@ def _build_decoder(self):
         self.decoder = tf.keras.Sequential(
             [
                 tf.keras.layers.GRU(
-                    units=self.recurrent_layers_sizes[1],
+                    units=self.regressor_layer_size,
                     return_sequences=True,
                     name="decoder",
                 ),
@@ -206,7 +206,6 @@ def call(self, inputs, **kwargs):
         encoded_meta = self.meta_encoder([collision_energy_in, precursor_charge_in])
 
         x = self.string_lookup(peptides_in)
-        print("encoded sequence: ", x)
         x = self.embedding(x)
         x = self.sequence_encoder(x)
         x = self.attention(x)

diff --git a/dlomix/reports/IntensityReport.py b/dlomix/reports/IntensityReport.py
@@ -0,0 +1,78 @@
+from os.path import join
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.colors import LogNorm
+from matplotlib.ticker import LogLocator
+
+from .postprocessing import normalize_intensity_predictions
+from .Report import PDFFile, Report
+
+
+class IntensityReport(Report):
+    """Report generation for Fragment Ion Intensity Prediction tasks.
+
+    Collects the training-metric plots provided by the base ``Report`` class
+    together with a violin plot of per-spectrum spectral angles, and writes
+    them to ``intensity_Report.pdf`` in the output path.
+    """
+
+    # Label constants; not referenced inside this class.
+    TARGETS_LABEL = "x"
+    PREDICTIONS_LABEL = "y"
+    # Batch size used for the spectral angle computation when none is given.
+    DEFAULT_BATCH_SIZE = 600
+
+    def __init__(self, output_path, history, figures_ext="png", batch_size=0):
+        """Set up the report.
+
+        Arguments
+        ---------
+            output_path: forwarded to the base Report constructor
+            history: forwarded to the base Report constructor
+            figures_ext: forwarded to the base Report constructor
+            batch_size: batch size for post-processing; a falsy value (the
+                default 0) selects DEFAULT_BATCH_SIZE
+        """
+        super().__init__(output_path, history, figures_ext)
+
+        self.pdf_file = PDFFile("DLOmix - Fragment Ion Intensity Report")
+
+        self.batch_size = batch_size or IntensityReport.DEFAULT_BATCH_SIZE
+
+    def generate_report(self, dataset, predictions):
+        """Create all plots and write the PDF report.
+
+        Arguments
+        ---------
+            dataset: object exposing sequences, precursor_charge and intensities
+            predictions: predicted intensities; must provide ``tolist()``
+        """
+        self._init_report_resources()
+
+        results = self.generate_intensity_results_df(dataset, predictions)
+        self.plot_all_metrics()
+
+        # make custom plots
+        self.plot_spectral_angle(results)
+
+        self._compile_report_resources_add_pdf_pages()
+        pdf_path = join(self._output_path, "intensity_Report.pdf")
+        self.pdf_file.output(pdf_path, "F")
+
+    def generate_intensity_results_df(self, dataset, predictions):
+        """Gather sequences, predictions, charges and raw intensities in a dataframe.
+
+        Arguments
+        ---------
+            dataset: object exposing sequences, precursor_charge and intensities
+            predictions: predicted intensities; must provide ``tolist()``
+
+        Returns
+        -------
+            dataframe with the columns sequences, intensities_pred,
+            precursor_charge_onehot and intensities_raw
+        """
+        results = pd.DataFrame()
+
+        results["sequences"] = dataset.sequences
+        results["intensities_pred"] = predictions.tolist()
+        results["precursor_charge_onehot"] = dataset.precursor_charge.tolist()
+        results["intensities_raw"] = dataset.intensities.tolist()
+
+        return results
+
+    def plot_spectral_angle(self, predictions_df):
+        """Create the spectral angle violin plot and register it as a report resource.
+
+        Arguments
+        ---------
+            predictions_df:  dataframe with raw intensities, predictions, sequences, precursor_charges
+        """
+        scored = normalize_intensity_predictions(predictions_df, self.batch_size)
+        axes = sns.violinplot(scored["spectral_angle"])
+
+        save_path = join(
+            self._output_path, "violin_spectral_angle_plot" + self._figures_ext
+        )
+        axes.get_figure().savefig(save_path)
+
+        self._add_report_resource(
+            "spectral_angle_plot",
+            "Spectral angle plot",
+            "The following figure shows the spectral angle plot for the test data.",
+            save_path,
+        )
diff --git a/dlomix/reports/__init__.py b/dlomix/reports/__init__.py
@@ -1,3 +1,6 @@
+from .IntensityReport import IntensityReport
 from .RetentionTimeReport import RetentionTimeReport
 
-__all__ = [RetentionTimeReport]
+__all__ = ["RetentionTimeReport",
+           "IntensityReport",
+           ]
diff --git a/dlomix/reports/postprocessing.py b/dlomix/reports/postprocessing.py
@@ -0,0 +1,96 @@
+import functools
+
+import numpy as np
+import tensorflow as tf
+
+import dlomix.losses as losses
+
+
+def reshape_dims(array):
+    """Reshape a flat (n, 174) array into the 5-D layout (n, 29, 2, 1, 3).
+
+    Arguments
+    ---------
+        array: 2-D numpy array whose second dimension is 174
+
+    Returns
+    -------
+        the same data with shape (n, 29, 2, 1, 3)
+    """
+    n_rows, n_cols = array.shape
+    assert n_cols == 174
+    # 174 = 29 * 2 * 1 * 3
+    return array.reshape((n_rows, 29, 2, 1, 3))
+
+
+def reshape_flat(array):
+    """Collapse every dimension after the first into a single one.
+
+    Arguments
+    ---------
+        array: numpy array with at least one dimension
+
+    Returns
+    -------
+        the same data with shape (array.shape[0], product of the other dims)
+    """
+    trailing = 1
+    for extent in array.shape[1:]:
+        trailing *= extent
+    return array.reshape([array.shape[0], trailing])
+
+
+def normalize_base_peak(array):
+    """Scale every row of a flat 2-D array by its maximum value.
+
+    Rows whose maximum is 0 are returned unscaled instead of being divided by
+    zero, which would fill them with NaN/inf.
+
+    Arguments
+    ---------
+        array: 2-D numpy array, one row per spectrum
+
+    Returns
+    -------
+        new array in which each row has been divided by its row maximum
+    """
+    # flat
+    maxima = array.max(axis=1)
+    # max() returned a fresh array, so it can be patched in place: dividing by
+    # 1 leaves a zero-maximum row unchanged.
+    maxima[maxima == 0] = 1
+    return array / maxima[:, np.newaxis]
+
+
+def mask_outofrange(array, lengths, mask=-1.):
+    """Overwrite, in place, the positions at or beyond each row's length - 1.
+
+    Arguments
+    ---------
+        array: 5-D numpy array; the first axis indexes rows, the second positions
+        lengths: per-row lengths, looked up as lengths[i]
+        mask: value written to the out-of-range positions
+
+    Returns
+    -------
+        the same array object, modified in place
+    """
+    # dim
+    for i, row in enumerate(array):
+        # row is a view of array[i], so the assignment writes through
+        row[lengths[i] - 1 :, :, :, :] = mask
+    return array
+
+
+def mask_outofcharge(array, charges, mask=-1.):
+    """Overwrite, in place, last-axis entries at index >= charge for charges below 3.
+
+    Arguments
+    ---------
+        array: 5-D numpy array; the first axis indexes rows
+        charges: per-row charges, looked up as charges[i]
+        mask: value written to the masked entries
+
+    Returns
+    -------
+        the same array object, modified in place
+    """
+    # dim
+    for i, row in enumerate(array):
+        if charges[i] >= 3:
+            continue
+        # row is a view of array[i], so the assignment writes through
+        row[:, :, :, charges[i] :] = mask
+    return array
+
+
+def get_spectral_angle(true, pred, batch_size=600):
+    """Compute the spectral angle per row, batch by batch.
+
+    For each batch the value is ``1 - masked_spectral_distance(true, pred)``,
+    evaluated in a TF1-style session. NaN results are replaced by 0.
+
+    Arguments
+    ---------
+        true: array of target intensities, one row per spectrum
+        pred: array of predicted intensities, row-aligned with true
+        batch_size: number of rows evaluated per session run
+
+    Returns
+    -------
+        1-D numpy array with one spectral angle per row of true
+    """
+    n = true.shape[0]
+    sa = np.zeros([n])
+
+    # Stepping with range() covers every row once and never produces an empty
+    # trailing batch when n is an exact multiple of batch_size.
+    for start in range(0, n, batch_size):
+        stop = start + batch_size
+        # clear the default graph before building the ops for this batch
+        tf.compat.v1.reset_default_graph()
+        with tf.compat.v1.Session() as s:
+            sa_graph = losses.masked_spectral_distance(
+                true[start:stop], pred[start:stop]
+            )
+            sa_b = 1 - s.run(sa_graph)
+            sa[start : start + sa_b.shape[0]] = sa_b
+    return np.nan_to_num(sa)
+
+
+def normalize_intensity_predictions(data, batch_size=600):
+    """Post-process predicted intensities and, if targets are present, score them.
+
+    Steps: clip negative predictions to 0, reshape to five dimensions, mask
+    positions beyond the sequence length and beyond the precursor charge with
+    -1, flatten again, scale each row by its maximum and restore the -1 mask.
+    The result replaces ``data["intensities_pred"]``. When ``intensities_raw``
+    is present, a ``spectral_angle`` entry is added as well.
+
+    Arguments
+    ---------
+        data: dataframe (or mapping of pandas Series) with the keys sequences,
+            intensities_pred, precursor_charge_onehot and optionally
+            intensities_raw
+        batch_size: batch size forwarded to get_spectral_angle
+
+    Returns
+    -------
+        the same data object, updated in place
+    """
+    import pandas as pd
+
+    assert "sequences" in data, "Key sequences is missing in the data provided for post-processing"
+    assert "intensities_pred" in data, "Key intensities_pred is missing in the data provided for post-processing"
+    assert "precursor_charge_onehot" in data, "Key precursor_charge_onehot is missing in the data provided for post-processing"
+
+    # Convert to a plain array: mask_outofrange looks lengths up by row
+    # position, whereas a Series with an integer index resolves lengths[i] by
+    # index label.
+    sequence_lengths = data["sequences"].apply(len).to_numpy()
+    intensities = np.stack(data["intensities_pred"].to_numpy()).astype(np.float32)
+    precursor_charge_onehot = np.stack(data["precursor_charge_onehot"].to_numpy())
+    # position of the hot entry, shifted to a 1-based charge
+    charges = list(precursor_charge_onehot.argmax(axis=1) + 1)
+
+    intensities[intensities < 0] = 0
+    intensities = reshape_dims(intensities)
+    intensities = mask_outofrange(intensities, sequence_lengths)
+    intensities = mask_outofcharge(intensities, charges)
+    intensities = reshape_flat(intensities)
+    # remember the masked cells: base-peak scaling would otherwise rescale them
+    m_idx = intensities == -1
+    intensities = normalize_base_peak(intensities)
+    intensities[m_idx] = -1
+
+    if isinstance(data, pd.DataFrame):
+        # A 2-D array cannot be assigned to a single dataframe column, so
+        # store one list per row.
+        data["intensities_pred"] = intensities.tolist()
+    else:
+        data["intensities_pred"] = intensities
+
+    if "intensities_raw" in data:
+        data["spectral_angle"] = get_spectral_angle(
+            np.stack(data["intensities_raw"].to_numpy()).astype(np.float32),
+            intensities,
+            batch_size=batch_size,
+        )
+    return data
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
-from dlomix import __version__, META_DATA
+from dlomix import META_DATA, __version__
 
 VERSION = __version__
 
@@ -24,7 +24,9 @@
         'matplotlib',
         'scikit-learn',
         'tensorflow',
-        'pyarrow'],
+        'pyarrow',
+        'seaborn',
+        ],
     extras_require={
         "dev": [
             "pytest >= 3.7",