From 8ae28fb9600fd234e7a6d63aa5316448290e65fe Mon Sep 17 00:00:00 2001
From: aurelien delfosse
Date: Wed, 26 May 2021 00:03:58 +0200
Subject: [PATCH] (fix): silent progress bar mode

Add an optional silent mode for the progress bar. Remove the systematic memory
warnings: the warning is now only emitted when the estimated size of the
embeddings or logits is large.
---
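Note for reviewers: a minimal usage sketch of the new silent mode. The
BioTransformers entry point, the import path and the backend name below are
written from memory of the package's public API and are illustrative only;
only the `silent` keyword is introduced by this patch.

    from biotransformers import BioTransformers  # assumed public entry point

    bio_trans = BioTransformers(backend="esm1_t6_43M_UR50S")  # hypothetical backend name
    sequences = ["MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVAT"]

    # silent=True disables the tqdm progress bar added in this patch
    embeddings = bio_trans.compute_embeddings(sequences, batch_size=2, silent=True)
    loglikelihoods = bio_trans.compute_loglikelihood(sequences, silent=True)
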
 biotransformers/utils/utils.py           |  71 ++++++++----
 .../wrappers/transformers_wrappers.py     | 101 ++++++++++++------
 2 files changed, 120 insertions(+), 52 deletions(-)

diff --git a/biotransformers/utils/utils.py b/biotransformers/utils/utils.py
index 240dd1e..4d17afe 100755
--- a/biotransformers/utils/utils.py
+++ b/biotransformers/utils/utils.py
@@ -1,37 +1,66 @@
 import math
 from dataclasses import dataclass
-from typing import List
+from typing import List, Tuple

 from biotransformers.utils.logger import logger

 log = logger("utils")


-def convert_bytes_size(size_bytes):
+def convert_bytes_size(size_bytes: int) -> Tuple[str, bool]:
+    """Convert a size in bytes into a human-readable string.
+
+    Args:
+        size_bytes: size in bytes
+
+    Returns:
+        Tuple[str, bool]: the size with the correct unit, and a flag telling
+        whether the memory warning should be displayed.
+    """
     if size_bytes == 0:
-        return "0B"
+        return "0B", False
     size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
     i = int(math.floor(math.log(size_bytes, 1024)))
     p = math.pow(1024, i)
     s = int(round(size_bytes / p, 2))
-    return "%s%s" % (s, size_name[i])
+    is_warning = i >= 3
+    return "%s%s" % (s, size_name[i]), is_warning


-def _check_memory_embeddings(sequences_list, embeddings_size, pool_mode):
+def _check_memory_embeddings(
+    sequences_list: List[str], embeddings_size: int, pool_mode: Tuple[str, ...]
+):
+    """Compute the memory taken by the embeddings with float64 numbers.
+
+    Args:
+        sequences_list: sequences of proteins
+        embeddings_size: size of the embedding vector, depends on the model
+        pool_mode: aggregation functions used for pooling
+    """
     num_of_sequences = len(sequences_list)
     emb_dict_len = len(pool_mode)
     tensor_memory_bits = 64  # double/float64
     memory_bits = num_of_sequences * embeddings_size * emb_dict_len * tensor_memory_bits
     memory_bytes = int(memory_bits / 8)
-    memory_convert_bytes = convert_bytes_size(memory_bytes)
-    log.warning(
-        "Embeddings will need about %s of memory."
-        "Please make sure you have enough space",
-        memory_convert_bytes,
-    )
+    memory_convert_bytes, is_warning = convert_bytes_size(memory_bytes)
+
+    if is_warning:
+        log.warning(
+            "Embeddings will need about %s of memory."
+            "Please make sure you have enough space",
+            memory_convert_bytes,
+        )
+
+def _check_memory_logits(sequences_list: List[str], vocab_size: int, pass_mode: str):
+    """Compute the memory taken by the logits with float64 numbers.

-def _check_memory_logits(sequences_list, vocab_size, pass_mode):
+    Args:
+        sequences_list: sequences of proteins
+        vocab_size: size of the vocabulary, depends on the model
+        pass_mode: model evaluation mode, 'forward' or 'masked'
+    """
     num_of_sequences = len(sequences_list)
     sum_seq_len = sum([len(seq) for seq in sequences_list])
     max_seq_len = max([len(seq) for seq in sequences_list])
@@ -42,20 +71,22 @@ def _check_memory_logits(sequences_list, vocab_size, pass_mode):
     memory_bits = num_of_sequences * max_seq_len * vocab_size * tensor_memory_bits
     memory_bytes = int(memory_bits / 8)
-    memory_convert_bytes = convert_bytes_size(memory_bytes)
-    log.warning(
-        "%s mode will need about %s of memory."
-        "Please make sure you have enough space",
-        pass_mode,
-        memory_convert_bytes,
-    )
+    memory_convert_bytes, is_warning = convert_bytes_size(memory_bytes)
+
+    if is_warning:
+        log.warning(
+            "%s mode will need about %s of memory. Please make sure you have enough space",
+            pass_mode,
+            memory_convert_bytes,
+        )


 def _check_sequence(sequences_list: List[str], model: str, length: int):
     """Function that control sequence length

     Args:
-        model (str): name of the model
-        length (int): length limit to consider
+        model: name of the model
+        length: length limit to consider

     Raises:
         ValueError is model esm1b_t33_650M_UR50S and sequence_length >1024
     """
diff --git a/biotransformers/wrappers/transformers_wrappers.py b/biotransformers/wrappers/transformers_wrappers.py
index 35f989a..21c8cdc 100755
--- a/biotransformers/wrappers/transformers_wrappers.py
+++ b/biotransformers/wrappers/transformers_wrappers.py
@@ -5,7 +5,7 @@
 sequences, and displays some properties of the transformer model.
 """
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Generator, Iterable, List, Tuple
+from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple

 import numpy as np
 import torch
@@ -252,7 +252,15 @@ def _filter_loglikelihoods(
         labels: torch.Tensor,
         tokens: List[int],
     ) -> torch.Tensor:
+        """Compute the loglikelihood of sequences from their logits.
+        Args:
+            logits: model logits for the tokenized sequences
+            labels: token ids of the inputs, used to locate the positions to keep
+            tokens: list of token ids to consider
+        Returns:
+            torch.Tensor: loglikelihood per sequence
+        """
         masks = torch.zeros(labels.shape, dtype=torch.bool)
         for token_id in tokens:
             masks += labels == token_id
@@ -325,9 +333,7 @@ def _filter_and_pool_embeddings(
         return embeddings_dict

     def _model_evaluation(
-        self,
-        model_inputs: Dict[str, torch.tensor],
-        batch_size: int = 1,
+        self, model_inputs: Dict[str, torch.tensor], batch_size: int = 1, **kwargs
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Function which computes logits and embeddings based on a list of sequences,
@@ -343,7 +349,7 @@
             * logits [num_seqs, max_len_seqs, vocab_size]
             * embeddings [num_seqs, max_len_seqs+1, embedding_size]
         """
-
+        silent = kwargs.get("silent", False)
         # Initialize logits and embeddings before looping over batches
         logits = torch.Tensor()  # [num_seqs, max_len_seqs+1, vocab_size]
         embeddings = torch.Tensor()  # [num_seqs, max_len_seqs+1, embedding_size]
@@ -351,6 +357,7 @@
         for batch_inputs in tqdm(
             self._generate_chunks(model_inputs, batch_size),
             total=self._get_num_batch_iter(model_inputs, batch_size),
+            disable=silent,
         ):
             batch_logits, batch_embeddings = self._model_pass(batch_inputs)

@@ -360,7 +367,11 @@
         return logits, embeddings

     def _compute_logits(
-        self, model_inputs: Dict[str, torch.Tensor], batch_size: int, pass_mode: str
+        self,
+        model_inputs: Dict[str, torch.Tensor],
+        batch_size: int,
+        pass_mode: str,
+        **kwargs
     ) -> torch.Tensor:
         """Intermediate function to compute logits

@@ -374,10 +385,14 @@
         """
         if pass_mode == "masked":
             model_inputs, masked_ids_list = self._repeat_and_mask_inputs(model_inputs)
-            logits, _ = self._model_evaluation(model_inputs, batch_size=batch_size)
+            logits, _ = self._model_evaluation(
+                model_inputs, batch_size=batch_size, **kwargs
+            )
             logits = self._gather_masked_outputs(logits, masked_ids_list)
         elif pass_mode == "forward":
-            logits, _ = self._model_evaluation(model_inputs, batch_size=batch_size)
+            logits, _ = self._model_evaluation(
+                model_inputs, batch_size=batch_size, **kwargs
+            )
         return logits

     def _compute_accuracy(self, logits: torch.Tensor, labels: torch.Tensor) -> float:
@@ -445,12 +460,16 @@ def compute_logits(
         batch_size: int = 1,
         tokens_list: List[str] = None,
         pass_mode: str = "forward",
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        """Function that computes the logits from sequences
+        silent: bool = False,
+    ) -> List[np.ndarray]:
+        """Function that computes the logits from sequences.
+
+        It returns a list of logits, one per sequence. Each entry keeps only
+        the positions of the amino acids of interest.

         Args:
             sequences_list: List of sequences
-            batch_size: Batch size
+            batch_size: number of sequences to consider for the forward pass
             pass_mode: Mode of model evaluation ('forward' or 'masked')
             tokens_list: List of tokens to consider

@@ -466,10 +485,14 @@
         inputs, labels, tokens = self._process_sequences_and_tokens(
             sequences_list, tokens_list
         )
-        logits = self._compute_logits(inputs, batch_size, pass_mode)
+        logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
         logits, labels = self._filter_logits(logits, labels, tokens)

-        return logits.numpy(), labels.numpy()
+        lengths = [len(sequence) for sequence in sequences_list]
+        splitted_logits = torch.split(logits, lengths, dim=0)
+        splitted_logits = [logits.numpy() for logits in splitted_logits]
+
+        return splitted_logits

     def compute_probabilities(
         self,
@@ -477,21 +500,25 @@
         batch_size: int = 1,
         tokens_list: List[str] = None,
         pass_mode: str = "forward",
+        silent: bool = False,
     ) -> List[Dict[int, Dict[str, float]]]:
         """Function that computes the probabilities over amino-acids from sequences.
+
         It takes as inputs a list of sequences and returns a list of dictionaries.
         Each dictionary contains the probabilities over the natural amino-acids for
         each position in the sequence. The keys represent the positions (indexed
         starting with 0) and the values are dictionaries of probabilities over
-        the natural amino-acids for this position. In these dictionaries, the keys are
-        the amino-acids and the value the corresponding probabilities.
+        the natural amino-acids for this position.
+
+        In these dictionaries, the keys are the amino-acids and the values
+        the corresponding probabilities.

         Args:
             sequences_list: List of sequences
-            batch_size: Batch size
+            batch_size: number of sequences to consider for the forward pass
             pass_mode: Mode of model evaluation ('forward' or 'masked')
             tokens_list: List of tokens to consider
-
+            silent: if True, do not display the progress bar
         Returns:
             List[Dict[int, Dict[str, float]]]: dictionaries of probabilities per seq
         """
@@ -504,7 +531,7 @@
         inputs, labels, tokens = self._process_sequences_and_tokens(
             sequences_list, tokens_list
         )
-        logits = self._compute_logits(inputs, batch_size, pass_mode)
+        logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
         logits, _ = self._filter_logits(logits, labels, tokens)

         lengths = [len(sequence) for sequence in sequences_list]
@@ -535,6 +562,7 @@ def compute_loglikelihood(
         batch_size: int = 1,
         tokens_list: List[str] = None,
         pass_mode: str = "forward",
+        silent: bool = False,
     ) -> np.ndarray:
         """Function that computes loglikelihoods of sequences

@@ -545,7 +573,7 @@
             tokens_list: List of tokens to consider

         Returns:
-            torch.Tensor: loglikelihoods in torch.tensor format
+            np.ndarray: loglikelihoods in numpy format
         """
         if tokens_list is None:
             tokens_list = NATURAL_AAS_LIST
@@ -556,7 +584,7 @@
         inputs, labels, tokens = self._process_sequences_and_tokens(
             sequences_list, tokens_list
         )
-        logits = self._compute_logits(inputs, batch_size, pass_mode)
+        logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
         loglikelihoods = self._filter_loglikelihoods(logits, labels, tokens)

         return loglikelihoods.numpy()
@@ -567,15 +595,21 @@ def compute_embeddings(
         batch_size: int = 1,
         pool_mode: Tuple[str, ...] = ("cls", "mean"),
         tokens_list: List[str] = None,
+        silent: bool = False,
     ) -> Dict[str, np.ndarray]:
-        """Function that computes embeddings of sequences
+        """Function that computes embeddings of sequences.
+
+        The full embedding has shape (n_sequences, num_tokens, embeddings_size), so we
+        use the aggregation function specified in pool_mode to reduce the tensor over
+        the num_tokens dimension. 'mean' means that we take the mean over the
+        num_tokens dimension.

         Args:
             sequences_list: List of sequences
             batch_size: Batch size
             pool_mode: Mode of pooling ('cls', 'mean', 'min', 'max)
             tokens_list: List of tokens to consider
-
+            silent: if True, do not display the progress bar
         Returns:
             torch.Tensor: Tensor of shape [number_of_sequences, embeddings_size]
         """
@@ -600,6 +634,7 @@
         for batch_inputs in tqdm(
             self._generate_chunks(inputs, batch_size),
             total=self._get_num_batch_iter(inputs, batch_size),
+            disable=silent,
         ):
             _, batch_embeddings = self._model_pass(batch_inputs)
             batch_labels = batch_inputs["input_ids"]
@@ -621,6 +656,7 @@ def compute_accuracy(
         batch_size: int = 1,
         pass_mode: str = "forward",
         tokens_list: List[str] = None,
+        silent: bool = False,
     ) -> float:
         """Compute model accuracy from the input sequences

@@ -629,7 +665,7 @@
             batch_size: [description]. Defaults to 1.
             pass_mode: [description]. Defaults to "forward".
             tokens_list: [description]. Defaults to None.
-
+            silent: if True, do not display the progress bar
         Returns:
             [type]: [description]
         """
@@ -641,7 +677,7 @@
         inputs, labels, tokens = self._process_sequences_and_tokens(
             sequences_list, tokens_list
         )
-        logits = self._compute_logits(inputs, batch_size, pass_mode)
+        logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
         logits, labels = self._filter_logits(logits, labels, tokens)
         accuracy = self._compute_accuracy(logits, labels)

@@ -652,18 +688,19 @@ def compute_calibration(
         sequences_list: List[str],
         batch_size: int = 1,
         pass_mode: str = "forward",
-        tokens_list: List[str] = None,
+        tokens_list: Optional[List[str]] = None,
         n_bins: int = 10,
+        silent: bool = False,
     ) -> Dict[str, Any]:
         """Compute model calibration from the input sequences

         Args:
-            sequences_list ([type]): [description]
-            batch_size ([type], optional): [description]. Defaults to 1.
-            pass_mode ([type], optional): [description]. Defaults to "forward".
-            tokens_list ([type], optional): [description]. Defaults to None.
-            n_bins ([type], optional): [description]. Defaults to 10.
-
+            sequences_list: List of sequences
+            batch_size: number of sequences per forward pass. Defaults to 1.
+            pass_mode: Mode of model evaluation ('forward' or 'masked'). Defaults to "forward".
+            tokens_list: List of tokens to consider. Defaults to None.
+            n_bins: number of bins used to compute the calibration. Defaults to 10.
+            silent: if True, do not display the progress bar
         Returns:
             [type]: [description]
         """
@@ -675,7 +712,7 @@
         inputs, labels, tokens = self._process_sequences_and_tokens(
             sequences_list, tokens_list
         )
-        logits = self._compute_logits(inputs, batch_size, pass_mode)
+        logits = self._compute_logits(inputs, batch_size, pass_mode, silent=silent)
         logits, labels = self._filter_logits(logits, labels, tokens)
         calibration_dict = self._compute_calibration(logits, labels, n_bins)
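
Note: with this change the memory warnings are only emitted when the estimated
footprint reaches the GB range or above (index >= 3 in size_name). A small
sketch of the new convert_bytes_size contract, runnable against utils.py as
patched above:

    from biotransformers.utils.utils import convert_bytes_size

    size, warn = convert_bytes_size(512 * 1024 ** 2)  # -> ("512MB", False): no warning logged
    size, warn = convert_bytes_size(2 * 1024 ** 3)    # -> ("2GB", True): warning is logged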