Efficient decoder text generation wrapper (huggingface#273)
* Set the default matmul_proportion in IPUConfig to 0.2 so that the default config works with the decoder wrapper
jimypbr authored and ncouro-gc committed Mar 17, 2023
1 parent 1a5579f commit 609c870
Showing 3 changed files with 15 additions and 33 deletions.
26 changes: 4 additions & 22 deletions optimum/graphcore/generation_utils.py
@@ -535,6 +535,7 @@ def beam_search(
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

outputs = self._call_generate(
+ t=torch.tensor(cur_len - 1),
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
@@ -545,13 +546,6 @@
if not self.config.is_encoder_decoder:
model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, :cur_len]

- outputs.logits = outputs.logits[:, :cur_len, :]
- if outputs.logits.dim() == 3:
- outputs.logits = outputs.logits[:, :cur_len, :]
- # If the dimension of logits is 2, then only the logits of the last non-padding token is returned, so no need to slice.
- else:
- next_token_logits = outputs.logits

# Change: remove synced_gpu code

# Change: cast to float on cpu
@@ -840,6 +834,7 @@ def sample(

# forward pass to get next token
outputs = self._call_generate(
+ t=torch.tensor(cur_len - 1),
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
@@ -850,13 +845,6 @@
if not self.config.is_encoder_decoder:
model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, :cur_len]

- outputs.logits = outputs.logits[:, :cur_len, :]
- if outputs.logits.dim() == 3:
- outputs.logits = outputs.logits[:, :cur_len, :]
- # If the dimension of logits is 2, then only the logits of the last non-padding token is returned, so no need to slice.
- else:
- next_token_logits = outputs.logits

# Change: remove synced_gpu code

# Change: cast to float on cpu
@@ -1119,6 +1107,7 @@ def beam_sample(
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

outputs = self._call_generate(
+ t=torch.tensor(cur_len - 1),
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
@@ -1129,13 +1118,6 @@
if not self.config.is_encoder_decoder:
model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, :cur_len]

- outputs.logits = outputs.logits[:, :cur_len, :]
- if outputs.logits.dim() == 3:
- outputs.logits = outputs.logits[:, :cur_len, :]
- # If the dimension of logits is 2, then only the logits of the last non-padding token is returned, so no need to slice.
- else:
- next_token_logits = outputs.logits

# Change: remove synced_gpu code

# Change: cast to float on cpu
@@ -1253,4 +1235,4 @@ def beam_sample(
hidden_states=decoder_hidden_states,
)
else:
return sequence_outputs["sequences"]
return sequence_outputs["sequences"]
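The `t` argument added to each `self._call_generate` call above carries the current decoding position into the wrapped decoder; the per-step logits slicing is removed in the same hunks, presumably because the wrapper now returns only the logits needed for the current step. A minimal sketch of one decode step in this style, with illustrative names (`decode_step`, `call_generate`) standing in for the wrapper's internals:

```python
import torch

def decode_step(call_generate, model_inputs, cur_len):
    # Illustrative sketch only: `call_generate` stands in for self._call_generate.
    # With the decoder wrapper the current position is passed in as `t`, and the
    # call is assumed to return logits of shape (batch, vocab) for that position
    # alone, so no per-step slicing of a (batch, seq, vocab) tensor is needed.
    outputs = call_generate(
        t=torch.tensor(cur_len - 1),  # 0-based index of the token being generated
        **model_inputs,
        return_dict=True,
    )
    # Cast to float on the CPU before the logits processors run, as the loops above do.
    return outputs.logits.float()
```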
2 changes: 1 addition & 1 deletion optimum/graphcore/ipu_configuration.py
@@ -138,7 +138,7 @@ def __init__(self, **kwargs):
'The "sharded_execution_for_inference" parameter is deprecated, sharded execution is always used during inference'
)

- self.matmul_proportion = kwargs.pop("matmul_proportion", 0.6)
+ self.matmul_proportion = kwargs.pop("matmul_proportion", 0.2)

if "enable_half_first_order_momentum" in kwargs:
warnings.warn('The "enable_half_first_order_momentum" parameter is deprecated')
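A quick sketch of what the new default means in practice; `layers_per_ipu=[12]` is an illustrative value borrowed from the pipeline defaults below:

```python
from optimum.graphcore import IPUConfig

# Omitting matmul_proportion now falls back to the new default of 0.2 ...
cfg_default = IPUConfig(layers_per_ipu=[12])
print(cfg_default.matmul_proportion)  # expected: 0.2

# ... while models that need more memory for matmuls can still set it explicitly.
cfg_custom = IPUConfig(layers_per_ipu=[12], matmul_proportion=0.6)
print(cfg_custom.matmul_proportion)   # 0.6
```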
20 changes: 10 additions & 10 deletions optimum/graphcore/pipelines/__init__.py
@@ -138,7 +138,7 @@ class IncompatibleIPUConfigError(Exception):
"class": (AutoModelForCausalLM,),
"default": {
"model": ("gpt2", "e7da7f2"),
"ipu_config": "Graphcore/gpt2-small-ipu",
"ipu_config": IPUConfig(layers_per_ipu=[12], matmul_proportion=0.2),
"max_length": 50,
},
"type": "text",
@@ -148,7 +148,7 @@ class IncompatibleIPUConfigError(Exception):
"class": (AutoModelForSeq2SeqLM,),
"default": {
"model": ("ainize/bart-base-cnn", "b90bc9a"),
"ipu_config": "Graphcore/bart-base-ipu",
"ipu_config": IPUConfig(layers_per_ipu=[12], matmul_proportion=0.2),
"max_input_length": 50,
"max_length": 20,
"truncation": "only_first",
@@ -161,7 +161,7 @@ class IncompatibleIPUConfigError(Exception):
"class": (AutoModelForSeq2SeqLM,),
"default": {
"model": ("t5-small", "9507060"),
"ipu_config": "Graphcore/t5-small-ipu",
"ipu_config": IPUConfig(layers_per_ipu=[12], matmul_proportion=0.2),
"max_length": 50,
"max_input_length": 45,
"truncation": "only_first",
@@ -173,7 +173,7 @@ class IncompatibleIPUConfigError(Exception):
"class": (AutoModelForSeq2SeqLM,),
"default": {
"model": ("t5-small", "9507060"),
"ipu_config": "Graphcore/t5-small-ipu",
"ipu_config": IPUConfig(layers_per_ipu=[12], matmul_proportion=0.2),
"max_length": 50,
"max_input_length": 50,
"truncation": "only_first",
@@ -210,7 +210,7 @@ def list_tasks() -> List[str]:

def get_poplar_executor(
model: PreTrainedModel,
- ipu_config: Union[str, dict] = None,
+ ipu_config: Union[IPUConfig, str, dict] = None,
fp16: bool = True,
) -> PreTrainedModel:
ipu_config_arg = ipu_config
@@ -219,8 +219,8 @@ def get_poplar_executor(
ipu_config = IPUConfig.from_pretrained(ipu_config)
elif isinstance(ipu_config, dict):
ipu_config = IPUConfig.from_dict(ipu_config)
- else:
- raise ValueError("ipu_config must be a string or a dictionary.")
+ elif not isinstance(ipu_config, IPUConfig):
+ raise ValueError("ipu_config must be an IPUConfig, string, or a dictionary.")
ipu_config.inference_device_iterations = 1
# TODO: inference_replication_factor should be adaptive, especially for batching.
ipu_config.inference_replication_factor = 1
@@ -280,7 +280,7 @@ def check_model_type(self, supported_models: Union[List[str], dict]):
def pipeline(
task: str = None,
model: Optional[Any] = None,
- ipu_config: Union[str, dict] = None,
+ ipu_config: Union[IPUConfig, str, dict] = None,
tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
revision: Optional[str] = None,
@@ -411,8 +411,8 @@ def new_forward(self, model_inputs, *args, **kwargs):
# Implement pipelines __del__ to clean up poplar executor
def _del(self):
# For text generation models, deallocate the internal poplar executor
- if hasattr(self.model, "poptorch_model"):
- self.model.poptorch_model.destroy()
+ if hasattr(self.model, "poptorch_decoder"):
+ self.model.poptorch_decoder.destroy()

pipeline_class.__del__ = _del

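Taken together, the pipeline changes let an IPUConfig instance flow straight through get_poplar_executor, and pipeline teardown now destroys the decoder executor (poptorch_decoder) rather than poptorch_model. A rough usage sketch, assuming the default text-generation model above is acceptable (running it needs a Poplar SDK / IPU environment):

```python
from optimum.graphcore import IPUConfig
from optimum.graphcore.pipelines import pipeline

# ipu_config may now be an IPUConfig object, in addition to a hub name or a dict.
generator = pipeline(
    "text-generation",
    ipu_config=IPUConfig(layers_per_ipu=[12], matmul_proportion=0.2),
)
print(generator("IPUs make text generation"))

# Dropping the pipeline triggers __del__, which releases the decoder's
# Poplar executor via model.poptorch_decoder.destroy().
del generator
```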
