Add Support For Two String Inputs (#422)
* solution with concat

* with Reshape

* successfully works on tokenizers

* add max_length truncation for pair input

* add check_tokenizer.ipynb

* corrected division by half

* use transformation instead of modifying pipeline

* add tests

* extend tests, revert benchmark workarounds

* apply comments, fix segment_ids/token_type_ids

* fix tests, run ruff format

* update pass_rates.json and stats.json

---------

Co-authored-by: Andrei Kochin <andrei.kochin@intel.com>
pavel-esir and andrei-kochin authored Mar 6, 2025
1 parent 7881453 commit 26d9676
Showing 9 changed files with 413 additions and 31 deletions.
6 changes: 4 additions & 2 deletions python/openvino_tokenizers/__init__.py
@@ -68,15 +68,17 @@ def new_fe_init(self, *args, **kwargs):
old_fe_init(self, *args, **kwargs)
self.add_extension(str(_ext_path))


def get_create_wrapper(old_create: Callable) -> Callable:
@functools.wraps(old_fe_init)
def new_create(*args, **kwargs):
op_name = args[0] if len(args) > 0 else None
if len(args) > 0 and op_name in ['StringTensorUnpack', 'StringTensorPack']:
if len(args) > 0 and op_name in ["StringTensorUnpack", "StringTensorPack"]:
msg = f"Creating {op_name} from extension is deprecated. Consider creating operation from original opset factory."
f"E.g. _get_opset_factory(\"opset15\").create(\"{op_name}\", ...)"
f'E.g. _get_opset_factory("opset15").create("{op_name}", ...)'
logger.info(msg)
return old_create(*args, **kwargs)

return new_create


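The deprecation message above points callers at the regular opset factory instead of the extension. A minimal sketch of that pattern, assuming _get_opset_factory lives in openvino_tokenizers.utils (the import path and the exact create() arguments are assumptions, not part of this diff):

from openvino import PartialShape, Type, op
from openvino_tokenizers.utils import _get_opset_factory  # assumed import path

string_input = op.Parameter(Type.string, PartialShape(["?"]))
# StringTensorUnpack is created from the opset15 factory rather than from the extension;
# it yields begins/ends/chars tensors describing the packed strings.
unpacked = _get_opset_factory("opset15").create("StringTensorUnpack", string_input.outputs())
begins, ends, chars = unpacked.outputs()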
8 changes: 8 additions & 0 deletions python/openvino_tokenizers/convert_tokenizer.py
@@ -10,6 +10,7 @@

from openvino import Model, Type
from openvino.exceptions import OVTypeError
from openvino_tokenizers.tokenizer_transformations import add_second_input

from openvino_tokenizers.constants import UTF8ReplaceMode
from openvino_tokenizers.utils import (
@@ -73,6 +74,7 @@ def convert_tokenizer(
use_sentencepiece_backend: bool = False,
utf8_replace_mode: Optional[UTF8ReplaceMode] = UTF8ReplaceMode.REPLACE,
max_length: Optional[int] = None,
number_of_inputs: int = 1,
) -> Union[Model, Tuple[Model, Model]]:
"""
Converts a given tokenizer object into an OpenVINO-compatible model.
@@ -114,6 +116,7 @@ def convert_tokenizer(
is_sentencepiece_model,
is_tiktoken_model,
)

# For some reason dataclass transforms None -> (None,)
if params.max_length and params.max_length != (None,):
tokenizer_object.model_max_length = params.max_length
@@ -140,6 +143,11 @@
if ov_tokenizers is None:
raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")

assert params.number_of_inputs in [1, 2], "Number of inputs should be 1 or 2"

if params.number_of_inputs == 2:
add_second_input(ov_tokenizers[0] if isinstance(ov_tokenizers, tuple) else ov_tokenizers)

if isinstance(ov_tokenizers, tuple):
return (
change_outputs_type(ov_tokenizers[0], params.tokenizer_output_type),
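With the new number_of_inputs argument wired through, a converted tokenizer can encode sentence pairs. A minimal usage sketch, assuming a BERT-style checkpoint; the checkpoint name and the positional call style of the compiled model are illustrative, not taken from this diff:

from transformers import AutoTokenizer
from openvino import compile_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint

# number_of_inputs=2 applies add_second_input() to the converted tokenizer model,
# so the resulting graph exposes two string inputs (e.g. question/context pairs).
ov_tokenizer = convert_tokenizer(hf_tokenizer, number_of_inputs=2)
print(len(ov_tokenizer.inputs))  # expected: 2

compiled = compile_model(ov_tokenizer)
# One batch of strings per input; outputs include token_type_ids marking the two segments.
encoded = compiled([["What is OpenVINO?"], ["OpenVINO is an inference toolkit."]])
print(encoded["input_ids"], encoded["token_type_ids"])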
78 changes: 52 additions & 26 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -277,9 +277,11 @@ def __add__(self, other: "CharsmapStep") -> "CharsmapStep":
if self.charsmap is not None and other.charsmap is not None:
raise ValueError("Cannot add two CharsmapStep instances with non-None charsmap attributes")
if (
self.normalization_form is not None and other.normalization_form is not None
and self.normalization_form != "identity" and other.normalization_form != "identity"
and self.normalization_form != other.normalization_form
self.normalization_form is not None
and other.normalization_form is not None
and self.normalization_form != "identity"
and other.normalization_form != "identity"
and self.normalization_form != other.normalization_form
):
raise ValueError("Cannot add two CharsmapStep instances with different normalization_form attributes")

@@ -293,7 +295,6 @@ def __add__(self, other: "CharsmapStep") -> "CharsmapStep":
nmt=self.nmt or other.nmt,
)


@classmethod
def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep":
return cls(charsmap=base64.b64decode(step_json["precompiled_charsmap"]))
@@ -892,10 +893,41 @@ def from_hf_json_template_postprocessor(
cls, post_processor_dict: Dict[str, Any], number_of_inputs: int = 1, add_special_tokens: bool = True
) -> "CombineSegmentsStep":
inputs: List[TokenWithTypeId] = []
if number_of_inputs == 1:
post_processor = post_processor_dict["single"]
else:
post_processor = post_processor_dict["pair"]

post_processor = post_processor_dict["single"]
pair_post_processor = post_processor_dict["pair"]

single_num_inputs = len(post_processor)
pair_num_inputs = len(pair_post_processor)
num_additional = pair_num_inputs - single_num_inputs
start_from_idx = single_num_inputs - num_additional

if number_of_inputs == 2 and not num_additional in [2] and not start_from_idx >= 0:
raise UserInputError("Only adding one additional pair for the second input is currently supported")

is_two_inputs_supported = True

# Assert that post_processor_dict for pair inputs is extended variant for single inputs
for i in range(num_additional):
pair_input = pair_post_processor[single_num_inputs + i]
single_input = post_processor[start_from_idx + i]

is_two_inputs_supported = pair_input.keys() == single_input.keys()
if not is_two_inputs_supported:
break
for key in pair_input.keys():
if key == "SpecialToken":
is_two_inputs_supported = pair_input[key]["id"] == single_input[key]["id"]

if not is_two_inputs_supported:
break

if number_of_inputs == 2 and not is_two_inputs_supported:
raise UserInputError(
f"Two inputs not supported for this post-processors "
f"single input post_processor {post_processor} "
f"and pair input post_processor {pair_post_processor}"
)

for template_dict in post_processor:
if "SpecialToken" in template_dict:
@@ -919,15 +951,13 @@ def from_hf_json_bert_postprocessor(
inputs.append(
AddToken(token=post_processor_dict["cls"][0], token_type_id=0, enabled_by_default=add_special_tokens)
)
inputs[-1].token_id = post_processor_dict["cls"][1]
inputs.append(Sequence(token_type_id=0))
inputs.append(
AddToken(token=post_processor_dict["sep"][0], token_type_id=0, enabled_by_default=add_special_tokens)
)
if number_of_inputs == 2:
inputs.append(Sequence(token_type_id=1))
inputs.append(
AddToken(token=post_processor_dict["sep"][0], token_type_id=1, enabled_by_default=add_special_tokens)
)
inputs[-1].token_id = post_processor_dict["sep"][1]

return cls(inputs, add_special_tokens=add_special_tokens)

@classmethod
@@ -1119,13 +1149,13 @@ def add_special_tokens_to_vocab(vocab: List[str, bytes], added_tokens: Dict[int,

@classmethod
def from_hf_json(
cls,
tokenizer_json: Dict[str, Any],
pipeline_vocab: Optional[List[str]],
added_tokens: Optional[List[int]] = None,
skip_tokens: Optional[List[int]] = None,
do_skip_tokens: bool = True,
is_byte_level: bool = False,
cls,
tokenizer_json: Dict[str, Any],
pipeline_vocab: Optional[List[str]],
added_tokens: Optional[List[int]] = None,
skip_tokens: Optional[List[int]] = None,
do_skip_tokens: bool = True,
is_byte_level: bool = False,
) -> "VocabDecoderStep":
model_type = tokenizer_json["model"]["type"]

@@ -1334,25 +1364,21 @@ def replace_normalization_step(step: BasePipelineStep) -> BasePipelineStep:

return step


def merge_normalization_steps(self) -> None:
self.steps = [self.replace_normalization_step(step) for step in self.steps]

charsmap_steps = [step for step in self.steps if isinstance(step, CharsmapStep)]
if len(charsmap_steps) > 1:
first_step_position = next(
idx for idx, step in enumerate(self.steps) if isinstance(step, CharsmapStep)
)
first_step_position = next(idx for idx, step in enumerate(self.steps) if isinstance(step, CharsmapStep))
steps_without_charsmaps = [step for step in self.steps if not isinstance(step, CharsmapStep)]

steps_without_charsmaps.insert(first_step_position, reduce(add, charsmap_steps))
self.steps = steps_without_charsmaps


def get_tokenizer_ov_subgraph(self) -> Model:
self.finalize()

string_inputs = [op.Parameter(Type.string, PartialShape(["?"])) for _ in range(self.number_of_inputs)]
string_inputs = [op.Parameter(Type.string, PartialShape(["?"]))]

processing_outputs = []
for input_node in string_inputs:
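For reference, a hedged illustration of the HF post-processor configs that the updated from_hf_json_template_postprocessor and from_hf_json_bert_postprocessor constructors consume (token strings and ids are example values; unrelated keys of the real tokenizer.json entries are omitted):

# TemplateProcessing: the new check accepts a "pair" template that is the "single"
# template extended by a mirrored tail for the second segment.
template_post_processor = {
    "single": [
        {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
        {"Sequence": {"id": "A", "type_id": 0}},
        {"SpecialToken": {"id": "[SEP]", "type_id": 0}},
    ],
    "pair": [
        {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
        {"Sequence": {"id": "A", "type_id": 0}},
        {"SpecialToken": {"id": "[SEP]", "type_id": 0}},
        {"Sequence": {"id": "B", "type_id": 1}},  # mirrors single[1]
        {"SpecialToken": {"id": "[SEP]", "type_id": 1}},  # mirrors single[2], same token id
    ],
}
# len(single) == 3, len(pair) == 5 -> num_additional == 2 and start_from_idx == 1, so
# pair[3]/pair[4] are compared against single[1]/single[2] and two-input mode is accepted.

# BertProcessing: "cls" and "sep" are [token string, token id] pairs; the constructor now
# records the id via inputs[-1].token_id = post_processor_dict["cls"][1] / ["sep"][1].
bert_post_processor = {"type": "BertProcessing", "cls": ["[CLS]", 101], "sep": ["[SEP]", 102]}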