Add Support For Two String Inputs (#422)
* solution with concat

* with Reshape

* successfully works on tokenizers

* add max_length truncation for pair input

* add check_tokenizer.ipynb

* corrected division by half

* use transformation instead of modifying pipeline

* add tests

* extend tests, revert benchmark workarounds

* apply comments, fix segment_ids/token_type_ids

* fix tests, run ruff format

* update pass_rates.json and stats.json

---------

Co-authored-by: Andrei Kochin <andrei.kochin@intel.com>
pavel-esir and andrei-kochin authored Mar 6, 2025
1 parent 7881453 commit 26d9676
Showing 9 changed files with 413 additions and 31 deletions.
6 changes: 4 additions & 2 deletions python/openvino_tokenizers/__init__.py
@@ -68,15 +68,17 @@ def new_fe_init(self, *args, **kwargs):
old_fe_init(self, *args, **kwargs)
self.add_extension(str(_ext_path))


def get_create_wrapper(old_create: Callable) -> Callable:
@functools.wraps(old_fe_init)
def new_create(*args, **kwargs):
op_name = args[0] if len(args) > 0 else None
if len(args) > 0 and op_name in ['StringTensorUnpack', 'StringTensorPack']:
if len(args) > 0 and op_name in ["StringTensorUnpack", "StringTensorPack"]:
msg = f"Creating {op_name} from extension is deprecated. Consider creating operation from original opset factory."
f"E.g. _get_opset_factory(\"opset15\").create(\"{op_name}\", ...)"
f'E.g. _get_opset_factory("opset15").create("{op_name}", ...)'
logger.info(msg)
return old_create(*args, **kwargs)

return new_create


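The deprecation message above points callers at the regular opset factory instead of the extension. A minimal sketch of that pattern, assuming _get_opset_factory lives in openvino_tokenizers.utils (the import path and the exact create() arguments are assumptions, not part of this diff):

from openvino import PartialShape, Type, op
from openvino_tokenizers.utils import _get_opset_factory  # assumed import path

string_input = op.Parameter(Type.string, PartialShape(["?"]))
# StringTensorUnpack is created from the opset15 factory rather than from the extension;
# it yields begins/ends/chars tensors describing the packed strings.
unpacked = _get_opset_factory("opset15").create("StringTensorUnpack", string_input.outputs())
begins, ends, chars = unpacked.outputs()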
8 changes: 8 additions & 0 deletions python/openvino_tokenizers/convert_tokenizer.py
@@ -10,6 +10,7 @@

from openvino import Model, Type
from openvino.exceptions import OVTypeError
from openvino_tokenizers.tokenizer_transformations import add_second_input

from openvino_tokenizers.constants import UTF8ReplaceMode
from openvino_tokenizers.utils import (
@@ -73,6 +74,7 @@ def convert_tokenizer(
use_sentencepiece_backend: bool = False,
utf8_replace_mode: Optional[UTF8ReplaceMode] = UTF8ReplaceMode.REPLACE,
max_length: Optional[int] = None,
number_of_inputs: int = 1,
) -> Union[Model, Tuple[Model, Model]]:
"""
Converts a given tokenizer object into an OpenVINO-compatible model.
@@ -114,6 +116,7 @@ def convert_tokenizer(
is_sentencepiece_model,
is_tiktoken_model,
)

# For some reason dataclass transforms None -> (None,)
if params.max_length and params.max_length != (None,):
tokenizer_object.model_max_length = params.max_length
@@ -140,6 +143,11 @@
if ov_tokenizers is None:
raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")

assert params.number_of_inputs in [1, 2], "Number of inputs should be 1 or 2"

if params.number_of_inputs == 2:
add_second_input(ov_tokenizers[0] if isinstance(ov_tokenizers, tuple) else ov_tokenizers)

if isinstance(ov_tokenizers, tuple):
return (
change_outputs_type(ov_tokenizers[0], params.tokenizer_output_type),
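With the new number_of_inputs argument wired through, a converted tokenizer can encode sentence pairs. A minimal usage sketch, assuming a BERT-style checkpoint; the checkpoint name and the positional call style of the compiled model are illustrative, not taken from this diff:

from transformers import AutoTokenizer
from openvino import compile_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint

# number_of_inputs=2 applies add_second_input() to the converted tokenizer model,
# so the resulting graph exposes two string inputs (e.g. question/context pairs).
ov_tokenizer = convert_tokenizer(hf_tokenizer, number_of_inputs=2)
print(len(ov_tokenizer.inputs))  # expected: 2

compiled = compile_model(ov_tokenizer)
# One batch of strings per input; outputs include token_type_ids marking the two segments.
encoded = compiled([["What is OpenVINO?"], ["OpenVINO is an inference toolkit."]])
print(encoded["input_ids"], encoded["token_type_ids"])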
78 changes: 52 additions & 26 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -277,9 +277,11 @@ def __add__(self, other: "CharsmapStep") -> "CharsmapStep":
if self.charsmap is not None and other.charsmap is not None:
raise ValueError("Cannot add two CharsmapStep instances with non-None charsmap attributes")
if (
self.normalization_form is not None and other.normalization_form is not None
and self.normalization_form != "identity" and other.normalization_form != "identity"
and self.normalization_form != other.normalization_form
self.normalization_form is not None
and other.normalization_form is not None
and self.normalization_form != "identity"
and other.normalization_form != "identity"
and self.normalization_form != other.normalization_form
):
raise ValueError("Cannot add two CharsmapStep instances with different normalization_form attributes")

@@ -293,7 +295,6 @@ def __add__(self, other: "CharsmapStep") -> "CharsmapStep":
nmt=self.nmt or other.nmt,
)


@classmethod
def from_hf_step_json(cls, step_json: Dict[str, Any]) -> "CharsmapStep":
return cls(charsmap=base64.b64decode(step_json["precompiled_charsmap"]))
@@ -892,10 +893,41 @@ def from_hf_json_template_postprocessor(
cls, post_processor_dict: Dict[str, Any], number_of_inputs: int = 1, add_special_tokens: bool = True
) -> "CombineSegmentsStep":
inputs: List[TokenWithTypeId] = []
if number_of_inputs == 1:
post_processor = post_processor_dict["single"]
else:
post_processor = post_processor_dict["pair"]

post_processor = post_processor_dict["single"]
pair_post_processor = post_processor_dict["pair"]

single_num_inputs = len(post_processor)
pair_num_inputs = len(pair_post_processor)
num_additional = pair_num_inputs - single_num_inputs
start_from_idx = single_num_inputs - num_additional

if number_of_inputs == 2 and not num_additional in [2] and not start_from_idx >= 0:
raise UserInputError("Only adding one additional pair for the second input is currently supported")

is_two_inputs_supported = True

# Assert that post_processor_dict for pair inputs is extended variant for single inputs
for i in range(num_additional):
pair_input = pair_post_processor[single_num_inputs + i]
single_input = post_processor[start_from_idx + i]

is_two_inputs_supported = pair_input.keys() == single_input.keys()
if not is_two_inputs_supported:
break
for key in pair_input.keys():
if key == "SpecialToken":
is_two_inputs_supported = pair_input[key]["id"] == single_input[key]["id"]

if not is_two_inputs_supported:
break

if number_of_inputs == 2 and not is_two_inputs_supported:
raise UserInputError(
f"Two inputs not supported for this post-processors "
f"single input post_processor {post_processor} "
f"and pair input post_processor {pair_post_processor}"
)

for template_dict in post_processor:
if "SpecialToken" in template_dict:
@@ -919,15 +951,13 @@ def from_hf_json_bert_postprocessor(
inputs.append(
AddToken(token=post_processor_dict["cls"][0], token_type_id=0, enabled_by_default=add_special_tokens)
)
inputs[-1].token_id = post_processor_dict["cls"][1]
inputs.append(Sequence(token_type_id=0))
inputs.append(
AddToken(token=post_processor_dict["sep"][0], token_type_id=0, enabled_by_default=add_special_tokens)
)
if number_of_inputs == 2:
inputs.append(Sequence(token_type_id=1))
inputs.append(
AddToken(token=post_processor_dict["sep"][0], token_type_id=1, enabled_by_default=add_special_tokens)
)
inputs[-1].token_id = post_processor_dict["sep"][1]

return cls(inputs, add_special_tokens=add_special_tokens)

@classmethod
@@ -1119,13 +1149,13 @@ def add_special_tokens_to_vocab(vocab: List[str, bytes], added_tokens: Dict[int,

@classmethod
def from_hf_json(
cls,
tokenizer_json: Dict[str, Any],
pipeline_vocab: Optional[List[str]],
added_tokens: Optional[List[int]] = None,
skip_tokens: Optional[List[int]] = None,
do_skip_tokens: bool = True,
is_byte_level: bool = False,
cls,
tokenizer_json: Dict[str, Any],
pipeline_vocab: Optional[List[str]],
added_tokens: Optional[List[int]] = None,
skip_tokens: Optional[List[int]] = None,
do_skip_tokens: bool = True,
is_byte_level: bool = False,
) -> "VocabDecoderStep":
model_type = tokenizer_json["model"]["type"]

@@ -1334,25 +1364,21 @@ def replace_normalization_step(step: BasePipelineStep) -> BasePipelineStep:

return step


def merge_normalization_steps(self) -> None:
self.steps = [self.replace_normalization_step(step) for step in self.steps]

charsmap_steps = [step for step in self.steps if isinstance(step, CharsmapStep)]
if len(charsmap_steps) > 1:
first_step_position = next(
idx for idx, step in enumerate(self.steps) if isinstance(step, CharsmapStep)
)
first_step_position = next(idx for idx, step in enumerate(self.steps) if isinstance(step, CharsmapStep))
steps_without_charsmaps = [step for step in self.steps if not isinstance(step, CharsmapStep)]

steps_without_charsmaps.insert(first_step_position, reduce(add, charsmap_steps))
self.steps = steps_without_charsmaps


def get_tokenizer_ov_subgraph(self) -> Model:
self.finalize()

string_inputs = [op.Parameter(Type.string, PartialShape(["?"])) for _ in range(self.number_of_inputs)]
string_inputs = [op.Parameter(Type.string, PartialShape(["?"]))]

processing_outputs = []
for input_node in string_inputs:
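For reference, a hedged illustration of the HF post-processor configs that the updated from_hf_json_template_postprocessor and from_hf_json_bert_postprocessor constructors consume (token strings and ids are example values; unrelated keys of the real tokenizer.json entries are omitted):

# TemplateProcessing: the new check accepts a "pair" template that is the "single"
# template extended by a mirrored tail for the second segment.
template_post_processor = {
    "single": [
        {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
        {"Sequence": {"id": "A", "type_id": 0}},
        {"SpecialToken": {"id": "[SEP]", "type_id": 0}},
    ],
    "pair": [
        {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
        {"Sequence": {"id": "A", "type_id": 0}},
        {"SpecialToken": {"id": "[SEP]", "type_id": 0}},
        {"Sequence": {"id": "B", "type_id": 1}},  # mirrors single[1]
        {"SpecialToken": {"id": "[SEP]", "type_id": 1}},  # mirrors single[2], same token id
    ],
}
# len(single) == 3, len(pair) == 5 -> num_additional == 2 and start_from_idx == 1, so
# pair[3]/pair[4] are compared against single[1]/single[2] and two-input mode is accepted.

# BertProcessing: "cls" and "sep" are [token string, token id] pairs; the constructor now
# records the id via inputs[-1].token_id = post_processor_dict["cls"][1] / ["sep"][1].
bert_post_processor = {"type": "BertProcessing", "cls": ["[CLS]", 101], "sep": ["[SEP]", 102]}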