Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wav2 vec2 phoneme ctc tokenizer optimisation #16817

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https://
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
for more information.

#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)

**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**

### Develop on Windows

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def __init__(
self.phonemizer_lang = phonemizer_lang
self.phonemizer_backend = phonemizer_backend

if do_phonemize:
self.init_backend(self.phonemizer_lang)

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
Expand All @@ -169,6 +172,18 @@ def vocab_size(self) -> int:
def get_vocab(self) -> Dict:
    """
    Return the complete vocabulary as a freshly built dictionary.

    The result contains every entry of `self.encoder` plus every entry of `self.added_tokens_encoder`; when a key
    is present in both, the value from `self.added_tokens_encoder` wins. Neither source mapping is modified.
    """
    base_vocab = self.encoder
    extra_tokens = self.added_tokens_encoder
    return dict(base_vocab, **extra_tokens)

def init_backend(self, phonemizer_lang: str):
    """
    Create the phonemizer backend for a language and store it on `self.backend`.

    The backend class is looked up in phonemizer's `BACKENDS` registry under `self.phonemizer_backend` and is
    instantiated with `language_switch="remove-flags"`.

    Args:
        phonemizer_lang (`str`): The language to be used.
    """
    # `phonemizer` is an optional dependency: check for it first, then import it lazily.
    requires_backends(self, "phonemizer")
    from phonemizer.backend import BACKENDS

    backend_cls = BACKENDS[self.phonemizer_backend]
    self.backend = backend_cls(phonemizer_lang, language_switch="remove-flags")

def prepare_for_tokenization(
self,
text: str,
Expand Down Expand Up @@ -209,6 +224,7 @@ def prepare_for_tokenization(
# set the correct phonemizer language
if phonemizer_lang is not None:
self.phonemizer_lang = phonemizer_lang
self.init_backend(phonemizer_lang)

return (text, {})

Expand All @@ -234,23 +250,20 @@ def _tokenize(self, text, **kwargs):
return tokens

def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
    """
    Convert `text` into a phoneme string using the cached phonemizer backend.

    Args:
        text (`str`): The text to phonemize.
        phonemizer_lang (`str`, *optional*):
            The language to phonemize with. Defaults to `self.phonemizer_lang`. When it differs from the
            language of the current backend, the backend is rebuilt for that language and
            `self.phonemizer_lang` is updated to match.

    Returns:
        `str`: The phonemes for `text`, with leading and trailing whitespace stripped.
    """
    from phonemizer.separator import Separator

    word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""

    target_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang
    # (Re)build the backend when none exists yet (e.g. it was not created at construction time) or when a
    # different language is requested.
    if getattr(self, "backend", None) is None or target_lang != self.phonemizer_lang:
        self.init_backend(target_lang)
        # Keep the recorded language in sync with the live backend; otherwise a later call that relies on
        # `self.phonemizer_lang` would silently reuse a backend built for another language.
        self.phonemizer_lang = target_lang

    separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
    # The backend works on a list of utterances; wrap the single text and unwrap the single result.
    phonemes = self.backend.phonemize(
        [text],
        separator=separator,
    )
    return phonemes[0].strip()

Expand Down