From ec7f5aa93ddd97081cec214c0d3769d132ec8619 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 13 Apr 2022 16:04:04 +0200 Subject: [PATCH 1/6] Solved href rendering issue in heading Markdown references in headings such as '####' don't render well. Replaced it with

... banners. --- CONTRIBUTING.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c0af3fbaa64c..2be17542f494 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https:// Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for more information. -#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md) - +

This guide was heavily inspired by the awesome scikit-learn guide to contributing

### Develop on Windows From c29ef894d39daf18dbf60e09d98a619daea27594 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 13 Apr 2022 16:14:41 +0200 Subject: [PATCH 2/6] PhonemeTokenizer optimization using phonemizer lib The backend should only be initialized once, otherwise it is reloaded. Added `init_backend` function, initializes a backend attribute. Phonemize re-uses self.backend. Should give ~10 times faster phonemization. --- .../tokenization_wav2vec2_phoneme.py | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index b37f902c346b..1f3d602e7de3 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -158,6 +158,9 @@ def __init__( self.phonemizer_lang = phonemizer_lang self.phonemizer_backend = phonemizer_backend + if do_phonemize: + self.init_backend(self.phonemizer_lang) + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -169,6 +172,17 @@ def vocab_size(self) -> int: def get_vocab(self) -> Dict: return dict(self.encoder, **self.added_tokens_encoder) + def init_backend(self, phonemizer_lang: str): + """Initiaizes the backend + + Args: + phonemizer_lang (str): language to be used + """ + requires_backends(self, "phonemizer") + from phonemizer.backend import BACKENDS + + self.backend = BACKENDS[self.phonemizer_backend](phonemizer_lang, language_switch="remove-flags") + def prepare_for_tokenization( self, text: str, @@ -209,6 +223,7 @@ def prepare_for_tokenization( # set the correct phonemizer language if phonemizer_lang is not None: self.phonemizer_lang = phonemizer_lang + self.init_backend(phonemizer_lang) return (text, {}) @@ -234,23 +249,20 @@ def 
_tokenize(self, text, **kwargs): return tokens def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str: - requires_backends(self, "phonemizer") - - from phonemizer import phonemize from phonemizer.separator import Separator word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else "" - phonemizer_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang + if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang: + self.init_backend(phonemizer_lang) + else: + phonemizer_lang = self.phonemizer_lang separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="") - phonemes = phonemize( - text, - language=phonemizer_lang, - backend=self.phonemizer_backend, + phonemes = self.backend.phonemize( + [text], separator=separator, - language_switch="remove-flags", ) - phonemes = phonemes.strip() + phonemes = phonemes[0].strip() return phonemes From 0050f42ea8d03ded9fd908347ba8918550c68eff Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 13 Apr 2022 16:21:49 +0200 Subject: [PATCH 3/6] formatted file with make style --- .../models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index 1f3d602e7de3..b02ea3196da9 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -173,10 +173,10 @@ def get_vocab(self) -> Dict: return dict(self.encoder, **self.added_tokens_encoder) def init_backend(self, phonemizer_lang: str): - """Initiaizes the backend + """Initiaizes the backend Args: - phonemizer_lang (str): language to be used + phonemizer_lang (str): language to be used """ requires_backends(self, "phonemizer") from 
phonemizer.backend import BACKENDS From 91dcab948c2a948a945d4b6134b9612ce55e59d9 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 19 Apr 2022 00:50:36 +0200 Subject: [PATCH 4/6] Documentation suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index b02ea3196da9..023d5436b86d 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -173,7 +173,8 @@ def get_vocab(self) -> Dict: return dict(self.encoder, **self.added_tokens_encoder) def init_backend(self, phonemizer_lang: str): - """Initiaizes the backend + """ + Initializes the backend. Args: phonemizer_lang (str): language to be used From ad58a533c919ec0ece63d0755bbe1c7a633d9f7e Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 19 Apr 2022 00:51:56 +0200 Subject: [PATCH 5/6] Update /tokenization_wav2vec2_phoneme.py based on PR suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index 023d5436b86d..6bd355645e5a 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -177,7 +177,7 @@ def init_backend(self, phonemizer_lang: str): Initializes the backend. 
Args: - phonemizer_lang (str): language to be used + phonemizer_lang (`str`): The language to be used. """ requires_backends(self, "phonemizer") from phonemizer.backend import BACKENDS From 8381b7c82564520a4351471eba3f88678b380125 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 19 Apr 2022 08:50:11 +0200 Subject: [PATCH 6/6] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2be17542f494..e74510948a9c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -368,7 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https:// Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for more information. -

This guide was heavily inspired by the awesome scikit-learn guide to contributing

+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).** ### Develop on Windows