Merge branch 'ppdiffuesers_ldm_to_original' of https://github.com/JunnYu/PaddleNLP into ppdiffuesers_ldm_to_original
JunnYu committed Nov 18, 2022
2 parents e854d56 + be8fe3a commit fc0b22b
Showing 8 changed files with 164 additions and 3 deletions.
1 change: 1 addition & 0 deletions fast_tokenizer/docs/cpp/README.md
@@ -63,4 +63,5 @@ fast_tokenizer
FastTokenizer currently provides the following C++ usage examples.

[ErnieFastTokenizer C++示例](../../examples/ernie/)

[ClipFastTokenizer C++示例](../../examples/clip/)
Empty file.
8 changes: 7 additions & 1 deletion fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc
@@ -195,11 +195,17 @@ void BindPreTokenizers(pybind11::module* m) {
              pretokenizer_ptr =
                  py_pretokenizer
                      .cast<pretokenizers::ByteLevelPreTokenizer*>();
            } else if (py::type::of(py_pretokenizer)
                           .is(py::type::of<
                               pretokenizers::SplitPreTokenizer>())) {
              pretokenizer_ptr =
                  py_pretokenizer.cast<pretokenizers::SplitPreTokenizer*>();
            } else {
              throw py::value_error(
                  "Type of normalizers should be one of `BertPreTokenizer`,"
                  " `MetaSpacePreTokenizer`, `SequencePreTokenizer`,"
                  " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`");
                  " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`, "
                  "`SplitPreTokenizer`");
            }
            pretokenizers.push_back(pretokenizer_ptr);
          }
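The new branch lets a `SplitPreTokenizer` constructed on the Python side be recognized when it is passed in a pretokenizer list (e.g. to `SequencePreTokenizer`); previously such an object would fall through to the `value_error` above. Below is a minimal sketch of the now-supported call, mirroring the pattern used by the `ClipFastTokenizer` added in this commit; the regex is illustrative only.

```python
# Sketch of the Python-side usage this binding enables; the split mode and
# invert flag mirror ClipFastTokenizer, but the regex here is only an example.
from fast_tokenizer import SplitMode
from fast_tokenizer.pretokenizers import (ByteLevelPreTokenizer,
                                           SequencePreTokenizer,
                                           SplitPreTokenizer)

pretok = SequencePreTokenizer([
    SplitPreTokenizer(r"[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+",
                      split_mode=SplitMode.REMOVED,
                      invert=True),
    ByteLevelPreTokenizer(add_prefix_space=False),
])
```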
2 changes: 1 addition & 1 deletion fast_tokenizer/python/fast_tokenizer/__init__.py
@@ -550,4 +550,4 @@ def get_thread_num():
from . import pretokenizers
from . import postprocessors
from . import decoders
from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer
from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer, ClipFastTokenizer
1 change: 1 addition & 0 deletions fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py
@@ -15,3 +15,4 @@
from .base_tokenizer import BaseFastTokenizer
from .ernie import ErnieFastTokenizer
from .sentencepiece_bpe import SentencePieceBPEFastTokenizer
from .clip import ClipFastTokenizer
99 changes: 99 additions & 0 deletions fast_tokenizer/python/fast_tokenizer/tokenizers_impl/clip.py
@@ -0,0 +1,99 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .base_tokenizer import BaseFastTokenizer

from fast_tokenizer.normalizers import NFCNormalizer, ReplaceNormalizer, LowercaseNormalizer, SequenceNormalizer
from fast_tokenizer.pretokenizers import SplitPreTokenizer, ByteLevelPreTokenizer, SequencePreTokenizer
from fast_tokenizer.models import BPE
from fast_tokenizer.postprocessors import RobertaPostProcessor
from fast_tokenizer import Tokenizer, SplitMode

__all__ = ['ClipFastTokenizer']


class ClipFastTokenizer(BaseFastTokenizer):

    def __init__(self,
                 vocab=None,
                 merges=None,
                 max_length=None,
                 unk_token="<|endoftext|>",
                 pad_token="<|endoftext|>",
                 bos_token="<|startoftext|>",
                 eos_token="<|endoftext|>",
                 add_prefix_space=False,
                 continuing_subword_prefix="",
                 end_of_word_suffix="</w>",
                 trim_offsets=False):
        # Init Tokenizer instance using tokenization model
        tokenizer = Tokenizer(
            BPE(vocab,
                merges,
                unk_token=unk_token,
                continuing_subword_prefix=continuing_subword_prefix,
                end_of_word_suffix=end_of_word_suffix,
                fuse_unk=False))

        # Add special tokens
        bos_token_id = 0
        eos_token_id = 1
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(bos_token)) is not None:
            bos_token_id = tokenizer.token_to_id(str(bos_token))
            tokenizer.add_special_tokens([str(bos_token)])
        if tokenizer.token_to_id(str(eos_token)) is not None:
            eos_token_id = tokenizer.token_to_id(str(eos_token))
            tokenizer.add_special_tokens([str(eos_token)])

        # Set the normalizer
        tokenizer.normalizer = SequenceNormalizer([
            NFCNormalizer(),
            ReplaceNormalizer(r"\s+", " "),
            LowercaseNormalizer()
        ])

        # Set the pretokenizer
        tokenizer.pretokenizer = SequencePreTokenizer([
            SplitPreTokenizer(
                r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
                split_mode=SplitMode.REMOVED,
                invert=True),
            ByteLevelPreTokenizer(add_prefix_space=False)
        ])

        # Set the postprocessor
        tokenizer.postprocessor = RobertaPostProcessor(
            sep=(eos_token, eos_token_id),
            cls=(bos_token, bos_token_id),
            trim_offsets=False,
            add_prefix_space=False)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "pad_token": pad_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
            "add_prefix_space": add_prefix_space,
            "max_length": max_length,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets
        }
        super().__init__(tokenizer, parameters)
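Because the class is re-exported from the package `__init__.py` files above, it can be imported directly from `fast_tokenizer`. A short usage sketch follows, assuming a CLIP `vocab.json` and `merges.txt` are already on disk; the unit test below shows where they can be downloaded from.

```python
# Usage sketch; the local file names are assumptions, and the test below
# downloads the actual CLIP vocab/merges files from BOS.
from fast_tokenizer import ClipFastTokenizer, models

vocab, merges = models.BPE.read_file("vocab.json", "merges.txt")
tokenizer = ClipFastTokenizer(vocab, merges)

result = tokenizer.encode("a photo of a cat")
print(result.tokens)  # byte-level BPE tokens wrapped in <|startoftext|> / <|endoftext|>
print(result.ids)
```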
54 changes: 54 additions & 0 deletions fast_tokenizer/python/tests/test_clip_tokenizer.py
@@ -0,0 +1,54 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import os
import unittest

import fast_tokenizer
from fast_tokenizer import ClipFastTokenizer, models
from paddlenlp.utils.downloader import get_path_from_url


class TestClipFastTokenizer(unittest.TestCase):

    def setUp(self):
        vocab_path = os.path.join(os.getcwd(), "vocab.json")
        merges_path = os.path.join(os.getcwd(), "merges.txt")
        if not os.path.exists(vocab_path):
            get_path_from_url(
                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/vocab.json",
                os.getcwd())
        if not os.path.exists(merges_path):
            get_path_from_url(
                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/merges.txt",
                os.getcwd())
        vocab, merges = models.BPE.read_file(vocab_path, merges_path)
        self.tokenizer = ClipFastTokenizer(vocab, merges)
        self.expected_ids = [
            49406, 320, 1342, 272, 272, 335, 273, 273, 274, 16368, 13439, 2971,
            748, 531, 13610, 323, 1896, 8445, 323, 539, 320, 2368, 49407
        ]
        self.expected_tokens = [
            "<|startoftext|>", "a</w>", "'ll</w>", "1</w>", "1</w>", "p</w>",
            "2</w>", "2</w>", "3</w>", "rf</w>", "âĺĨ</w>", "ho</w>", "!!</w>",
            "to</w>", "?'</w>", "d</w>", "'d</w>", "''</w>", "d</w>", "of</w>",
            "a</w>", "cat</w>", "<|endoftext|>"
        ]
        self.input_text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"

    def test_encode(self):
        result = self.tokenizer.encode(self.input_text)
        self.assertEqual(result.tokens, self.expected_tokens)
        self.assertEqual(result.ids, self.expected_ids)
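The test module does not define a `__main__` entry point; one way to run it on its own is through `unittest` discovery. This is only a sketch, and the start directory is an assumption about the checkout layout.

```python
# Sketch: run the new test via unittest discovery. The start directory is an
# assumption about where test_clip_tokenizer.py lives in the checkout.
import unittest

suite = unittest.defaultTestLoader.discover(
    "fast_tokenizer/python/tests", pattern="test_clip_tokenizer.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```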
2 changes: 1 addition & 1 deletion fast_tokenizer/run_build_py_lib.sh
@@ -33,7 +33,7 @@ do
echo "Compile with $core_num cores"
cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
make -j${core_num}
if [[$? == 0]];
if [[ $? == 0 ]];
then
echo "Successfully compile."
else