@@ -496,150 +496,4 @@ def tokenize(self, text):
                 output_tokens.append(self.unk_token)
             else:
                 output_tokens.extend(sub_tokens)
-        return output_tokens
-
-class DistilBertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = DistilBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
+        return output_tokens
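
For context on the removed `__init__`: it loads the backend tokenizer's serialized normalizer state and rewrites it whenever the Python-level arguments (`do_lower_case`, `strip_accents`, `tokenize_chinese_chars`) disagree with it. Below is a minimal standalone sketch of that idea against the `tokenizers` library directly, assuming a hypothetical `tokenizer.json` file and syncing only the `lowercase` flag; it is an illustration of the pattern, not the library's own API surface beyond the calls shown.

```python
import json

from tokenizers import Tokenizer, normalizers

# Hypothetical serialized fast-tokenizer file; the path is illustrative only.
backend = Tokenizer.from_file("tokenizer.json")

do_lower_case = False  # the value we want the backend normalizer to use

# As in the removed code, the normalizer serializes to a JSON string via __getstate__().
state = json.loads(backend.normalizer.__getstate__())
if state.get("lowercase", do_lower_case) != do_lower_case:
    # "type" names the normalizer class, e.g. "BertNormalizer" for BERT-style tokenizers.
    normalizer_class = getattr(normalizers, state.pop("type"))
    state["lowercase"] = do_lower_case
    backend.normalizer = normalizer_class(**state)
```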
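Likewise, the two `# Copied from ... BertTokenizerFast` helpers removed above implement the standard BERT input layout. A quick sketch of what they compute, using hypothetical WordPiece IDs and the conventional `[CLS]`/`[SEP]` IDs 101/102 (the real values come from the vocabulary file); note that because `model_input_names` is only `["input_ids", "attention_mask"]`, DistilBERT models do not actually consume the token type IDs.

```python
# Hypothetical WordPiece IDs for two short sequences.
token_ids_0 = [2023, 2003, 1037, 3231]
token_ids_1 = [2178, 6251]

cls_id, sep_id = 101, 102  # assumed [CLS]/[SEP] IDs; depend on the vocab file

# build_inputs_with_special_tokens: [CLS] A [SEP] B [SEP]
input_ids = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]

# create_token_type_ids_from_sequences: 0s over "[CLS] A [SEP]", 1s over "B [SEP]"
token_type_ids = [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)

print(input_ids)       # [101, 2023, 2003, 1037, 3231, 102, 2178, 6251, 102]
print(token_type_ids)  # [0, 0, 0, 0, 0, 0, 1, 1, 1]
```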