Remove boost library. #3215

Merged: 3 commits, Sep 8, 2022
13 changes: 5 additions & 8 deletions faster_tokenizer/CMakeLists.txt
@@ -102,15 +102,15 @@ endforeach()

 ELSE(WIN32)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
-  IF (LINUX)
+  IF (NOT APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ldl -lpthread")
   ENDIF()
   set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS})
 ENDIF(WIN32)

 # For OpenMP
 # openmp not support well for now on windows
-if (LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
   find_package(OpenMP)
   if (OPENMP_FOUND)
     add_definitions(-DWITH_OMP)
@@ -143,7 +143,7 @@ if(WITH_PYTHON)

 add_subdirectory(python)

-if(LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
   add_custom_target(build_tokenizers_bdist_wheel ALL
     COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel --plat-name=manylinux1_x86_64
     COMMENT "Packing whl packages------>>>"
@@ -168,6 +168,8 @@ file(COPY ${PROJECT_SOURCE_DIR}/FasterTokenizer.cmake DESTINATION ${CPP_PACKAGE_
 # copy headers
 file(COPY ${PROJECT_SOURCE_DIR}/faster_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/faster_tokenizer/
   FILES_MATCHING PATTERN "*.h"
+  PATTERN "test" EXCLUDE
+  PATTERN "demo" EXCLUDE
   PATTERN "pybind" EXCLUDE)

 add_custom_target(copy_third_party_headers ALL
@@ -177,11 +179,6 @@ add_custom_target(copy_third_party_headers ALL
     ${CPP_PACKAGE_DIR}/third_party/include
   DEPENDS build_cpp_package_dir)

-add_custom_target(copy_boost_headers ALL
-  COMMAND ${CMAKE_COMMAND} -E copy_directory
-    ${BOOST_INCLUDE_DIR}/boost ${CPP_PACKAGE_DIR}/third_party/include/boost
-  DEPENDS build_cpp_package_dir)
-
 # copy library
 set(TOKENIZER_CORE_NAME "core_tokenizers")
 set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/faster_tokenizer)
49 changes: 0 additions & 49 deletions faster_tokenizer/cmake/external/boost.cmake

This file was deleted.

5 changes: 3 additions & 2 deletions faster_tokenizer/cmake/third_party.cmake
@@ -18,11 +18,12 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")

 include(external/icu)
-include(external/gtest)
+if(WITH_TESTING)
+  include(external/gtest)
+endif()
 include(external/gflags)
 include(external/glog)
 include(external/re2)
-include(external/boost)
 include(external/nlohmann_json)
 include(external/dart) # For trie
 if (WITH_PYTHON)
1 change: 0 additions & 1 deletion faster_tokenizer/faster_tokenizer/core/CMakeLists.txt
@@ -1,4 +1,3 @@
 cc_library(added_vocabulary SRCS added_vocabulary.cc DEPS normalizers pretokenizers json)
 cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors)
 cc_library(core SRCS encoding.cc DEPS json)
-add_dependencies(tokenizer extern_boost)
14 changes: 7 additions & 7 deletions faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -163,7 +163,7 @@ bool Tokenizer::DoPreTokenize(
   return true;
 }

-struct InputStringVisitor : public boost::static_visitor<> {
+struct InputStringVisitor {
   InputStringVisitor(const Tokenizer* tokenizer,
                      uint32_t type_id,
                      OffsetType offset_type,
@@ -190,8 +190,8 @@ void Tokenizer::EncodeSingleString(const InputString& input_string,
                                    uint32_t type_id,
                                    OffsetType offset_type,
                                    Encoding* encodings) const {
-  boost::apply_visitor(
-      InputStringVisitor(this, type_id, offset_type, encodings), input_string);
+  paddlenlp::visit(InputStringVisitor(this, type_id, offset_type, encodings),
+                   input_string);
 }

 void Tokenizer::PostProcess(Encoding* encoding,
@@ -234,13 +234,13 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input,
                                   bool add_special_tokens) const {
   Encoding encoding;
   if (encode_input.type() == typeid(InputString)) {
-    const auto& input_string = boost::get<InputString>(encode_input);
+    const auto& input_string = paddlenlp::get<InputString>(encode_input);
     EncodeSingleString(input_string, 0, OffsetType::CHAR, &encoding);
     PostProcess(&encoding, nullptr, add_special_tokens, encodings);
   } else {
     Encoding pair_encoding;
     const auto& input_string_pair =
-        boost::get<std::pair<InputString, InputString>>(encode_input);
+        paddlenlp::get<std::pair<InputString, InputString>>(encode_input);
     EncodeSingleString(input_string_pair.first, 0, OffsetType::CHAR, &encoding);
     EncodeSingleString(
         input_string_pair.second, 1, OffsetType::CHAR, &pair_encoding);
@@ -273,9 +273,9 @@ void Tokenizer::EncodeBatchStrings(
 void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input,
                                              Encoding* encodings,
                                              bool add_special_tokens) const {
-  const auto& input_string = boost::get<InputString>(&encode_input);
+  const auto& input_string = paddlenlp::get_if<InputString>(&encode_input);
   const auto& input_string_pair =
-      boost::get<std::pair<InputString, InputString>>(&encode_input);
+      paddlenlp::get_if<std::pair<InputString, InputString>>(&encode_input);
   Encoding encoding;
   Encoding pair_encoding;
   if (input_string != nullptr) {
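Aside: the InputStringVisitor change above is the visitor-migration pattern used throughout this PR. The boost::static_visitor<> base class is dropped (the replacement visit deduces the result type from operator()), and boost::apply_visitor(v, x) becomes paddlenlp::visit(v, x). Below is a minimal self-contained sketch of the idiom, assuming only that faster_tokenizer/utils/variant.h provides the std::variant-style free functions seen in this diff; the SequenceCounter visitor is illustrative, not code from the PR.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

#include "faster_tokenizer/utils/variant.h"  // vendored variant added by this PR

// A plain callable struct: no boost::static_visitor<> base is needed,
// because paddlenlp::visit deduces the return type from operator().
struct SequenceCounter {
  size_t operator()(const std::string&) const { return 1; }
  size_t operator()(const std::vector<std::string>& v) const { return v.size(); }
};

int main() {
  paddlenlp::variant<std::string, std::vector<std::string>> input =
      std::vector<std::string>{"faster", "tokenizer"};
  // Before: boost::apply_visitor(SequenceCounter{}, input);
  std::cout << paddlenlp::visit(SequenceCounter{}, input) << "\n";  // prints 2
}
```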
6 changes: 3 additions & 3 deletions faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "faster_tokenizer/core/added_vocabulary.h"
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
-#include "boost/variant.hpp"
+#include "faster_tokenizer/utils/variant.h"
 #include "nlohmann/json.hpp"

 namespace paddlenlp {
@@ -56,9 +56,9 @@ namespace core {
 class AddedVocabulary;
 class Encoding;

-using InputString = boost::variant<std::string, std::vector<std::string>>;
+using InputString = paddlenlp::variant<std::string, std::vector<std::string>>;
 using EncodeInput =
-    boost::variant<InputString, std::pair<InputString, InputString>>;
+    paddlenlp::variant<InputString, std::pair<InputString, InputString>>;

 class FASTERTOKENIZER_DECL Tokenizer {
  public:
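Aside: the tokenizer.cc hunks use two access forms that mirror Boost's. paddlenlp::get<T>(v) takes a variant reference and is checked access to a known alternative (replacing boost::get<T>(v)), while paddlenlp::get_if<T>(&v) takes a variant pointer and returns nullptr on a type mismatch (replacing the pointer overload boost::get<T>(&v)). A sketch against the InputString/EncodeInput aliases defined above; the helper functions are hypothetical, and only the aliases and the get/get_if calls come from this diff.

```cpp
#include <string>
#include <utility>
#include <vector>

#include "faster_tokenizer/utils/variant.h"

using InputString = paddlenlp::variant<std::string, std::vector<std::string>>;
using EncodeInput =
    paddlenlp::variant<InputString, std::pair<InputString, InputString>>;

// Hypothetical helper: does this input hold a pair of sequences?
bool IsPair(const EncodeInput& input) {
  // Pointer form: nullptr on type mismatch instead of throwing.
  return paddlenlp::get_if<InputString>(&input) == nullptr;
}

// Hypothetical helper: fetch the first sequence either way.
const InputString& FirstSequence(const EncodeInput& input) {
  if (const auto* single = paddlenlp::get_if<InputString>(&input)) {
    return *single;
  }
  // Reference form: safe here because the other alternative was ruled out.
  return paddlenlp::get<std::pair<InputString, InputString>>(input).first;
}
```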
1 change: 0 additions & 1 deletion faster_tokenizer/faster_tokenizer/decoders/CMakeLists.txt
@@ -1,2 +1 @@
 cc_library(decoders SRCS wordpiece.cc DEPS json utils)
-add_dependencies(decoders extern_boost)
2 changes: 1 addition & 1 deletion faster_tokenizer/faster_tokenizer/models/CMakeLists.txt
@@ -1,3 +1,3 @@
 cc_library(models
   SRCS wordpiece.cc faster_wordpiece.cc bpe.cc unigram.cc
-  DEPS core json boost trie failure icuuc icudata lattice utils)
+  DEPS core json trie failure icuuc icudata lattice utils)
2 changes: 1 addition & 1 deletion faster_tokenizer/faster_tokenizer/postprocessors/CMakeLists.txt

@@ -1 +1 @@
-cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json boost)
+cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json)
34 changes: 18 additions & 16 deletions faster_tokenizer/faster_tokenizer/postprocessors/template.cc
@@ -16,8 +16,8 @@
 #include <string>

 #include "faster_tokenizer/core/encoding.h"
-#include "glog/logging.h"
 #include "faster_tokenizer/postprocessors/template.h"
+#include "glog/logging.h"

 namespace paddlenlp {
 namespace faster_tokenizer {
@@ -27,7 +27,7 @@ void ParseIdFromString(const std::string& template_id_string,
                        TemplatePiece* template_piece) {
   if (template_id_string.find_first_of("$") == 0) {
     *template_piece = TemplateSequence();
-    auto& seq = boost::get<TemplateSequence>(*template_piece);
+    auto& seq = paddlenlp::get<TemplateSequence>(*template_piece);
     std::string rest =
         template_id_string.substr(template_id_string.find_first_not_of("$"));
     if (rest == "" || rest == "A" || rest == "a") {
@@ -48,15 +48,16 @@
     }
   } else {
     *template_piece = TemplateSpecialToken();
-    boost::get<TemplateSpecialToken>(*template_piece) = {template_id_string, 0};
+    paddlenlp::get<TemplateSpecialToken>(*template_piece) = {template_id_string,
+                                                             0};
   }
 }

 void SetTypeId(uint32_t type_id, TemplatePiece* template_piece) {
-  if (boost::get<TemplateSequence>(template_piece) != nullptr) {
-    boost::get<TemplateSequence>(*template_piece).second = type_id;
+  if (paddlenlp::get_if<TemplateSequence>(template_piece) != nullptr) {
+    paddlenlp::get<TemplateSequence>(*template_piece).second = type_id;
   } else {
-    boost::get<TemplateSpecialToken>(*template_piece).second = type_id;
+    paddlenlp::get<TemplateSpecialToken>(*template_piece).second = type_id;
   }
 }

@@ -84,8 +85,8 @@ void GetTemplatePieceFromString(const std::string& template_string,
 }

 void to_json(nlohmann::json& j, const TemplatePiece& template_piece) {
-  if (boost::get<TemplateSequence>(&template_piece) != nullptr) {
-    auto& template_sequence = boost::get<TemplateSequence>(template_piece);
+  if (paddlenlp::get_if<TemplateSequence>(&template_piece) != nullptr) {
+    auto& template_sequence = paddlenlp::get<TemplateSequence>(template_piece);
     j = {
         {"Sequence",
          {
@@ -95,7 +96,7 @@
     };
   } else {
     auto& template_special_token =
-        boost::get<TemplateSpecialToken>(template_piece);
+        paddlenlp::get<TemplateSpecialToken>(template_piece);
     j = {
         {"SpecialToken",
          {
@@ -135,7 +136,7 @@ size_t TemplatePostProcessor::CountAdded(
   size_t count = 0;
   for (auto& piece : template_->pieces_) {
     TemplateSpecialToken* special_token =
-        boost::get<TemplateSpecialToken>(&piece);
+        paddlenlp::get_if<TemplateSpecialToken>(&piece);
     if (special_token != nullptr) {
       auto token_iter =
           special_tokens_map.tokens_map_.find(special_token->first);
@@ -244,8 +245,8 @@ void TemplatePostProcessor::ApplyTemplate(
     core::Encoding* result_encoding) const {
   size_t new_size = 0;
   for (auto&& piece : pieces.pieces_) {
-    if (boost::get<TemplateSequence>(&piece) != nullptr) {
-      auto seq_type = boost::get<TemplateSequence>(piece).first;
+    if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+      auto seq_type = paddlenlp::get<TemplateSequence>(piece).first;
       if (seq_type == SequenceType::SEQ_A) {
         new_size += encoding->GetLen();
       } else {
@@ -257,7 +258,8 @@
       }
     } else {
       if (add_special_tokens) {
-        auto&& special_token = boost::get<TemplateSpecialToken>(piece).first;
+        auto&& special_token =
+            paddlenlp::get<TemplateSpecialToken>(piece).first;
         if (special_tokens_map_.tokens_map_.find(special_token) !=
             special_tokens_map_.tokens_map_.end()) {
           new_size +=
@@ -330,8 +332,8 @@
   }
   VLOG(6) << "Template pieces num: " << pieces.pieces_.size();
   for (auto& piece : pieces.pieces_) {
-    if (boost::get<TemplateSequence>(&piece) != nullptr) {
-      auto& template_sequence = boost::get<TemplateSequence>(piece);
+    if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+      auto& template_sequence = paddlenlp::get<TemplateSequence>(piece);
       if (template_sequence.first == SequenceType::SEQ_A) {
         auto seq_start = ids.size();
         auto seq_end = seq_start + encoding->GetLen();
@@ -385,7 +387,7 @@ void TemplatePostProcessor::ApplyTemplate(
                                     pair_encoding->GetAttentionMask().end());
       }
     } else {
-      auto& special_token = boost::get<TemplateSpecialToken>(piece);
+      auto& special_token = paddlenlp::get<TemplateSpecialToken>(piece);
       if (add_special_tokens) {
         const std::string& id = special_token.first;
         uint32_t type_id = special_token.second;
15 changes: 8 additions & 7 deletions faster_tokenizer/faster_tokenizer/postprocessors/template.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>

-#include "boost/variant.hpp"
-#include "glog/logging.h"
-#include "nlohmann/json.hpp"
 #include "faster_tokenizer/postprocessors/postprocessor.h"
 #include "faster_tokenizer/utils/utils.h"
+#include "faster_tokenizer/utils/variant.h"
+#include "glog/logging.h"
+#include "nlohmann/json.hpp"

 namespace paddlenlp {
 namespace faster_tokenizer {
@@ -37,7 +37,8 @@ NLOHMANN_JSON_SERIALIZE_ENUM(SequenceType,
 using TemplateSequence = std::pair<SequenceType, uint32_t>;
 using TemplateSpecialToken = std::pair<std::string, uint32_t>;

-using TemplatePiece = boost::variant<TemplateSequence, TemplateSpecialToken>;
+using TemplatePiece =
+    paddlenlp::variant<TemplateSequence, TemplateSpecialToken>;
 void to_json(nlohmann::json& j, const TemplatePiece& template_piece);
 void from_json(const nlohmann::json& j, TemplatePiece& template_piece);

@@ -119,10 +120,10 @@ struct FASTERTOKENIZER_DECL Template {
     for (auto&& piece : pieces) {
       TemplatePiece template_piece;
       GetTemplatePieceFromString(piece, &template_piece);
-      if (boost::get<TemplateSequence>(&template_piece)) {
-        pieces_.push_back(boost::get<TemplateSequence>(template_piece));
+      if (paddlenlp::get_if<TemplateSequence>(&template_piece)) {
+        pieces_.push_back(paddlenlp::get<TemplateSequence>(template_piece));
       } else {
-        pieces_.push_back(boost::get<TemplateSpecialToken>(template_piece));
+        pieces_.push_back(paddlenlp::get<TemplateSpecialToken>(template_piece));
       }
     }
   }
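Aside: the template.cc hunks repeat one idiom with these types: probe the active alternative with get_if on a pointer, then read or write through a checked get. A self-contained sketch of SetTypeId as it looks after this PR, with local stand-ins for the aliases above (illustrative, not a drop-in):

```cpp
#include <cstdint>
#include <string>
#include <utility>

#include "faster_tokenizer/utils/variant.h"

enum class SequenceType { SEQ_A, SEQ_B };
using TemplateSequence = std::pair<SequenceType, uint32_t>;
using TemplateSpecialToken = std::pair<std::string, uint32_t>;
using TemplatePiece =
    paddlenlp::variant<TemplateSequence, TemplateSpecialToken>;

// Branch on the active alternative with get_if, then assign through get.
void SetTypeId(uint32_t type_id, TemplatePiece* piece) {
  if (paddlenlp::get_if<TemplateSequence>(piece) != nullptr) {
    paddlenlp::get<TemplateSequence>(*piece).second = type_id;
  } else {
    paddlenlp::get<TemplateSpecialToken>(*piece).second = type_id;
  }
}
```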
4 changes: 2 additions & 2 deletions faster_tokenizer/faster_tokenizer/pybind/CMakeLists.txt
@@ -3,8 +3,8 @@ cc_library(pybind_utils SRCS utils.cc DEPS pybind python json)
 cc_library(pybind_normalizers SRCS normalizers.cc DEPS pybind python json)
 cc_library(pybind_pretokenizers SRCS pretokenizers.cc DEPS pybind python json)
 cc_library(pybind_models SRCS models.cc DEPS pybind python json)
-cc_library(pybind_postprocessors SRCS postprocessors.cc DEPS pybind python core json boost)
-cc_library(pybind_tokenizers SRCS tokenizers.cc DEPS pybind python pybind_utils json boost)
+cc_library(pybind_postprocessors SRCS postprocessors.cc DEPS pybind python core json)
+cc_library(pybind_tokenizers SRCS tokenizers.cc DEPS pybind python pybind_utils json)
 cc_library(pybind_exception SRCS exception.cc DEPS pybind python)
 cc_library(pybind_decoders SRCS decoders.cc DEPS pybind python json)
 cc_library(pybind_core SRCS core.cc DEPS pybind python json)