Remove boost library. #3215

Merged: 3 commits, Sep 8, 2022
13 changes: 5 additions & 8 deletions faster_tokenizer/CMakeLists.txt
@@ -102,15 +102,15 @@ endforeach()

 ELSE(WIN32)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
-  IF (LINUX)
+  IF (NOT APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ldl -lpthread")
   ENDIF()
   set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS})
 ENDIF(WIN32)

 # For OpenMP
 # openmp not support well for now on windows
-if (LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
   find_package(OpenMP)
   if (OPENMP_FOUND)
     add_definitions(-DWITH_OMP)
@@ -143,7 +143,7 @@ if(WITH_PYTHON)

 add_subdirectory(python)

-if(LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
   add_custom_target(build_tokenizers_bdist_wheel ALL
     COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel --plat-name=manylinux1_x86_64
     COMMENT "Packing whl packages------>>>"
@@ -168,6 +168,8 @@ file(COPY ${PROJECT_SOURCE_DIR}/FasterTokenizer.cmake DESTINATION ${CPP_PACKAGE_
 # copy headers
 file(COPY ${PROJECT_SOURCE_DIR}/faster_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/faster_tokenizer/
   FILES_MATCHING PATTERN "*.h"
+  PATTERN "test" EXCLUDE
+  PATTERN "demo" EXCLUDE
   PATTERN "pybind" EXCLUDE)

 add_custom_target(copy_third_party_headers ALL
@@ -177,11 +179,6 @@ add_custom_target(copy_third_party_headers ALL
     ${CPP_PACKAGE_DIR}/third_party/include
   DEPENDS build_cpp_package_dir)

-add_custom_target(copy_boost_headers ALL
-  COMMAND ${CMAKE_COMMAND} -E copy_directory
-    ${BOOST_INCLUDE_DIR}/boost ${CPP_PACKAGE_DIR}/third_party/include/boost
-  DEPENDS build_cpp_package_dir)
-
 # copy library
 set(TOKENIZER_CORE_NAME "core_tokenizers")
 set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/faster_tokenizer)
49 changes: 0 additions & 49 deletions faster_tokenizer/cmake/external/boost.cmake

This file was deleted.

5 changes: 3 additions & 2 deletions faster_tokenizer/cmake/third_party.cmake
@@ -18,11 +18,12 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")

 include(external/icu)
-include(external/gtest)
+if(WITH_TESTING)
+  include(external/gtest)
+endif()
 include(external/gflags)
 include(external/glog)
 include(external/re2)
-include(external/boost)
 include(external/nlohmann_json)
 include(external/dart) # For trie
 if (WITH_PYTHON)
1 change: 0 additions & 1 deletion faster_tokenizer/faster_tokenizer/core/CMakeLists.txt
@@ -1,4 +1,3 @@
 cc_library(added_vocabulary SRCS added_vocabulary.cc DEPS normalizers pretokenizers json)
 cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors)
 cc_library(core SRCS encoding.cc DEPS json)
-add_dependencies(tokenizer extern_boost)
14 changes: 7 additions & 7 deletions faster_tokenizer/faster_tokenizer/core/tokenizer.cc
@@ -163,7 +163,7 @@ bool Tokenizer::DoPreTokenize(
   return true;
 }

-struct InputStringVisitor : public boost::static_visitor<> {
+struct InputStringVisitor {
   InputStringVisitor(const Tokenizer* tokenizer,
                      uint32_t type_id,
                      OffsetType offset_type,
@@ -190,8 +190,8 @@ void Tokenizer::EncodeSingleString(const InputString& input_string,
                                    uint32_t type_id,
                                    OffsetType offset_type,
                                    Encoding* encodings) const {
-  boost::apply_visitor(
-      InputStringVisitor(this, type_id, offset_type, encodings), input_string);
+  paddlenlp::visit(InputStringVisitor(this, type_id, offset_type, encodings),
+                   input_string);
 }

 void Tokenizer::PostProcess(Encoding* encoding,
@@ -234,13 +234,13 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input,
                                   bool add_special_tokens) const {
   Encoding encoding;
   if (encode_input.type() == typeid(InputString)) {
-    const auto& input_string = boost::get<InputString>(encode_input);
+    const auto& input_string = paddlenlp::get<InputString>(encode_input);
     EncodeSingleString(input_string, 0, OffsetType::CHAR, &encoding);
     PostProcess(&encoding, nullptr, add_special_tokens, encodings);
   } else {
     Encoding pair_encoding;
     const auto& input_string_pair =
-        boost::get<std::pair<InputString, InputString>>(encode_input);
+        paddlenlp::get<std::pair<InputString, InputString>>(encode_input);
     EncodeSingleString(input_string_pair.first, 0, OffsetType::CHAR, &encoding);
     EncodeSingleString(
         input_string_pair.second, 1, OffsetType::CHAR, &pair_encoding);
@@ -273,9 +273,9 @@ void Tokenizer::EncodeBatchStrings(
 void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input,
                                              Encoding* encodings,
                                              bool add_special_tokens) const {
-  const auto& input_string = boost::get<InputString>(&encode_input);
+  const auto& input_string = paddlenlp::get_if<InputString>(&encode_input);
   const auto& input_string_pair =
-      boost::get<std::pair<InputString, InputString>>(&encode_input);
+      paddlenlp::get_if<std::pair<InputString, InputString>>(&encode_input);
   Encoding encoding;
   Encoding pair_encoding;
   if (input_string != nullptr) {
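Aside: the InputStringVisitor change above is the visitor-migration pattern used throughout this PR. The boost::static_visitor<> base class is dropped (the replacement visit deduces the result type from operator()), and boost::apply_visitor(v, x) becomes paddlenlp::visit(v, x). Below is a minimal self-contained sketch of the idiom, assuming only that faster_tokenizer/utils/variant.h provides the std::variant-style free functions seen in this diff; the SequenceCounter visitor is illustrative, not code from the PR.

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

#include "faster_tokenizer/utils/variant.h"  // vendored variant added by this PR

// A plain callable struct: no boost::static_visitor<> base is needed,
// because paddlenlp::visit deduces the return type from operator().
struct SequenceCounter {
  size_t operator()(const std::string&) const { return 1; }
  size_t operator()(const std::vector<std::string>& v) const { return v.size(); }
};

int main() {
  paddlenlp::variant<std::string, std::vector<std::string>> input =
      std::vector<std::string>{"faster", "tokenizer"};
  // Before: boost::apply_visitor(SequenceCounter{}, input);
  std::cout << paddlenlp::visit(SequenceCounter{}, input) << "\n";  // prints 2
}
```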
6 changes: 3 additions & 3 deletions faster_tokenizer/faster_tokenizer/core/tokenizer.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "faster_tokenizer/core/added_vocabulary.h"
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
-#include "boost/variant.hpp"
+#include "faster_tokenizer/utils/variant.h"
 #include "nlohmann/json.hpp"

 namespace paddlenlp {
@@ -56,9 +56,9 @@ namespace core {
 class AddedVocabulary;
 class Encoding;

-using InputString = boost::variant<std::string, std::vector<std::string>>;
+using InputString = paddlenlp::variant<std::string, std::vector<std::string>>;
 using EncodeInput =
-    boost::variant<InputString, std::pair<InputString, InputString>>;
+    paddlenlp::variant<InputString, std::pair<InputString, InputString>>;

 class FASTERTOKENIZER_DECL Tokenizer {
  public:
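Aside: the tokenizer.cc hunks use two access forms that mirror Boost's. paddlenlp::get<T>(v) takes a variant reference and is checked access to a known alternative (replacing boost::get<T>(v)), while paddlenlp::get_if<T>(&v) takes a variant pointer and returns nullptr on a type mismatch (replacing the pointer overload boost::get<T>(&v)). A sketch against the InputString/EncodeInput aliases defined above; the helper functions are hypothetical, and only the aliases and the get/get_if calls come from this diff.

```cpp
#include <string>
#include <utility>
#include <vector>

#include "faster_tokenizer/utils/variant.h"

using InputString = paddlenlp::variant<std::string, std::vector<std::string>>;
using EncodeInput =
    paddlenlp::variant<InputString, std::pair<InputString, InputString>>;

// Hypothetical helper: does this input hold a pair of sequences?
bool IsPair(const EncodeInput& input) {
  // Pointer form: nullptr on type mismatch instead of throwing.
  return paddlenlp::get_if<InputString>(&input) == nullptr;
}

// Hypothetical helper: fetch the first sequence either way.
const InputString& FirstSequence(const EncodeInput& input) {
  if (const auto* single = paddlenlp::get_if<InputString>(&input)) {
    return *single;
  }
  // Reference form: safe here because the other alternative was ruled out.
  return paddlenlp::get<std::pair<InputString, InputString>>(input).first;
}
```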
1 change: 0 additions & 1 deletion faster_tokenizer/faster_tokenizer/decoders/CMakeLists.txt
@@ -1,2 +1 @@
 cc_library(decoders SRCS wordpiece.cc DEPS json utils)
-add_dependencies(decoders extern_boost)
2 changes: 1 addition & 1 deletion faster_tokenizer/faster_tokenizer/models/CMakeLists.txt
@@ -1,3 +1,3 @@
 cc_library(models
   SRCS wordpiece.cc faster_wordpiece.cc bpe.cc unigram.cc
-  DEPS core json boost trie failure icuuc icudata lattice utils)
+  DEPS core json trie failure icuuc icudata lattice utils)
2 changes: 1 addition & 1 deletion faster_tokenizer/faster_tokenizer/postprocessors/CMakeLists.txt

@@ -1 +1 @@
-cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json boost)
+cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json)
34 changes: 18 additions & 16 deletions faster_tokenizer/faster_tokenizer/postprocessors/template.cc
@@ -16,8 +16,8 @@
 #include <string>

 #include "faster_tokenizer/core/encoding.h"
-#include "glog/logging.h"
 #include "faster_tokenizer/postprocessors/template.h"
+#include "glog/logging.h"

 namespace paddlenlp {
 namespace faster_tokenizer {
@@ -27,7 +27,7 @@ void ParseIdFromString(const std::string& template_id_string,
                        TemplatePiece* template_piece) {
   if (template_id_string.find_first_of("$") == 0) {
     *template_piece = TemplateSequence();
-    auto& seq = boost::get<TemplateSequence>(*template_piece);
+    auto& seq = paddlenlp::get<TemplateSequence>(*template_piece);
     std::string rest =
         template_id_string.substr(template_id_string.find_first_not_of("$"));
     if (rest == "" || rest == "A" || rest == "a") {
@@ -48,15 +48,16 @@
     }
   } else {
     *template_piece = TemplateSpecialToken();
-    boost::get<TemplateSpecialToken>(*template_piece) = {template_id_string, 0};
+    paddlenlp::get<TemplateSpecialToken>(*template_piece) = {template_id_string,
+                                                             0};
   }
 }

 void SetTypeId(uint32_t type_id, TemplatePiece* template_piece) {
-  if (boost::get<TemplateSequence>(template_piece) != nullptr) {
-    boost::get<TemplateSequence>(*template_piece).second = type_id;
+  if (paddlenlp::get_if<TemplateSequence>(template_piece) != nullptr) {
+    paddlenlp::get<TemplateSequence>(*template_piece).second = type_id;
   } else {
-    boost::get<TemplateSpecialToken>(*template_piece).second = type_id;
+    paddlenlp::get<TemplateSpecialToken>(*template_piece).second = type_id;
   }
 }

@@ -84,8 +85,8 @@ void GetTemplatePieceFromString(const std::string& template_string,
 }

 void to_json(nlohmann::json& j, const TemplatePiece& template_piece) {
-  if (boost::get<TemplateSequence>(&template_piece) != nullptr) {
-    auto& template_sequence = boost::get<TemplateSequence>(template_piece);
+  if (paddlenlp::get_if<TemplateSequence>(&template_piece) != nullptr) {
+    auto& template_sequence = paddlenlp::get<TemplateSequence>(template_piece);
     j = {
         {"Sequence",
          {
@@ -95,7 +96,7 @@
     };
   } else {
     auto& template_special_token =
-        boost::get<TemplateSpecialToken>(template_piece);
+        paddlenlp::get<TemplateSpecialToken>(template_piece);
     j = {
         {"SpecialToken",
          {
@@ -135,7 +136,7 @@ size_t TemplatePostProcessor::CountAdded(
   size_t count = 0;
   for (auto& piece : template_->pieces_) {
     TemplateSpecialToken* special_token =
-        boost::get<TemplateSpecialToken>(&piece);
+        paddlenlp::get_if<TemplateSpecialToken>(&piece);
     if (special_token != nullptr) {
       auto token_iter =
           special_tokens_map.tokens_map_.find(special_token->first);
@@ -244,8 +245,8 @@ void TemplatePostProcessor::ApplyTemplate(
     core::Encoding* result_encoding) const {
   size_t new_size = 0;
   for (auto&& piece : pieces.pieces_) {
-    if (boost::get<TemplateSequence>(&piece) != nullptr) {
-      auto seq_type = boost::get<TemplateSequence>(piece).first;
+    if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+      auto seq_type = paddlenlp::get<TemplateSequence>(piece).first;
       if (seq_type == SequenceType::SEQ_A) {
         new_size += encoding->GetLen();
       } else {
@@ -257,7 +258,8 @@
       }
     } else {
       if (add_special_tokens) {
-        auto&& special_token = boost::get<TemplateSpecialToken>(piece).first;
+        auto&& special_token =
+            paddlenlp::get<TemplateSpecialToken>(piece).first;
         if (special_tokens_map_.tokens_map_.find(special_token) !=
             special_tokens_map_.tokens_map_.end()) {
           new_size +=
@@ -330,8 +332,8 @@
   }
   VLOG(6) << "Template pieces num: " << pieces.pieces_.size();
   for (auto& piece : pieces.pieces_) {
-    if (boost::get<TemplateSequence>(&piece) != nullptr) {
-      auto& template_sequence = boost::get<TemplateSequence>(piece);
+    if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+      auto& template_sequence = paddlenlp::get<TemplateSequence>(piece);
       if (template_sequence.first == SequenceType::SEQ_A) {
         auto seq_start = ids.size();
         auto seq_end = seq_start + encoding->GetLen();
@@ -385,7 +387,7 @@ void TemplatePostProcessor::ApplyTemplate(
                                     pair_encoding->GetAttentionMask().end());
       }
     } else {
-      auto& special_token = boost::get<TemplateSpecialToken>(piece);
+      auto& special_token = paddlenlp::get<TemplateSpecialToken>(piece);
       if (add_special_tokens) {
         const std::string& id = special_token.first;
         uint32_t type_id = special_token.second;
15 changes: 8 additions & 7 deletions faster_tokenizer/faster_tokenizer/postprocessors/template.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>

-#include "boost/variant.hpp"
-#include "glog/logging.h"
-#include "nlohmann/json.hpp"
 #include "faster_tokenizer/postprocessors/postprocessor.h"
 #include "faster_tokenizer/utils/utils.h"
+#include "faster_tokenizer/utils/variant.h"
+#include "glog/logging.h"
+#include "nlohmann/json.hpp"

 namespace paddlenlp {
 namespace faster_tokenizer {
@@ -37,7 +37,8 @@ NLOHMANN_JSON_SERIALIZE_ENUM(SequenceType,
 using TemplateSequence = std::pair<SequenceType, uint32_t>;
 using TemplateSpecialToken = std::pair<std::string, uint32_t>;

-using TemplatePiece = boost::variant<TemplateSequence, TemplateSpecialToken>;
+using TemplatePiece =
+    paddlenlp::variant<TemplateSequence, TemplateSpecialToken>;
 void to_json(nlohmann::json& j, const TemplatePiece& template_piece);
 void from_json(const nlohmann::json& j, TemplatePiece& template_piece);

@@ -119,10 +120,10 @@ struct FASTERTOKENIZER_DECL Template {
     for (auto&& piece : pieces) {
       TemplatePiece template_piece;
       GetTemplatePieceFromString(piece, &template_piece);
-      if (boost::get<TemplateSequence>(&template_piece)) {
-        pieces_.push_back(boost::get<TemplateSequence>(template_piece));
+      if (paddlenlp::get_if<TemplateSequence>(&template_piece)) {
+        pieces_.push_back(paddlenlp::get<TemplateSequence>(template_piece));
       } else {
-        pieces_.push_back(boost::get<TemplateSpecialToken>(template_piece));
+        pieces_.push_back(paddlenlp::get<TemplateSpecialToken>(template_piece));
       }
     }
   }
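Aside: the template.cc hunks repeat one idiom with these types: probe the active alternative with get_if on a pointer, then read or write through a checked get. A self-contained sketch of SetTypeId as it looks after this PR, with local stand-ins for the aliases above (illustrative, not a drop-in):

```cpp
#include <cstdint>
#include <string>
#include <utility>

#include "faster_tokenizer/utils/variant.h"

enum class SequenceType { SEQ_A, SEQ_B };
using TemplateSequence = std::pair<SequenceType, uint32_t>;
using TemplateSpecialToken = std::pair<std::string, uint32_t>;
using TemplatePiece =
    paddlenlp::variant<TemplateSequence, TemplateSpecialToken>;

// Branch on the active alternative with get_if, then assign through get.
void SetTypeId(uint32_t type_id, TemplatePiece* piece) {
  if (paddlenlp::get_if<TemplateSequence>(piece) != nullptr) {
    paddlenlp::get<TemplateSequence>(*piece).second = type_id;
  } else {
    paddlenlp::get<TemplateSpecialToken>(*piece).second = type_id;
  }
}
```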
4 changes: 2 additions & 2 deletions faster_tokenizer/faster_tokenizer/pybind/CMakeLists.txt
@@ -3,8 +3,8 @@ cc_library(pybind_utils SRCS utils.cc DEPS pybind python json)
 cc_library(pybind_normalizers SRCS normalizers.cc DEPS pybind python json)
 cc_library(pybind_pretokenizers SRCS pretokenizers.cc DEPS pybind python json)
 cc_library(pybind_models SRCS models.cc DEPS pybind python json)
-cc_library(pybind_postprocessors SRCS postprocessors.cc DEPS pybind python core json boost)
-cc_library(pybind_tokenizers SRCS tokenizers.cc DEPS pybind python pybind_utils json boost)
+cc_library(pybind_postprocessors SRCS postprocessors.cc DEPS pybind python core json)
+cc_library(pybind_tokenizers SRCS tokenizers.cc DEPS pybind python pybind_utils json)
 cc_library(pybind_exception SRCS exception.cc DEPS pybind python)
 cc_library(pybind_decoders SRCS decoders.cc DEPS pybind python json)
 cc_library(pybind_core SRCS core.cc DEPS pybind python json)