2 #ifndef _NNDEPLOY_TOKENIZER_TOKENIZER_H_
3 #define _NNDEPLOY_TOKENIZER_TOKENIZER_H_
82 tokenizer_type_ = tp.tokenizer_type_;
83 json_blob_ = tp.json_blob_;
84 model_blob_ = tp.model_blob_;
85 vocab_blob_ = tp.vocab_blob_;
86 merges_blob_ = tp.merges_blob_;
87 added_tokens_ = tp.added_tokens_;
88 max_length_ = tp.max_length_;
130 int max_length_ = 77;
133 std::string key =
"";
139 key += added_tokens_;
145 rapidjson::Value &json,
146 rapidjson::Document::AllocatorType &allocator)
override {
147 this->addRequiredParam(
"tokenizer_type_");
153 json.AddMember(
"is_path_", is_path_, allocator);
155 json.AddMember(
"tokenizer_type_",
156 rapidjson::Value(tokenizer_type_str.c_str(), allocator),
158 json.AddMember(
"json_blob_",
159 rapidjson::Value(json_blob_.c_str(), allocator), allocator);
160 json.AddMember(
"model_blob_",
161 rapidjson::Value(model_blob_.c_str(), allocator), allocator);
162 json.AddMember(
"vocab_blob_",
163 rapidjson::Value(vocab_blob_.c_str(), allocator), allocator);
164 json.AddMember(
"merges_blob_",
165 rapidjson::Value(merges_blob_.c_str(), allocator),
167 json.AddMember(
"added_tokens_",
168 rapidjson::Value(added_tokens_.c_str(), allocator),
170 json.AddMember(
"max_length_", max_length_, allocator);
181 if (json.HasMember(
"is_path_") && json[
"is_path_"].IsBool()) {
182 is_path_ = json[
"is_path_"].GetBool();
184 if (json.HasMember(
"tokenizer_type_") &&
185 json[
"tokenizer_type_"].IsString()) {
189 if (json.HasMember(
"json_blob_") && json[
"json_blob_"].IsString()) {
190 json_blob_ = json[
"json_blob_"].GetString();
192 if (json.HasMember(
"model_blob_") && json[
"model_blob_"].IsString()) {
193 model_blob_ = json[
"model_blob_"].GetString();
195 if (json.HasMember(
"vocab_blob_") && json[
"vocab_blob_"].IsString()) {
196 vocab_blob_ = json[
"vocab_blob_"].GetString();
198 if (json.HasMember(
"merges_blob_") && json[
"merges_blob_"].IsString()) {
199 merges_blob_ = json[
"merges_blob_"].GetString();
201 if (json.HasMember(
"added_tokens_") && json[
"added_tokens_"].IsString()) {
202 added_tokens_ = json[
"added_tokens_"].GetString();
204 if (json.HasMember(
"max_length_") && json[
"max_length_"].IsInt()) {
205 max_length_ = json[
"max_length_"].GetInt();
217 rapidjson::Value &json,
218 rapidjson::Document::AllocatorType &allocator)
override {
219 rapidjson::Value texts_json(rapidjson::kArrayType);
220 for (
const auto &text : texts_) {
221 texts_json.PushBack(rapidjson::Value(text.c_str(), allocator), allocator);
223 json.AddMember(
"texts_", texts_json, allocator);
229 if (json.HasMember(
"texts_") && json[
"texts_"].IsArray()) {
231 for (
const auto &text : json[
"texts_"].GetArray()) {
232 texts_.push_back(text.GetString());
241 std::vector<std::vector<int32_t>>
ids_;
251 this->setInputTypeInfo<TokenizerText>();
252 this->setOutputTypeInfo<TokenizerIds>();
255 std::vector<dag::Edge *> outputs)
256 : dag::Node(name, inputs, outputs) {
257 this->setInputTypeInfo<TokenizerText>();
258 this->setOutputTypeInfo<TokenizerIds>();
273 this->setInputTypeInfo<TokenizerIds>();
274 this->setOutputTypeInfo<TokenizerText>();
277 std::vector<dag::Edge *> outputs)
278 : dag::Node(name, inputs, outputs) {
279 this->setInputTypeInfo<TokenizerIds>();
280 this->setOutputTypeInfo<TokenizerText>();
virtual base::Status deserialize(rapidjson::Value &json)
virtual std::string serialize()
virtual ~TokenizerDecode()
TokenizerDecode(const std::string &name, std::vector< dag::Edge * > inputs, std::vector< dag::Edge * > outputs)
TokenizerDecode(const std::string &name)
virtual base::Status run()=0
Run node (pure virtual function)
virtual ~TokenizerEncode()
TokenizerEncode(const std::string &name, std::vector< dag::Edge * > inputs, std::vector< dag::Edge * > outputs)
virtual base::Status run()=0
Run node (pure virtual function)
TokenizerEncode(const std::string &name)
std::vector< std::vector< int32_t > > ids_
std::string added_tokens_
virtual ~TokenizerPraram()
std::string getShareKey()
std::string model_blob_
Create SentencePiece.
virtual base::Status serialize(rapidjson::Value &json, rapidjson::Document::AllocatorType &allocator) override
std::string vocab_blob_
Create BPE tokenizer.
std::string json_blob_
Create HF tokenizer from a single in-memory json blob.
virtual base::Status deserialize(rapidjson::Value &json) override
virtual base::Status serialize(rapidjson::Value &json, rapidjson::Document::AllocatorType &allocator) override
std::vector< std::string > texts_
virtual base::Status deserialize(rapidjson::Value &json) override
#define NNDEPLOY_LOGE(fmt,...)
#define NNDEPLOY_CC_API
api
TokenizerType stringToTokenizerType(const std::string &src)
std::string tokenizerTypeToString(TokenizerType type)
@ kTokenizerTypeBPE
Create BPE tokenizer.
@ kTokenizerTypeSentencePiece
Create SentencePiece.
@ kTokenizerTypeHF
Create HF tokenizer from a single in-memory json blob.
@ kTokenizerTypeRWKVWorld
Create RWKVWorldTokenizer.
@ kTokenizerTypeNotSupport
#define PARAM_COPY_TO(param_type)
#define PARAM_COPY(param_type)