nndeploy C++ API  0.2.0
nndeploy C++ API
tokenizer.h
Go to the documentation of this file.
1 
2 #ifndef _NNDEPLOY_TOKENIZER_TOKENIZER_H_
3 #define _NNDEPLOY_TOKENIZER_TOKENIZER_H_
4 
5 #include "nndeploy/base/any.h"
6 #include "nndeploy/base/common.h"
8 #include "nndeploy/base/log.h"
9 #include "nndeploy/base/macro.h"
10 #include "nndeploy/base/object.h"
11 #include "nndeploy/base/param.h"
12 #include "nndeploy/base/status.h"
13 #include "nndeploy/base/string.h"
14 #include "nndeploy/base/type.h"
15 #include "nndeploy/dag/edge.h"
16 #include "nndeploy/dag/graph.h"
17 #include "nndeploy/dag/node.h"
18 #include "nndeploy/device/buffer.h"
19 #include "nndeploy/device/device.h"
21 #include "nndeploy/device/tensor.h"
22 
23 namespace nndeploy {
24 namespace tokenizer {
25 
26 //---------------------------------------------------
27 // Factory functions from byte-blobs
28 // These factory functions take in-memory blobs
29 // so the library can be independent of the filesystem
30 //---------------------------------------------------
// Identifies the kind of tokenizer; stored in TokenizerPraram::tokenizer_type_.
// NOTE(review): the listing numbers jump from 31 to 38 to 63, so further
// enumerators appear to be elided here — confirm against the real header.
31 enum TokenizerType : int {
// Create HF tokenizer from a single in-memory json blob.
38  kTokenizerTypeHF = 0x0000,
63 };
64 
67 stringToTokenizerType(const std::string &src);
68 
70  public:
// Default constructor: value-initializes the base::Param sub-object; the
// data members keep their in-class default initializers.
71  TokenizerPraram() : base::Param() {}
// Virtual destructor with an empty body.
72  virtual ~TokenizerPraram() {}
73 
76 
77  TokenizerPraram &operator=(const TokenizerPraram &tp) {
78  if (this == &tp) {
79  return *this;
80  }
81  is_path_ = tp.is_path_;
82  tokenizer_type_ = tp.tokenizer_type_;
83  json_blob_ = tp.json_blob_;
84  model_blob_ = tp.model_blob_;
85  vocab_blob_ = tp.vocab_blob_;
86  merges_blob_ = tp.merges_blob_;
87  added_tokens_ = tp.added_tokens_;
88  max_length_ = tp.max_length_;
89 
90  return *this;
91  }
92 
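93  // Presumably true when the *_blob_ fields below hold file paths rather than in-memory contents (TODO confirm)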
94  bool is_path_ = true;
95  // The type of tokenizer
96  TokenizerType tokenizer_type_ = kTokenizerTypeHF;
97 
104  std::string json_blob_;
117  std::string model_blob_;
126  std::string vocab_blob_;
127  std::string merges_blob_;
128  std::string added_tokens_;
129 
130  int max_length_ = 77;
131 
132  std::string getShareKey() {
133  std::string key = "";
134  key += tokenizerTypeToString(tokenizer_type_);
135  key += json_blob_;
136  key += model_blob_;
137  key += vocab_blob_;
138  key += merges_blob_;
139  key += added_tokens_;
140  return key;
141  }
142 
145  rapidjson::Value &json,
146  rapidjson::Document::AllocatorType &allocator) override {
// Writes this parameter set into `json` as object members; `allocator` is
// used for every string that is copied into the document.
// "tokenizer_type_" is registered through addRequiredParam() before the
// base-class serialization runs.
147  this->addRequiredParam("tokenizer_type_");
148  base::Status status = base::Param::serialize(json, allocator);
// On base-class failure: log and return that status without adding members.
149  if (status != base::kStatusCodeOk) {
150  NNDEPLOY_LOGE("TokenizerPraram::serialize failed\n");
151  return status;
152  }
153  json.AddMember("is_path_", is_path_, allocator);
// The tokenizer type is stored by name (string), not by numeric value.
154  std::string tokenizer_type_str = tokenizerTypeToString(tokenizer_type_);
155  json.AddMember("tokenizer_type_",
156  rapidjson::Value(tokenizer_type_str.c_str(), allocator),
157  allocator);
// NOTE(review): the blob fields below are passed as c_str(), i.e. as
// NUL-terminated strings, so a blob containing an embedded '\0' would be
// written only up to that byte — confirm blobs are always text or paths.
158  json.AddMember("json_blob_",
159  rapidjson::Value(json_blob_.c_str(), allocator), allocator);
160  json.AddMember("model_blob_",
161  rapidjson::Value(model_blob_.c_str(), allocator), allocator);
162  json.AddMember("vocab_blob_",
163  rapidjson::Value(vocab_blob_.c_str(), allocator), allocator);
164  json.AddMember("merges_blob_",
165  rapidjson::Value(merges_blob_.c_str(), allocator),
166  allocator);
167  json.AddMember("added_tokens_",
168  rapidjson::Value(added_tokens_.c_str(), allocator),
169  allocator);
170  json.AddMember("max_length_", max_length_, allocator);
171  return base::kStatusCodeOk;
172  }
173 
175  virtual base::Status deserialize(rapidjson::Value &json) override {
176  base::Status status = base::Param::deserialize(json);
177  if (status != base::kStatusCodeOk) {
178  NNDEPLOY_LOGE("TokenizerPraram::deserialize failed\n");
179  return status;
180  }
181  if (json.HasMember("is_path_") && json["is_path_"].IsBool()) {
182  is_path_ = json["is_path_"].GetBool();
183  }
184  if (json.HasMember("tokenizer_type_") &&
185  json["tokenizer_type_"].IsString()) {
186  tokenizer_type_ =
187  stringToTokenizerType(json["tokenizer_type_"].GetString());
188  }
189  if (json.HasMember("json_blob_") && json["json_blob_"].IsString()) {
190  json_blob_ = json["json_blob_"].GetString();
191  }
192  if (json.HasMember("model_blob_") && json["model_blob_"].IsString()) {
193  model_blob_ = json["model_blob_"].GetString();
194  }
195  if (json.HasMember("vocab_blob_") && json["vocab_blob_"].IsString()) {
196  vocab_blob_ = json["vocab_blob_"].GetString();
197  }
198  if (json.HasMember("merges_blob_") && json["merges_blob_"].IsString()) {
199  merges_blob_ = json["merges_blob_"].GetString();
200  }
201  if (json.HasMember("added_tokens_") && json["added_tokens_"].IsString()) {
202  added_tokens_ = json["added_tokens_"].GetString();
203  }
204  if (json.HasMember("max_length_") && json["max_length_"].IsInt()) {
205  max_length_ = json["max_length_"].GetInt();
206  }
207  return base::kStatusCodeOk;
208  }
209 };
210 
212  public:
213  std::vector<std::string> texts_;
214 
217  rapidjson::Value &json,
218  rapidjson::Document::AllocatorType &allocator) override {
// Writes texts_ into `json` as an array member named "texts_"; each element
// is copied into the document with `allocator`. Always returns
// base::kStatusCodeOk.
219  rapidjson::Value texts_json(rapidjson::kArrayType);
220  for (const auto &text : texts_) {
// NOTE(review): c_str() is NUL-terminated, so a text containing an embedded
// '\0' would be written only up to that byte.
221  texts_json.PushBack(rapidjson::Value(text.c_str(), allocator), allocator);
222  }
223  json.AddMember("texts_", texts_json, allocator);
224  return base::kStatusCodeOk;
225  }
226 
228  virtual base::Status deserialize(rapidjson::Value &json) override {
229  if (json.HasMember("texts_") && json["texts_"].IsArray()) {
230  texts_.clear();
231  for (const auto &text : json["texts_"].GetArray()) {
232  texts_.push_back(text.GetString());
233  }
234  }
235  return base::kStatusCodeOk;
236  }
237 };
238 
240  public:
241  std::vector<std::vector<int32_t>> ids_;
242 };
243 
249  public:
// Constructs an encode node called `name` (forwarded to dag::Node) and
// registers TokenizerText as its input type info and TokenizerIds as its
// output type info.
250  TokenizerEncode(const std::string &name) : dag::Node(name) {
251  this->setInputTypeInfo<TokenizerText>();
252  this->setOutputTypeInfo<TokenizerIds>();
253  }
// Constructs an encode node called `name`, forwarding the given `inputs` and
// `outputs` edges to dag::Node, then registers TokenizerText as the input
// type info and TokenizerIds as the output type info.
254  TokenizerEncode(const std::string &name, std::vector<dag::Edge *> inputs,
255  std::vector<dag::Edge *> outputs)
256  : dag::Node(name, inputs, outputs) {
257  this->setInputTypeInfo<TokenizerText>();
258  this->setOutputTypeInfo<TokenizerIds>();
259  }
260 
// Virtual destructor; only declared here, so its definition lives outside
// this listing.
261  virtual ~TokenizerEncode();
262 
// Run node (pure virtual function): derived encoders must implement it.
263  virtual base::Status run() = 0;
264 };
265 
271  public:
// Constructs a decode node called `name` (forwarded to dag::Node) and
// registers TokenizerIds as its input type info and TokenizerText as its
// output type info.
272  TokenizerDecode(const std::string &name) : dag::Node(name) {
273  this->setInputTypeInfo<TokenizerIds>();
274  this->setOutputTypeInfo<TokenizerText>();
275  }
// Constructs a decode node called `name`, forwarding the given `inputs` and
// `outputs` edges to dag::Node, then registers TokenizerIds as the input
// type info and TokenizerText as the output type info.
276  TokenizerDecode(const std::string &name, std::vector<dag::Edge *> inputs,
277  std::vector<dag::Edge *> outputs)
278  : dag::Node(name, inputs, outputs) {
279  this->setInputTypeInfo<TokenizerIds>();
280  this->setOutputTypeInfo<TokenizerText>();
281  }
282 
// Virtual destructor; only declared here, so its definition lives outside
// this listing.
283  virtual ~TokenizerDecode();
284 
// Run node (pure virtual function): derived decoders must implement it.
285  virtual base::Status run() = 0;
286 };
287 
288 } // namespace tokenizer
289 } // namespace nndeploy
290 
291 #endif /* _NNDEPLOY_TOKENIZER_TOKENIZER_H_ */
virtual base::Status deserialize(rapidjson::Value &json)
virtual std::string serialize()
Node base class.
Definition: node.h:171
TokenizerDecode(const std::string &name, std::vector< dag::Edge * > inputs, std::vector< dag::Edge * > outputs)
Definition: tokenizer.h:276
TokenizerDecode(const std::string &name)
Definition: tokenizer.h:272
virtual base::Status run()=0
Run node (pure virtual function)
TokenizerEncode(const std::string &name, std::vector< dag::Edge * > inputs, std::vector< dag::Edge * > outputs)
Definition: tokenizer.h:254
virtual base::Status run()=0
Run node (pure virtual function)
TokenizerEncode(const std::string &name)
Definition: tokenizer.h:250
std::vector< std::vector< int32_t > > ids_
Definition: tokenizer.h:241
std::string model_blob_
Create SentencePiece.
Definition: tokenizer.h:117
virtual base::Status serialize(rapidjson::Value &json, rapidjson::Document::AllocatorType &allocator) override
Definition: tokenizer.h:144
std::string vocab_blob_
Create BPE tokenizer.
Definition: tokenizer.h:126
std::string json_blob_
Create HF tokenizer from a single in-memory json blob.
Definition: tokenizer.h:104
virtual base::Status deserialize(rapidjson::Value &json) override
Definition: tokenizer.h:175
virtual base::Status serialize(rapidjson::Value &json, rapidjson::Document::AllocatorType &allocator) override
Definition: tokenizer.h:216
std::vector< std::string > texts_
Definition: tokenizer.h:213
virtual base::Status deserialize(rapidjson::Value &json) override
Definition: tokenizer.h:228
#define NNDEPLOY_LOGE(fmt,...)
Definition: log.h:59
#define NNDEPLOY_CC_API
api
Definition: macro.h:29
@ kStatusCodeOk
Definition: status.h:13
TokenizerType stringToTokenizerType(const std::string &src)
std::string tokenizerTypeToString(TokenizerType type)
@ kTokenizerTypeBPE
Create BPE tokenizer.
Definition: tokenizer.h:47
@ kTokenizerTypeSentencePiece
Create SentencePiece.
Definition: tokenizer.h:54
@ kTokenizerTypeHF
Create HF tokenizer from a single in-memory json blob.
Definition: tokenizer.h:38
@ kTokenizerTypeRWKVWorld
Create RWKVWorldTokenizer.
Definition: tokenizer.h:61
#define PARAM_COPY_TO(param_type)
Definition: param.h:25
#define PARAM_COPY(param_type)
Definition: param.h:16