@@ -147,8 +147,8 @@ func NewMistralEncoder() GPTEncoder {
147
147
// Returns a GPTEncoder with the tokenizer data loaded for that vocabulary
148
148
// id.
149
149
func NewEncoder (vocabId string ) (* GPTEncoder , error ) {
150
- hfConfig , resourcesPtr , vocabErr := resources .ResolveVocabId (vocabId ,
151
- "" )
150
+ hfConfig , resourcesPtr , vocabErr := resources .ResolveVocabId (vocabId , "" )
151
+
152
152
if vocabErr != nil {
153
153
return nil , vocabErr
154
154
}
@@ -176,32 +176,6 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
176
176
}
177
177
}
178
178
179
- tokenizerSpecialConfig := resources.TokenizerSpecialsConfig {
180
- AddBosToken : false ,
181
- AddEosToken : false ,
182
- PadToken : "" ,
183
- }
184
- altMistralSpecialsConfig := resources.MistralSpecialsConfig {
185
- AddBosToken : false ,
186
- AddEosToken : false ,
187
- PadToken : "" ,
188
- }
189
- if special , ok := (rsrcs )["tokenizer_config.json" ]; ok {
190
- if special .Data != nil {
191
- err := json .Unmarshal (* special .Data , & tokenizerSpecialConfig )
192
- if err != nil {
193
- err = json .Unmarshal (* special .Data , & altMistralSpecialsConfig )
194
- if err != nil {
195
- log .Fatal ("Error unmarshalling tokenizer_config.json" )
196
- }
197
- //populate the tokenizerSpecialConfig from the altMistralSpecialsConfig
198
- tokenizerSpecialConfig .AddBosToken = altMistralSpecialsConfig .AddBosToken
199
- tokenizerSpecialConfig .AddEosToken = altMistralSpecialsConfig .AddEosToken
200
- tokenizerSpecialConfig .PadToken = altMistralSpecialsConfig .PadToken
201
- }
202
- }
203
- }
204
-
205
179
puncRunes := make ([]rune , 0 )
206
180
if specialConfig .PuncRunes != nil {
207
181
for _ , r := range specialConfig .PuncRunes {
@@ -364,23 +338,28 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
364
338
}
365
339
366
340
if specialConfig .EncloseEosBos {
367
- tokenizerSpecialConfig .AddBosToken = true
368
- tokenizerSpecialConfig .AddEosToken = true
341
+ bosBool := true
342
+ eosBool := true
343
+ hfConfig .AddBosToken = & bosBool
344
+ hfConfig .AddEosToken = & eosBool
369
345
}
370
346
371
347
// Add in default pad token if not already set
372
- padTokenNotFound := (tokenizerSpecialConfig . PadToken == "" && hfConfig .PadTokenStr == nil )
348
+ padTokenNotFound := (hfConfig .PadTokenStr == nil )
373
349
if padTokenNotFound {
374
350
// Inject the pad token into the encoder to uintmax16,
375
351
// throw an error if vocab is larger than uintmax16
376
- if len (encoderTokens ) >= math .MaxInt16 {
377
- log .Fatalf ("Vocab size is larger than uint16 max, default pad token cannot be added." +
378
- "Please specify a pad token in the vocab file." )
352
+ if len (encoderTokens ) >= math .MaxUint16 {
353
+ log .Fatalf ("Vocab size of %d is larger than uint16 max of %d. " +
354
+ "Please specify a pad token in the vocab file." ,
355
+ len (encoderTokens ), math .MaxUint16 )
379
356
}
380
- encoderTokens [ defaultPadTokenString ] = math . MaxUint16
381
- tokenizerSpecialConfig . PadToken = defaultPadTokenString
382
- hfConfig .PadTokenStr = & tokenizerSpecialConfig . PadToken
357
+ padToken := defaultPadTokenString
358
+ encoderTokens [ padToken ] = math . MaxUint16
359
+ hfConfig .PadTokenStr = & padToken
383
360
}
361
+
362
+ // Create the encoder
384
363
encoder := & GPTEncoder {
385
364
encoderTokens ,
386
365
tokensEncoder ,
@@ -403,8 +382,8 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
403
382
encoderTokens [* hfConfig .EosTokenStr ],
404
383
encoderTokens [* hfConfig .PadTokenStr ],
405
384
specialConfig .EncloseEosBos ,
406
- tokenizerSpecialConfig .AddBosToken ,
407
- tokenizerSpecialConfig .AddEosToken ,
385
+ * hfConfig .AddBosToken ,
386
+ * hfConfig .AddEosToken ,
408
387
specialConfig .PrefixSpace ,
409
388
specialConfig .LowerCase ,
410
389
specialConfig .EndOfWord ,
0 commit comments