@@ -273,10 +273,10 @@ func TestGPTEncoder_Split(t *testing.T) {
273
273
func BenchmarkGPTEncoder_WordSplitterChan (b * testing.B ) {
274
274
b .StopTimer ()
275
275
corpusHandle , err := os .Open (largeCorpusPath )
276
- defer corpusHandle .Close ()
277
276
if err != nil {
278
277
b .Error (err )
279
278
}
279
+ defer corpusHandle .Close ()
280
280
gpt2Encoder .SplitterThreads = 8
281
281
nextWord := gpt2Encoder .WordSplitter (bufio .NewReaderSize (corpusHandle ,
282
282
8 * 1024 * 1024 ))
@@ -413,8 +413,8 @@ func BenchmarkGPTEncoder_Decode(b *testing.B) {
413
413
start := time .Now ()
414
414
tokenNumBytes := len (gpt2Encoder .Decode (gpt2Encoded ))
415
415
duration := time .Since (start )
416
- b .Log ( fmt . Sprintf ("%v tokens into %v bytes over %v" ,
417
- len (* gpt2Encoded ), tokenNumBytes , duration ))
416
+ b .Logf ("%v tokens into %v bytes over %v" ,
417
+ len (* gpt2Encoded ), tokenNumBytes , duration )
418
418
}
419
419
420
420
type EncoderTest struct {
@@ -453,25 +453,25 @@ func BenchmarkGPTEncoder_Encode(b *testing.B) {
453
453
start := time .Now ()
454
454
tokenCt := len (* gpt2Encoder .Encode (& corpus ))
455
455
duration := time .Since (start )
456
- b .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
457
- len (corpus ), tokenCt , duration ))
456
+ b .Logf ("%v bytes into %v tokens over %v" ,
457
+ len (corpus ), tokenCt , duration )
458
458
}
459
459
460
460
// BenchmarkGPTEncoder_EncodeBuffer converts corpus to a byte slice, passes it
// through gpt2Encoder.EncodeBuffer once, and logs the corpus length, the
// derived token count, and the elapsed wall-clock time.
//
// The body never references b.N; timing is taken manually with
// time.Now/time.Since and reported through the benchmark log.
//
// NOTE(review): this span is a rendered diff hunk. The bare numbers are
// old/new line-number markers, and the "-"/"+" lines are the removed and
// added forms of the same log statement (b.Log(fmt.Sprintf(...)) replaced
// by b.Logf(...)).
func BenchmarkGPTEncoder_EncodeBuffer (b * testing.B ) {
461
461
corpusBytes := []byte (corpus )
462
462
start := time .Now ()
463
463
// NOTE(review): the encoded buffer length is halved to obtain the token
// count — presumably EncodeBuffer emits two bytes per token; confirm
// against the encoder implementation.
tokenCt := len (* gpt2Encoder .EncodeBuffer (& corpusBytes )) / 2
464
464
duration := time .Since (start )
465
- b .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
466
- len (corpus ), tokenCt , duration ))
465
+ b .Logf ("%v bytes into %v tokens over %v" ,
466
+ len (corpus ), tokenCt , duration )
467
467
}
468
468
469
469
func TestGPTEncoder_Encode (t * testing.T ) {
470
470
start := time .Now ()
471
471
tokenCt := len (* gpt2Encoder .Encode (& corpus ))
472
472
duration := time .Since (start )
473
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
474
- len (corpus ), tokenCt , duration ))
473
+ t .Logf ("%v bytes into %v tokens over %v" ,
474
+ len (corpus ), tokenCt , duration )
475
475
for testIdx := range GPTEncoderTests {
476
476
tokensPtr := * gpt2Encoder .Encode (
477
477
& (GPTEncoderTests [testIdx ].Input ))
@@ -492,19 +492,18 @@ func TestGPTEncoder_StreamingEncode(t *testing.T) {
492
492
tokenCt += len (* tokens )
493
493
}
494
494
duration := time .Since (start )
495
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
496
- len (corpus ), tokenCt , duration ))
495
+ t .Logf ("%v bytes into %v tokens over %v" ,
496
+ len (corpus ), tokenCt , duration )
497
497
}
498
498
499
499
func TestCLIPEncoder_Encode (t * testing.T ) {
500
500
start := time .Now ()
501
501
tokenCt := len (* clipEncoder .Encode (& corpus ))
502
502
duration := time .Since (start )
503
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
504
- len (corpus ), tokenCt , duration ))
503
+ t .Logf ("%v bytes into %v tokens over %v" ,
504
+ len (corpus ), tokenCt , duration )
505
505
for testIdx := range GPTEncoderTests {
506
- testStr := fmt .Sprintf ("%s" ,
507
- GPTEncoderTests [testIdx ].Input )
506
+ testStr := GPTEncoderTests [testIdx ].Input
508
507
tokensPtr := * clipEncoder .Encode (& testStr )
509
508
assert .Equal (t , GPTEncoderTests [testIdx ].CLIPExpected , tokensPtr )
510
509
}
@@ -514,8 +513,8 @@ func TestPileEncoder_Encode(t *testing.T) {
514
513
start := time .Now ()
515
514
tokenCt := len (* pileEncoder .Encode (& corpus ))
516
515
duration := time .Since (start )
517
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
518
- len (corpus ), tokenCt , duration ))
516
+ t .Logf ("%v bytes into %v tokens over %v" ,
517
+ len (corpus ), tokenCt , duration )
519
518
for testIdx := range GPTEncoderTests {
520
519
tokensPtr := * pileEncoder .Encode (
521
520
& (GPTEncoderTests [testIdx ].Input ))
@@ -527,8 +526,8 @@ func TestNerdstashEncoder_Encode(t *testing.T) {
527
526
start := time .Now ()
528
527
tokenCt := len (* nerdstashV2Encoder .Encode (& corpus ))
529
528
duration := time .Since (start )
530
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
531
- len (corpus ), tokenCt , duration ))
529
+ t .Logf ("%v bytes into %v tokens over %v" ,
530
+ len (corpus ), tokenCt , duration )
532
531
for testIdx := range GPTEncoderTests {
533
532
tokensPtr := * nerdstashV2Encoder .Encode (
534
533
& (GPTEncoderTests [testIdx ].Input ))
@@ -576,7 +575,7 @@ func TestNerdstashEncoder_Encode2(t *testing.T) {
576
575
encoded := nerdstashV2Encoder .Encode (& inputStr )
577
576
// check that the encoded string is the same as the expected
578
577
if ! assert .Equal (t , expected , * encoded ) {
579
- t .Log ( fmt . Sprintf ( "failure on input: `%v`" , inputStr ) )
578
+ t .Logf ( "failure on input: `%v`" , inputStr )
580
579
expectedRepr := []string {}
581
580
for _ , token := range expected {
582
581
expectedRepr = append (expectedRepr ,
@@ -587,14 +586,14 @@ func TestNerdstashEncoder_Encode2(t *testing.T) {
587
586
actualRepr = append (actualRepr ,
588
587
string (nerdstashV2Encoder .Decoder [token ]))
589
588
}
590
- t .Log ( fmt . Sprintf ( "expected: |%s" , strings .Join (expectedRepr , "|" ) ))
591
- t .Log ( fmt . Sprintf ( "actual: |%s" , strings .Join (actualRepr , "|" ) ))
589
+ t .Logf ( "expected: |%s" , strings .Join (expectedRepr , "|" ))
590
+ t .Logf ( "actual: |%s" , strings .Join (actualRepr , "|" ))
592
591
failCt += 1
593
592
} else {
594
593
passCt += 1
595
594
}
596
595
}
597
- t .Log ( fmt . Sprintf ( "pass: %v, fail: %v" , passCt , failCt ) )
596
+ t .Logf ( "pass: %v, fail: %v" , passCt , failCt )
598
597
}
599
598
600
599
func TestNerdstashEncoder_Decode (t * testing.T ) {
@@ -613,6 +612,9 @@ func TestGPTEncoder_Decode2(t *testing.T) {
613
612
} else {
614
613
tokens := TokensFromBin (& binTokens )
615
614
tokens , err = gpt2Encoder .TrimIncompleteSentence (tokens )
615
+ if err != nil {
616
+ t .Error (err )
617
+ }
616
618
assert .Equal (t , gpt2Encoder .Decode (tokens ), decodedCorpus )
617
619
}
618
620
}
@@ -626,8 +628,8 @@ func TestGPTEncoder_Decode(t *testing.T) {
626
628
decoded := gpt2Encoder .Decode (gpt2Encoded )
627
629
duration := time .Since (start )
628
630
tokenNumBytes := len (decoded )
629
- t .Log ( fmt . Sprintf ("%v tokens into %v bytes over %v\n " ,
630
- len (* gpt2Encoded ), tokenNumBytes , duration ))
631
+ t .Logf ("%v tokens into %v bytes over %v\n " ,
632
+ len (* gpt2Encoded ), tokenNumBytes , duration )
631
633
assert .Equal (t , corpus , decoded )
632
634
}
633
635
@@ -647,8 +649,7 @@ func TestCLIPEncoder_Decode(t *testing.T) {
647
649
duration := time .Since (start )
648
650
tokenNumBytes := len (decoded )
649
651
idxToStop := 229550
650
- t .Log (fmt .Sprintf ("%v tokens into %v bytes over %v\n " ,
651
- len (* clipEncoded ), tokenNumBytes , duration ))
652
+ t .Logf ("%v tokens into %v bytes over %v\n " , len (* clipEncoded ), tokenNumBytes , duration )
652
653
for idx := range clipCorpus {
653
654
if idx > idxToStop {
654
655
break
@@ -672,8 +673,8 @@ func TestPileEncoder_Decode(t *testing.T) {
672
673
decoded := pileEncoder .Decode (pileEncoded )
673
674
duration := time .Since (start )
674
675
tokenNumBytes := len (decoded )
675
- t .Log ( fmt . Sprintf ("%v tokens into %v bytes over %v\n " ,
676
- len (* pileEncoded ), tokenNumBytes , duration ))
676
+ t .Logf ("%v tokens into %v bytes over %v\n " ,
677
+ len (* pileEncoded ), tokenNumBytes , duration )
677
678
range_data := corpus
678
679
if len (corpus ) > len (decoded ) {
679
680
range_data = decoded
@@ -779,8 +780,8 @@ func TestLlamaEncoder_Encode(t *testing.T) {
779
780
start := time .Now ()
780
781
tokenCt := len (* gpt2Encoder .Encode (& corpus ))
781
782
duration := time .Since (start )
782
- t .Log ( fmt . Sprintf ("%v bytes into %v tokens over %v" ,
783
- len (corpus ), tokenCt , duration ))
783
+ t .Logf ("%v bytes into %v tokens over %v" ,
784
+ len (corpus ), tokenCt , duration )
784
785
for testIdx := range GPTEncoderTests {
785
786
tokensPtr := * gpt2Encoder .Encode (
786
787
& (GPTEncoderTests [testIdx ].Input ))
@@ -852,9 +853,7 @@ func TestReadTokenizerConfig(t *testing.T) {
852
853
destPath := "./TestReadTokenizerConfig"
853
854
destPathPTR := & destPath
854
855
defer os .RemoveAll (destPath )
855
- var rsrcType resources.ResourceType
856
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
857
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
856
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
858
857
os .MkdirAll (destPath , 0755 )
859
858
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
860
859
resources .RESOURCE_MODEL , rsrcType , hfApiToken )
@@ -898,9 +897,7 @@ func TestModelDownload(t *testing.T) {
898
897
destPath := "./TestModelDownload"
899
898
destPathPTR := & destPath
900
899
901
- var rsrcType resources.ResourceType
902
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
903
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
900
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
904
901
os .MkdirAll (destPath , 0755 )
905
902
defer os .RemoveAll (destPath )
906
903
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
@@ -975,9 +972,7 @@ func TestModelDownloadPythia(t *testing.T) {
975
972
destPath := "./TestModelDownloadPythia"
976
973
destPathPTR := & destPath
977
974
978
- var rsrcType resources.ResourceType
979
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
980
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
975
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
981
976
os .MkdirAll (destPath , 0755 )
982
977
defer os .RemoveAll (destPath )
983
978
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
@@ -1051,9 +1046,7 @@ func TestModelDownloadPythiaSharded(t *testing.T) {
1051
1046
destPath := "./TestModelDownloadPythiaSharded"
1052
1047
destPathPTR := & destPath
1053
1048
1054
- var rsrcType resources.ResourceType
1055
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
1056
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
1049
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
1057
1050
os .MkdirAll (destPath , 0755 )
1058
1051
defer os .RemoveAll (destPath )
1059
1052
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
@@ -1118,9 +1111,7 @@ func TestModelDownloadLlama(t *testing.T) {
1118
1111
destPathPTR := & destPath
1119
1112
defer os .RemoveAll (destPath )
1120
1113
1121
- var rsrcType resources.ResourceType
1122
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
1123
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
1114
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
1124
1115
os .MkdirAll (destPath , 0755 )
1125
1116
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
1126
1117
resources .RESOURCE_MODEL , rsrcType , hfApiToken )
@@ -1166,7 +1157,7 @@ func TestModelDownloadLlama(t *testing.T) {
1166
1157
}
1167
1158
1168
1159
matches := re .FindStringSubmatch (file .Name ())
1169
- if matches != nil && len (matches ) > 2 {
1160
+ if len (matches ) > 2 {
1170
1161
if strings .Compare (matches [1 ], matches [2 ]) == 0 {
1171
1162
found = true
1172
1163
break
@@ -1212,9 +1203,7 @@ func TestModelDownloadFairseq(t *testing.T) {
1212
1203
destPath := "./TestModelDownloadFairseq"
1213
1204
destPathPTR := & destPath
1214
1205
1215
- var rsrcType resources.ResourceType
1216
- rsrcType = resources .RESOURCETYPE_TRANSFORMERS
1217
- hfApiToken := os .Getenv ("HF_API_TOKEN" )
1206
+ rsrcType , hfApiToken := resources .RESOURCETYPE_TRANSFORMERS , os .Getenv ("HF_API_TOKEN" )
1218
1207
os .MkdirAll (destPath , 0755 )
1219
1208
defer os .RemoveAll (destPath )
1220
1209
_ , rsrcErr := resources .ResolveResources (modelId , destPathPTR ,
0 commit comments