1
1
from datasets import load_dataset
2
-
3
2
from textpredict import (
4
3
Benchmarking ,
5
4
Explainability ,
10
9
clean_text ,
11
10
initialize ,
12
11
load_data ,
12
+ set_device ,
13
13
)
14
14
15
15
16
16
# Function to test simple prediction using default model
17
17
def text_simple_prediction ():
18
+ set_device ("gpu" )
19
+
18
20
# sentiment
19
21
texts = ["I love this product!" , "I love this product!" ]
20
22
text = "I love this product!"
21
- model = initialize (task = "sentiment" )
23
+ model = initialize (task = "sentiment" , device = "gpu" )
22
24
result = model .analyze (texts , return_probs = False )
23
25
print (f"Simple Prediction Result: { result } " )
24
26
25
27
# # emotion
26
28
text = ["I am happy today" , "I am happy today" ]
27
- model = initialize (task = "emotion" )
29
+ model = initialize (task = "emotion" , device = "gpu" )
28
30
result = model .analyze (text , return_probs = False )
29
31
print (f"Emotion found : { result } " )
30
32
31
33
# zeroshot
32
34
texts = ["I am happy today" , "I am happy today" ]
33
35
text = "I am happy today"
34
- model = initialize (task = "zeroshot" )
36
+ model = initialize (task = "zeroshot" , device = "gpu" )
35
37
36
38
result = model .analyze (
37
39
text , candidate_labels = ["negative" , "positive" ], return_probs = True
@@ -49,6 +51,7 @@ def text_simple_prediction():
49
51
50
52
# Function to test prediction using a Hugging Face model
51
53
def text_hf_prediction ():
54
+ set_device ("cuda" )
52
55
text = "I love this product!"
53
56
54
57
model = initialize (
@@ -81,7 +84,10 @@ def text_hf_prediction():
81
84
print (f"Zeroshot Prediction Result: { result } " )
82
85
83
86
# ner
84
- texts = ["I am in London, united kingdom" , "I am in Manchester, united kingdom" ]
87
+ texts = [
88
+ "I am in London, united kingdom" ,
89
+ "I am in Manchester, united kingdom" ,
90
+ ] # noqa: F841
85
91
text = "I am in Manchester, united kingdom"
86
92
87
93
model = initialize (task = "ner" , source = "huggingface" )
@@ -91,9 +97,11 @@ def text_hf_prediction():
91
97
92
98
# Function to train a sequence classification model
93
99
def train_sequence_classification ():
100
+ set_device ("cuda" )
101
+
94
102
# Load and preprocess the dataset
95
- raw_train_dataset = load_dataset ("imdb" , split = "train[:10 ]" )
96
- raw_validation_dataset = load_dataset ("imdb" , split = "test[:10 ]" )
103
+ raw_train_dataset = load_dataset ("imdb" , split = "train[:100 ]" )
104
+ raw_validation_dataset = load_dataset ("imdb" , split = "test[:100 ]" )
97
105
98
106
tokenized_train_dataset = load_data (dataset = raw_train_dataset , splits = ["train" ])
99
107
tokenized_validation_dataset = load_data (
@@ -111,7 +119,7 @@ def train_sequence_classification():
111
119
trainer = SequenceClassificationTrainer (
112
120
model_name = "bert-base-uncased" ,
113
121
output_dir = "./results_new" ,
114
- device = "cpu " ,
122
+ device = "cuda " ,
115
123
training_config = training_config ,
116
124
)
117
125
@@ -137,8 +145,160 @@ def train_sequence_classification():
137
145
print ("result" , result )
138
146
139
147
148
def train_seq2seq():
    """Fine-tune, evaluate and smoke-test a seq2seq model on the MBPP dataset.

    Steps performed, in order:

    1. Load the ``"sanitized"`` configuration of
       ``google-research-datasets/mbpp``.
    2. Prepare its ``train`` / ``validation`` / ``test`` splits with
       ``load_data``, using ``prompt`` as the text column and ``code`` as the
       label column.
    3. Train ``google/flan-t5-small`` with ``Seq2seqTrainer`` and call
       ``trainer.save()``.
    4. Print the training metrics and the metrics of an evaluation on the
       ``test`` split.
    5. Reload the model with ``initialize`` and print one prediction.

    The function takes no arguments and returns nothing; all results are
    printed to stdout.
    """
    from datasets import load_dataset  # type: ignore
    from textpredict import Seq2seqTrainer, load_data

    # Single source of truth for where the fine-tuned model lives, so the
    # trainer and the later reload cannot drift apart.
    output_dir = "./seq2seq_model"

    ds = load_dataset("google-research-datasets/mbpp", "sanitized")

    # Load dataset
    dataset = load_data(
        dataset=ds,
        splits=["train", "validation", "test"],
        text_column="prompt",
        label_column="code",
    )

    # Initialize the trainer
    trainer = Seq2seqTrainer(
        model_name="google/flan-t5-small",
        output_dir=output_dir,
        training_config={
            "num_train_epochs": 3,
            "per_device_train_batch_size": 8,
            "per_device_eval_batch_size": 8,
            "learning_rate": 3e-5,
            "logging_dir": "./logs",
            "evaluation_strategy": "epoch",
            "save_strategy": "epoch",
            "save_total_limit": 2,
            "load_best_model_at_end": True,
        },
    )

    # Set datasets
    trainer.train_dataset = dataset["train"]
    trainer.val_dataset = dataset["validation"]

    # Start training
    trainer.train()

    # Save the model
    trainer.save()

    metrics = trainer.get_metrics()
    print(f"Training Metrics: {metrics}")

    evaluate = trainer.evaluate(test_dataset=dataset["test"])
    print(f"Evaluation Metrics: {evaluate}")

    # Reload from the directory the trainer was configured with. The previous
    # hard-coded "./results_seq2seq" is not written anywhere in this function.
    # NOTE(review): assumes trainer.save() writes into output_dir - confirm.
    model = initialize(model_name=output_dir, task="seq2seq")

    # NOTE(review): this is a summarization-style prompt although the model
    # was trained on prompt -> code pairs; kept unchanged, confirm intent.
    text = "Summarize the following document: ..."

    result = model.analyze(text, return_probs=True)

    print("result", result)
202
+
203
+
204
+ # def train_token_classification():
205
+
206
+ # import torch # type: ignore
207
+ # from textpredict import TokenClassificationTrainer # noqa: E402
208
+ # from transformers import AutoTokenizer # type: ignore
209
+
210
+ # # Set device to cuda if available
211
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
212
+
213
+ # # Load tokenizer
214
+ # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
215
+
216
+ # # Load and preprocess the dataset
217
+ # raw_train_dataset = load_dataset("conll2003", split="train[:100]")
218
+ # raw_validation_dataset = load_dataset("conll2003", split="validation[:100]")
219
+
220
+ # # Tokenize the datasets
221
+ # def tokenize_and_align_labels(examples):
222
+ # tokenized_inputs = tokenizer(
223
+ # examples["tokens"],
224
+ # truncation=True,
225
+ # is_split_into_words=True,
226
+ # padding="max_length",
227
+ # max_length=128,
228
+ # )
229
+ # labels = []
230
+ # for i, label in enumerate(examples["ner_tags"]):
231
+ # word_ids = tokenized_inputs.word_ids(batch_index=i)
232
+ # label_ids = []
233
+ # previous_word_idx = None
234
+ # for word_idx in word_ids:
235
+ # if word_idx is None:
236
+ # label_ids.append(-100)
237
+ # elif word_idx != previous_word_idx:
238
+ # label_ids.append(label[word_idx])
239
+ # else:
240
+ # label_ids.append(-100)
241
+ # previous_word_idx = word_idx
242
+ # labels.append(label_ids)
243
+ # tokenized_inputs["labels"] = labels
244
+ # return tokenized_inputs
245
+
246
+ # tokenized_train_dataset = raw_train_dataset.map(
247
+ # tokenize_and_align_labels, batched=True
248
+ # )
249
+ # tokenized_validation_dataset = raw_validation_dataset.map(
250
+ # tokenize_and_align_labels, batched=True
251
+ # )
252
+
253
+ # # Set the format for PyTorch tensors
254
+ # tokenized_train_dataset.set_format(
255
+ # type="torch", columns=["input_ids", "attention_mask", "labels"]
256
+ # )
257
+ # tokenized_validation_dataset.set_format(
258
+ # type="torch", columns=["input_ids", "attention_mask", "labels"]
259
+ # )
260
+
261
+ # # Define training configuration
262
+ # training_config = {
263
+ # "num_train_epochs": 3,
264
+ # "per_device_train_batch_size": 8,
265
+ # }
266
+
267
+ # # Initialize the trainer
268
+ # trainer = TokenClassificationTrainer(
269
+ # model_name="bert-base-uncased",
270
+ # output_dir="./results_token_classification",
271
+ # device=device,
272
+ # training_config=training_config,
273
+ # )
274
+
275
+ # # Assign the preprocessed training data to the trainer
276
+ # trainer.train_dataset = tokenized_train_dataset
277
+ # trainer.val_dataset = tokenized_validation_dataset
278
+
279
+ # # Train the model
280
+ # trainer.train(from_checkpoint=False)
281
+ # trainer.save()
282
+ # metrics = trainer.get_metrics()
283
+ # print(f"Training Metrics: {metrics}")
284
+
285
+ # evaluate = trainer.evaluate(test_dataset=tokenized_validation_dataset)
286
+ # print(f"Evaluation Metrics: {evaluate}")
287
+
288
+ # model = initialize(
289
+ # model_name="./results_token_classification", task="token_classification"
290
+ # )
291
+
292
+ # text = "Hawking was a theoretical physicist."
293
+
294
+ # result = model.analyze(text, return_probs=True)
295
+
296
+ # print("result", result)
297
+
298
+
140
299
# Function to evaluate a sequence classification model
141
300
def evaluate_sequence_classification ():
301
+ set_device ("cuda" )
142
302
# Load and preprocess the dataset
143
303
raw_test_dataset = load_dataset ("imdb" , split = "test[:10]" )
144
304
@@ -151,7 +311,7 @@ def evaluate_sequence_classification():
151
311
152
312
evaluator = SequenceClassificationEvaluator (
153
313
model_name = "bert-base-uncased" ,
154
- device = "cpu " ,
314
+ device = "cuda " ,
155
315
evaluation_config = evaluation_config ,
156
316
)
157
317
@@ -223,17 +383,25 @@ def main():
223
383
# print("Running Simple Prediction...")
224
384
# text_simple_prediction()
225
385
226
- print ("\n Running Hugging Face Prediction..." )
227
- text_hf_prediction ()
386
+ # print("\nRunning Hugging Face Prediction...")
387
+ # text_hf_prediction()
388
+
389
+ # print("\nTraining Sequence Classification Model...")
390
+ # train_sequence_classification()
391
+
392
+ # Run the training function
393
+ print ("\Trainig Seq2seq Model..." )
394
+ train_seq2seq ()
228
395
229
- print ("\n Training Sequence Classification Model..." )
230
- train_sequence_classification ()
396
+ # Run the training function
397
+ # print("\nTraining Token classification Model...")
398
+ # train_token_classification()
231
399
232
- print ("\n Evaluating Sequence Classification Model..." )
233
- evaluate_sequence_classification ()
400
+ # print("\nEvaluating Sequence Classification Model...")
401
+ # evaluate_sequence_classification()
234
402
235
- print ("\n Benchmarking Model..." )
236
- benchmark_model ()
403
+ # print("\nBenchmarking Model...")
404
+ # benchmark_model()
237
405
238
406
# print("\nVisualizing Metrics...")
239
407
# visualize_metrics()
0 commit comments