
Commit f7a786e

Fixed Seq2seq training issue

1 parent 73063c2 · commit f7a786e

16 files changed: +319 -65 lines changed

examples/example.py

Lines changed: 185 additions & 17 deletions
@@ -1,5 +1,4 @@
 from datasets import load_dataset
-
 from textpredict import (
     Benchmarking,
     Explainability,
@@ -10,28 +9,31 @@
     clean_text,
     initialize,
     load_data,
+    set_device,
 )
 
 
 # Function to test simple prediction using default model
 def text_simple_prediction():
+    set_device("gpu")
+
     # sentiment
     texts = ["I love this product!", "I love this product!"]
     text = "I love this product!"
-    model = initialize(task="sentiment")
+    model = initialize(task="sentiment", device="gpu")
     result = model.analyze(texts, return_probs=False)
     print(f"Simple Prediction Result: {result}")
 
     # # emotion
     text = ["I am happy today", "I am happy today"]
-    model = initialize(task="emotion")
+    model = initialize(task="emotion", device="gpu")
     result = model.analyze(text, return_probs=False)
     print(f"Emotion found : {result}")
 
     # zeroshot
     texts = ["I am happy today", "I am happy today"]
     text = "I am happy today"
-    model = initialize(task="zeroshot")
+    model = initialize(task="zeroshot", device="gpu")
 
     result = model.analyze(
         text, candidate_labels=["negative", "positive"], return_probs=True
@@ -49,6 +51,7 @@ def text_simple_prediction():
 
 # Function to test prediction using a Hugging Face model
 def text_hf_prediction():
+    set_device("cuda")
     text = "I love this product!"
 
     model = initialize(
@@ -81,7 +84,10 @@ def text_hf_prediction():
     print(f"Zeroshot Prediction Result: {result}")
 
     # ner
-    texts = ["I am in London, united kingdom", "I am in Manchester, united kingdom"]
+    texts = [
+        "I am in London, united kingdom",
+        "I am in Manchester, united kingdom",
+    ]  # noqa: F841
     text = "I am in Manchester, united kingdom"
 
     model = initialize(task="ner", source="huggingface")
@@ -91,9 +97,11 @@ def text_hf_prediction():
 
 # Function to train a sequence classification model
 def train_sequence_classification():
+    set_device("cuda")
+
     # Load and preprocess the dataset
-    raw_train_dataset = load_dataset("imdb", split="train[:10]")
-    raw_validation_dataset = load_dataset("imdb", split="test[:10]")
+    raw_train_dataset = load_dataset("imdb", split="train[:100]")
+    raw_validation_dataset = load_dataset("imdb", split="test[:100]")
 
     tokenized_train_dataset = load_data(dataset=raw_train_dataset, splits=["train"])
     tokenized_validation_dataset = load_data(
@@ -111,7 +119,7 @@ def train_sequence_classification():
     trainer = SequenceClassificationTrainer(
         model_name="bert-base-uncased",
         output_dir="./results_new",
-        device="cpu",
+        device="cuda",
         training_config=training_config,
     )
 
@@ -137,8 +145,160 @@ def train_sequence_classification():
     print("result", result)
 
 
+def train_seq2seq():
+    from datasets import load_dataset  # type: ignore
+    from textpredict import Seq2seqTrainer, load_data
+
+    ds = load_dataset("google-research-datasets/mbpp", "sanitized")
+
+    # Load dataset
+    dataset = load_data(
+        dataset=ds,
+        splits=["train", "validation", "test"],
+        text_column="prompt",
+        label_column="code",
+    )
+
+    # Initialize the trainer
+    trainer = Seq2seqTrainer(
+        model_name="google/flan-t5-small",
+        output_dir="./seq2seq_model",
+        training_config={
+            "num_train_epochs": 3,
+            "per_device_train_batch_size": 8,
+            "per_device_eval_batch_size": 8,
+            "learning_rate": 3e-5,
+            "logging_dir": "./logs",
+            "evaluation_strategy": "epoch",
+            "save_strategy": "epoch",
+            "save_total_limit": 2,
+            "load_best_model_at_end": True,
+        },
+    )
+
+    # Set datasets
+    trainer.train_dataset = dataset["train"]
+    trainer.val_dataset = dataset["validation"]
+
+    # Start training
+    trainer.train()
+
+    # Save the model
+    trainer.save()
+
+    metrics = trainer.get_metrics()
+    print(f"Training Metrics: {metrics}")
+
+    evaluate = trainer.evaluate(test_dataset=dataset["test"])
+    print(f"Evaluation Metrics: {evaluate}")
+
+    model = initialize(model_name="./seq2seq_model", task="seq2seq")
+
+    text = "Summarize the following document: ..."
+
+    result = model.analyze(text, return_probs=True)
+
+    print("result", result)
+
+
+# def train_token_classification():
+
+#     import torch  # type: ignore
+#     from textpredict import TokenClassificationTrainer  # noqa: E402
+#     from transformers import AutoTokenizer  # type: ignore
+
+#     # Set device to cuda if available
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+#     # Load tokenizer
+#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+#     # Load and preprocess the dataset
+#     raw_train_dataset = load_dataset("conll2003", split="train[:100]")
+#     raw_validation_dataset = load_dataset("conll2003", split="validation[:100]")
+
+#     # Tokenize the datasets
+#     def tokenize_and_align_labels(examples):
+#         tokenized_inputs = tokenizer(
+#             examples["tokens"],
+#             truncation=True,
+#             is_split_into_words=True,
+#             padding="max_length",
+#             max_length=128,
+#         )
+#         labels = []
+#         for i, label in enumerate(examples["ner_tags"]):
+#             word_ids = tokenized_inputs.word_ids(batch_index=i)
+#             label_ids = []
+#             previous_word_idx = None
+#             for word_idx in word_ids:
+#                 if word_idx is None:
+#                     label_ids.append(-100)
+#                 elif word_idx != previous_word_idx:
+#                     label_ids.append(label[word_idx])
+#                 else:
+#                     label_ids.append(-100)
+#                 previous_word_idx = word_idx
+#             labels.append(label_ids)
+#         tokenized_inputs["labels"] = labels
+#         return tokenized_inputs
+
+#     tokenized_train_dataset = raw_train_dataset.map(
+#         tokenize_and_align_labels, batched=True
+#     )
+#     tokenized_validation_dataset = raw_validation_dataset.map(
+#         tokenize_and_align_labels, batched=True
+#     )
+
+#     # Set the format for PyTorch tensors
+#     tokenized_train_dataset.set_format(
+#         type="torch", columns=["input_ids", "attention_mask", "labels"]
+#     )
+#     tokenized_validation_dataset.set_format(
+#         type="torch", columns=["input_ids", "attention_mask", "labels"]
+#     )
+
+#     # Define training configuration
+#     training_config = {
+#         "num_train_epochs": 3,
+#         "per_device_train_batch_size": 8,
+#     }
+
+#     # Initialize the trainer
+#     trainer = TokenClassificationTrainer(
+#         model_name="bert-base-uncased",
+#         output_dir="./results_token_classification",
+#         device=device,
+#         training_config=training_config,
+#     )
+
+#     # Assign the preprocessed training data to the trainer
+#     trainer.train_dataset = tokenized_train_dataset
+#     trainer.val_dataset = tokenized_validation_dataset
+
+#     # Train the model
+#     trainer.train(from_checkpoint=False)
+#     trainer.save()
+#     metrics = trainer.get_metrics()
+#     print(f"Training Metrics: {metrics}")
+
+#     evaluate = trainer.evaluate(test_dataset=tokenized_validation_dataset)
+#     print(f"Evaluation Metrics: {evaluate}")
+
+#     model = initialize(
+#         model_name="./results_token_classification", task="token_classification"
+#     )
+
+#     text = "Hawking was a theoretical physicist."
+
+#     result = model.analyze(text, return_probs=True)
+
+#     print("result", result)
+
+
 # Function to evaluate a sequence classification model
 def evaluate_sequence_classification():
+    set_device("cuda")
     # Load and preprocess the dataset
     raw_test_dataset = load_dataset("imdb", split="test[:10]")
 
@@ -151,7 +311,7 @@ def evaluate_sequence_classification():
 
     evaluator = SequenceClassificationEvaluator(
         model_name="bert-base-uncased",
-        device="cpu",
+        device="cuda",
         evaluation_config=evaluation_config,
     )
 
@@ -223,17 +383,25 @@ def main():
     # print("Running Simple Prediction...")
     # text_simple_prediction()
 
-    print("\nRunning Hugging Face Prediction...")
-    text_hf_prediction()
+    # print("\nRunning Hugging Face Prediction...")
+    # text_hf_prediction()
+
+    # print("\nTraining Sequence Classification Model...")
+    # train_sequence_classification()
+
+    # Run the training function
+    print("\nTraining Seq2seq Model...")
+    train_seq2seq()
 
-    print("\nTraining Sequence Classification Model...")
-    train_sequence_classification()
+    # Run the training function
+    # print("\nTraining Token Classification Model...")
+    # train_token_classification()
 
-    print("\nEvaluating Sequence Classification Model...")
-    evaluate_sequence_classification()
+    # print("\nEvaluating Sequence Classification Model...")
+    # evaluate_sequence_classification()
 
-    print("\nBenchmarking Model...")
-    benchmark_model()
+    # print("\nBenchmarking Model...")
+    # benchmark_model()
 
     # print("\nVisualizing Metrics...")
     # visualize_metrics()

examples/training_seq2seq.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+import logging
+
+from datasets import load_dataset  # type: ignore
+from textpredict import Seq2seqTrainer, initialize, load_data, set_device
+
+logging.basicConfig(level=logging.INFO)
+
+
+def train_seq2seq():
+    set_device("cpu")
+
+    # Load the dataset
+    ds = load_dataset("google-research-datasets/mbpp", "sanitized")
+
+    # Load and preprocess dataset with specified text column
+    dataset = load_data(
+        dataset=ds,
+        splits=["train", "test"],
+        text_column="prompt",
+        label_column="code",
+    )
+
+    # Initialize the trainer
+    trainer = Seq2seqTrainer(
+        model_name="google/flan-t5-small",
+        output_dir="./seq2seq_model",
+        training_config={
+            "num_train_epochs": 0.064,
+            "per_device_train_batch_size": 2,
+            "per_device_eval_batch_size": 2,
+            "learning_rate": 3e-5,
+            "logging_dir": "./logs",
+            "evaluation_strategy": "epoch",
+            "save_strategy": "epoch",
+            "save_total_limit": 2,
+            "load_best_model_at_end": True,
+        },
+        device="cpu",
+    )
+
+    # Set datasets
+    trainer.train_dataset = dataset["train"]
+    trainer.val_dataset = dataset["test"]
+
+    # Start training
+    trainer.train()
+
+    # Save the model
+    trainer.save()
+
+    # Get training metrics
+    metrics = trainer.get_metrics()
+    print(f"Training Metrics: {metrics}")
+
+    # Evaluate the model
+    evaluate = trainer.evaluate(test_dataset=dataset["test"])
+    print(f"Evaluation Metrics: {evaluate}")
+
+    # Load the trained model
+    model = initialize(model_name="./seq2seq_model", task="seq2seq")
+
+    # Analyze a sample text
+    text = "Summarize the following document: ..."
+    result = model.analyze(text, return_probs=True)
+    print("Result:", result)
+
+
+if __name__ == "__main__":
+    train_seq2seq()
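
The standalone example above trains google/flan-t5-small on the MBPP "sanitized" split for a fraction of an epoch (num_train_epochs=0.064) and then reloads the saved model. For reference, a minimal inference-only sketch built from the same textpredict calls used in the example (the prompt string below is illustrative and not part of the commit) might look like:

    from textpredict import initialize, set_device

    set_device("cpu")  # switch to "cuda" when a GPU is available

    # Load the seq2seq model written to ./seq2seq_model by trainer.save()
    model = initialize(model_name="./seq2seq_model", task="seq2seq")

    # Prompt in the same style as the MBPP "prompt" column used for training
    result = model.analyze(
        "Write a python function to check whether a number is prime.",
        return_probs=True,
    )
    print("Result:", result)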

0 commit comments
