
Commit d5b4809

Still working on dataloader
1 parent 2fd7362 commit d5b4809

3 files changed (+29 -11 lines)

config/generative_config.yaml

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 files:
-  - "train-00000-of-00645-b66ac786bf6fb553.parquet"
+  - test_data/test.parquet
+  #- "train-00000-of-00645-b66ac786bf6fb553.parquet"
 mlp:
   periodicity: null
   rescale_output: False
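
For reference, the files list is consumed by the PickAPic loader changed below, which reads each path with pd.read_parquet. A quick sanity check for the new test parquet might look like this sketch; the caption, jpg_0, and jpg_1 column names are assumptions taken from the dataloader code, not guaranteed by the config:

import pandas as pd

# Sketch: confirm the test parquet has the columns the dataloader expects.
# The column names below are assumed from PickAPic, not stated in the config.
data = pd.read_parquet("test_data/test.parquet")
assert {"caption", "jpg_0", "jpg_1"}.issubset(data.columns)
print(len(data), "rows:", list(data.columns))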

examples/text_to_image.py

Lines changed: 6 additions & 2 deletions
@@ -7,7 +7,7 @@
 from pytorch_lightning import Trainer
 import matplotlib.pyplot as plt
 from high_order_implicit_representation.networks import GenNet
-from pytorch_lightning.callbacks import LearningRateMonitor
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
 from high_order_implicit_representation.rendering import Text2ImageSampler
 from high_order_implicit_representation.single_image_dataset import (
     image_to_dataset,
@@ -41,11 +41,15 @@ def run_implicit_images(cfg: DictConfig):
         filename=full_path[0], batch_size=cfg.batch_size
     )
     lr_monitor = LearningRateMonitor(logging_interval="epoch")
+    checkpoint = ModelCheckpoint(
+        save_top_k=-1,  # Save all checkpoints
+        every_n_train_steps=50000,  # Save a checkpoint every 50000 steps
+    )
     trainer = Trainer(
         max_epochs=cfg.max_epochs,
         devices=cfg.gpus,
         accelerator=cfg.accelerator,
-        callbacks=[lr_monitor],
+        callbacks=[lr_monitor, checkpoint],
     )
     model = GenNet(cfg)
     trainer.fit(model, datamodule=data_module)
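
With save_top_k=-1 the callback keeps every checkpoint it writes rather than pruning to the best k, and every_n_train_steps counts training steps rather than epochs. A standalone sketch of the same wiring; the dirpath and filename arguments here are illustrative additions, not part of the commit:

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Sketch of the callback setup above, with assumed output paths.
checkpoint = ModelCheckpoint(
    dirpath="checkpoints/",      # assumed; the commit keeps the default location
    filename="gen-{step}",       # assumed; names each snapshot by global step
    save_top_k=-1,               # keep every checkpoint, never prune
    every_n_train_steps=50000,   # write a checkpoint every 50000 training steps
)
lr_monitor = LearningRateMonitor(logging_interval="epoch")
trainer = Trainer(max_epochs=1, callbacks=[lr_monitor, checkpoint])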

high_order_implicit_representation/single_image_dataset.py

Lines changed: 21 additions & 8 deletions
@@ -333,8 +333,9 @@ def test_dataloader(self) -> DataLoader:
 class PickAPic:
     def __init__(self, files: list[str]):
         self.files = files
+        self._generator = self.data_generator()
 
-    def __call__(self):
+    def data_generator(self):
         for file in self.files:
             data = pd.read_parquet(file)
 
@@ -344,30 +345,42 @@ def __call__(self):
                 img = Image.open(io.BytesIO(jpg_0))
                 arr = np.copy(np.asarray(img))
                 yield caption, torch.from_numpy(arr)
+
                 jpg_1 = row["jpg_1"]
                 img = Image.open(io.BytesIO(jpg_1))
                 arr = np.copy(np.asarray(img))
                 yield caption, torch.from_numpy(arr)
 
+    def __call__(self):
+        return self._generator
 
 class Text2ImageDataset(Dataset):
     def __init__(self, filenames: List[str]):
         super().__init__()
         self.dataset = PickAPic(files=filenames)
         self.sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
         self.generator = self.gen_data()
+        self._length = 0  # int(1e6)
+        self.count = 0
 
     def __len__(self):
-        return int(1e6)
+        return self._length or int(1e12)
 
     def gen_data(self):
 
-        caption, image = next(self.dataset())
-        caption_embedding = self.sentence_model.encode(caption)
-        flattened_image, flattened_position, image = simple_image_to_dataset(image)
-
-        for index, rgb in enumerate(flattened_image):
-            yield caption_embedding, flattened_position[index], rgb
+        for batch in self.dataset():
+            print('batch', batch)
+            caption, image = batch
+            caption_embedding = self.sentence_model.encode(caption)
+            print('next image')
+            flattened_image, flattened_position, image = simple_image_to_dataset(image)
+            if self.count == 0:
+                self._length += len(flattened_image)
+
+            for index, rgb in enumerate(flattened_image):
+                yield caption_embedding, flattened_position[index], rgb
+
+            self.count += 1
 
     def __getitem__(self, idx):
         # I'm totally ignoring the index
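
The PickAPic change is the core of this commit: __init__ now caches a single generator and __call__ returns that same object, so the loop in gen_data resumes the stream where it left off instead of restarting from the first parquet file on every call, while __len__ reports a large placeholder until the first image has been counted. A minimal sketch of that persistent-generator pattern with toy data:

# Sketch of the persistent-generator pattern used by PickAPic, on toy data.
class Stream:
    def __init__(self, items):
        self._generator = self.data_generator(items)

    def data_generator(self, items):
        for item in items:
            yield item

    def __call__(self):
        # Hand back the one cached generator so iteration resumes
        # where it left off rather than restarting.
        return self._generator

stream = Stream([1, 2, 3, 4])
print([next(stream()) for _ in range(2)])  # [1, 2]
print(list(stream()))                      # [3, 4]: same, partly consumed generator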
