Merge pull request #28 from VectorInstitute/improve_pmc2m

yasamanparhizkar · web-flow · commit fc9e950fe251 · 2024-12-05T20:43:18.000-05:00
Cleaned PMC-2M+inline Dataset
diff --git a/openpmcvl/experiment/configs/experiment/biomedclip_ppr.yaml b/openpmcvl/experiment/configs/experiment/biomedclip_ppr.yaml
@@ -108,7 +108,7 @@ trainer:
   callbacks:
     model_checkpoint:
       monitor: val/loss
-      save_top_k: 1
+      save_top_k: -1
       save_last: True
       every_n_epochs: 1
       dirpath: /checkpoint/${oc.env:USER}/${oc.env:SLURM_JOB_ID} # only works on Vector SLURM environment
diff --git a/openpmcvl/experiment/configs/experiment/pmcoa2_matched_512.yaml b/openpmcvl/experiment/configs/experiment/pmcoa2_matched_512.yaml
@@ -31,15 +31,15 @@ seed: 0
 datasets:
   train:
     pmc2m_sum:
-      split: train_clean
+      split: train_clean_sep
   val:
     pmc2m_sum:
-      split: valid_clean
+      split: valid_clean_sep
       transform:
         job_type: eval
   test:
     pmc2m_sum:
-      split: test_clean
+      split: test_clean_sep
       transform:
         job_type: eval
 
diff --git a/openpmcvl/experiment/datasets/pmc2m_sum.py b/openpmcvl/experiment/datasets/pmc2m_sum.py
@@ -41,7 +41,7 @@ def __init__(
         ] = None,
     ) -> None:
         """Initialize the dataset."""
-        data_path = os.path.join(root_dir, f"{split}.jsonl")
+        data_path = os.path.join(root_dir, "clean", f"{split}.jsonl")
         with open(data_path, encoding="utf-8") as file:
             entries = [json.loads(line) for line in file.readlines()]
         self.entries = entries
@@ -62,17 +62,16 @@ def __getitem__(self, idx: int) -> Example:
         try:
             with Image.open(entry["image_fullpath"]) as img:
                 image = img.convert("RGB")
+            with open(entry["caption_fullpath"], encoding="utf-8") as file:
+                caption = file.read()
         except Exception as e:
             print(
-                f"Error loading image for entry {idx}: image_path={entry['image_fullpath']}",
+                f"Error loading image or caption for entry {idx}: image_path={entry['image_fullpath']} caption_path={entry['caption_fullpath']}",
                 e,
             )
             idx = (idx + 1) % len(self.entries)
             return self.__getitem__(idx)
 
-        # load text
-        caption = " ".join([entry["caption"], entry["intext_refs_summary"]])
-
         # apply transform and tokenization
         if self.transform is not None:
             image = self.transform(image)
diff --git a/openpmcvl/experiment/datasets/pmc2m_sum_old.py b/openpmcvl/experiment/datasets/pmc2m_sum_old.py
@@ -0,0 +1,103 @@
+"""PMC-2M with summarized inline references Dataset."""
+
+import json
+import os
+from typing import Callable, Dict, Literal, Optional, Union
+
+import torch
+from mmlearn.conf import external_store
+from mmlearn.constants import EXAMPLE_INDEX_KEY
+from mmlearn.datasets.core import Modalities
+from mmlearn.datasets.core.example import Example
+from omegaconf import MISSING
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision.transforms import ToTensor
+
+
+@external_store(group="datasets", root_dir=os.getenv("PMC2M_SUMM_ROOT_DIR", MISSING))
+class PMC2MSum(Dataset[Example]):
+    """PMC-2M with summarized inline references dataset.
+
+    Parameters
+    ----------
+    root_dir : str
+        Path to the root folder containing jsonl file with data entries.
+    split : {"train", "valid", "test"}
+        Dataset split; ``<root_dir>/<split>.jsonl`` is the file that is read.
+    transform : Optional[Callable], default=None
+        Transform applied to images; ``ToTensor`` is used when omitted.
+    tokenizer : Optional[Callable], default=None
+        Function applied to textual captions.
+    """
+
+    def __init__(
+        self,
+        root_dir: str,
+        split: Literal["train", "valid", "test"] = "train",
+        transform: Optional[Callable[[Image.Image], torch.Tensor]] = None,
+        tokenizer: Optional[
+            Callable[[str], Union[torch.Tensor, Dict[str, torch.Tensor]]]
+        ] = None,
+    ) -> None:
+        """Load every jsonl entry of the split into memory."""
+        data_path = os.path.join(root_dir, f"{split}.jsonl")
+        with open(data_path, encoding="utf-8") as file:
+            self.entries = [json.loads(line) for line in file]
+        self.root_dir = root_dir
+        self.transform = ToTensor() if transform is None else transform
+        self.tokenizer = tokenizer
+
+    def __getitem__(self, idx: int) -> Example:
+        """Return the idx'th sample, or the next one whose image loads.
+
+        Raises
+        ------
+        RuntimeError
+            If none of the images in the dataset can be loaded.
+        """
+        entry = self.entries[idx]  # IndexError for an out-of-range idx
+        # Bounded loop instead of recursion: bad runs cannot exhaust the stack.
+        for _ in range(len(self.entries)):
+            try:
+                with Image.open(entry["image_fullpath"]) as img:
+                    image = img.convert("RGB")
+                break
+            except Exception as e:
+                print(
+                    f"Error loading image for entry {idx}: image_path={entry['image_fullpath']}",
+                    e,
+                )
+                idx = (idx + 1) % len(self.entries)
+                entry = self.entries[idx]
+        else:
+            raise RuntimeError("Could not load any image from the dataset.")
+
+        # load text
+        caption = " ".join([entry["caption"], entry["intext_refs_summary"]])
+        # apply transform and tokenization
+        if self.transform is not None:
+            image = self.transform(image)
+        tokens = self.tokenizer(caption) if self.tokenizer is not None else None
+
+        example = Example(
+            {
+                Modalities.RGB.name: image,
+                Modalities.TEXT.name: caption,
+                EXAMPLE_INDEX_KEY: idx,
+            }
+        )
+
+        if isinstance(tokens, dict):  # output of HFTokenizer
+            assert (
+                Modalities.TEXT.name in tokens
+            ), f"Missing key `{Modalities.TEXT.name}` in tokens."
+            example.update(tokens)
+        elif tokens is not None:
+            example[Modalities.TEXT.name] = tokens
+
+        return example
+
+    def __len__(self) -> int:
+        """Return the length of the dataset."""
+        return len(self.entries)
diff --git a/openpmcvl/experiment/scripts/eval/pmc_oa_2/ppr.sh b/openpmcvl/experiment/scripts/eval/pmc_oa_2/ppr.sh
@@ -7,7 +7,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=420 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
@@ -30,7 +30,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=20 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
diff --git a/openpmcvl/experiment/scripts/eval/roco/ppr.sh b/openpmcvl/experiment/scripts/eval/roco/ppr.sh
@@ -7,7 +7,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=420 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
@@ -30,7 +30,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=20 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
diff --git a/openpmcvl/experiment/scripts/eval/vitb16_bert256_pmcoa/ppr.sh b/openpmcvl/experiment/scripts/eval/vitb16_bert256_pmcoa/ppr.sh
@@ -7,7 +7,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=420 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
@@ -30,7 +30,7 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     hydra.launcher.tasks_per_node=4 \
     hydra.launcher.nodes=1 \
     hydra.launcher.stderr_to_stdout=true \
-    hydra.launcher.timeout_min=900 \
+    hydra.launcher.timeout_min=20 \
     '+hydra.launcher.additional_parameters={export: ALL}' \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=biomedclip_ppr \
diff --git a/openpmcvl/experiment/scripts/train/pmc_oa_2_512/pmc_oa_2_cl512_train_bs256_slurm.sh b/openpmcvl/experiment/scripts/train/pmc_oa_2_512/pmc_oa_2_cl512_train_bs256_slurm.sh
@@ -14,6 +14,31 @@ mmlearn_run --multirun hydra.launcher.mem_gb=0 \
     experiment_name=pmcoa2_matched_512_train \
     dataloader.train.batch_size=128 \
     dataloader.val.batch_size=32 \
+    dataloader.train.num_workers=4 \
+    dataloader.val.num_workers=4 \
+    task.encoders.text.pretrained=False \
+    task.encoders.rgb.pretrained=False \
+    trainer.max_epochs=64 \
+    task.lr_scheduler.scheduler.t_max=54476 \
+    task.lr_scheduler.scheduler.warmup_length=5448 \
+    ~trainer.callbacks.early_stopping
+
+# a100
+mmlearn_run --multirun hydra.launcher.mem_gb=0 \
+    hydra.launcher.qos=a100_arashaf \
+    hydra.launcher.partition=a100 \
+    hydra.launcher.gres=gpu:4 \
+    hydra.launcher.cpus_per_task=4 \
+    hydra.launcher.tasks_per_node=4 \
+    hydra.launcher.nodes=1 \
+    hydra.launcher.stderr_to_stdout=true \
+    hydra.launcher.timeout_min=600 \
+    '+hydra.launcher.additional_parameters={export: ALL}' \
+    'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
+    +experiment=pmcoa2_matched_512 \
+    experiment_name=pmcoa2_matched_512_train \
+    dataloader.train.batch_size=256 \
+    dataloader.val.batch_size=32 \
     dataloader.train.num_workers=3 \
     dataloader.val.num_workers=3 \
     task.encoders.text.pretrained=False \
diff --git a/openpmcvl/experiment/scripts/train/pmc_oa_2_512/pmc_oa_2_cl512_train_bs256_slurm_multinode.slrm b/openpmcvl/experiment/scripts/train/pmc_oa_2_512/pmc_oa_2_cl512_train_bs256_slurm_multinode.slrm
@@ -34,10 +34,10 @@ srun mmlearn_run \
     'hydra.searchpath=[pkg://openpmcvl.experiment.configs]' \
     +experiment=pmcoa2_matched_512 \
     experiment_name=pmcoa2_matched_512_train \
-    dataloader.train.batch_size=8 \
+    dataloader.train.batch_size=256 \
     dataloader.val.batch_size=32 \
-    dataloader.train.num_workers=4 \
-    dataloader.val.num_workers=4 \
+    dataloader.train.num_workers=2 \
+    dataloader.val.num_workers=2 \
     task.encoders.text.pretrained=False \
     task.encoders.rgb.pretrained=False \
     trainer.max_epochs=64 \
diff --git a/openpmcvl/experiment/tests/test_datasets.py b/openpmcvl/experiment/tests/test_datasets.py
@@ -168,11 +168,12 @@ def test_pmc2m_sum():
     ), "Please set PMC2M-Sum root directory in `PMC2M_SUMM_ROOT_DIR` environment variable."
 
     # test without transform and tokenizer
-    split = "train"
+    split = "test_clean_sep"
     transform = None
     tokenizer = None
     dataset = PMC2MSum(root_dir, split, transform, tokenizer)
     sample = dataset[0]
+    print(f"sample: {sample}")
     assert isinstance(
         sample[Modalities.TEXT.name], str
     ), f"Expected to find `str` in `Modalities.TEXT` but found {type(sample[Modalities.TEXT.name])}"
@@ -194,6 +195,7 @@ def test_pmc2m_sum():
     )
     dataset = PMC2MSum(root_dir, split, transform, tokenizer)
     sample = dataset[0]
+    print(f"sample: {sample}")
     assert isinstance(
         sample[Modalities.TEXT.name], torch.Tensor
     ), f"Expected to find `Tensor` in `Modalities.TEXT` but found {type(sample[Modalities.TEXT.name])}"
@@ -213,7 +215,7 @@ def test_pmc2m_sum_2():
     ), "Please set PMC2M-Sum root directory in `PMC2M_SUMM_ROOT_DIR` environment variable."
 
     # test with transform and tokenizer and dataloader
-    split = "train_clean"
+    split = "test_clean"
     batch_size = 64
     transform = biomedclip_vision_transform(image_crop_size=224, job_type="train")
     tokenizer = HFTokenizer(
@@ -250,5 +252,6 @@ def test_pmc2m_sum_2():
 
 
 if __name__ == "__main__":
+    test_pmc2m_sum()
     test_pmc2m_sum_2()
     print("Passed")
diff --git a/openpmcvl/experiment/working/clean_pmc2m_sum.py b/openpmcvl/experiment/working/clean_pmc2m_sum.py
@@ -21,7 +21,7 @@ def clean_pmc2m_sum(
         Dataset split.
     """
     # load entries
-    data_path = os.path.join(root_dir, f"{split}.jsonl")
+    data_path = os.path.join(root_dir, "clean", f"{split}.jsonl")
     with open(data_path, encoding="utf-8") as file:
         entries = [json.loads(line) for line in file.readlines()]
 
@@ -44,15 +44,54 @@ def clean_pmc2m_sum(
     print(f"{len(entries) - len(clean_entries)} entries removed due to non-existent caption and intext reference.")
 
     # write clean entries
-    filename = os.path.join(root_dir, f"{split}_clean.jsonl")
+    filename = os.path.join(root_dir, "clean", f"{split}_clean.jsonl")
     with open(filename, "w") as outfile:
         for entry in clean_entries:
             json.dump(entry, outfile)
             outfile.write("\n")
     print(f"Saved {len(clean_entries)} entries in {filename}")
 
 
+def separate_captions(root_dir: str, split: str = "train") -> None:
+    """Write each entry's caption to its own text file and index the paths.
+
+    Reads ``<root_dir>/clean/<split>.jsonl``, joins every entry's caption and
+    in-text reference summary, stores the text under ``<root_dir>/captions``
+    and writes ``<root_dir>/clean/<split>_sep.jsonl`` holding only the image
+    and caption paths, so a dataset can read caption text on demand.
+
+    Parameters
+    ----------
+    root_dir : str
+        Path to the root folder containing the ``clean`` jsonl files.
+    split : str, default="train"
+        Name of the jsonl file (without extension), e.g. ``"train_clean"``.
+    """
+    # load entries
+    data_path = os.path.join(root_dir, "clean", f"{split}.jsonl")
+    with open(data_path, encoding="utf-8") as file:
+        entries = [json.loads(line) for line in file]
+
+    # separate captions; only the extension is swapped so that "jpg" inside a
+    # name stays intact and non-jpg images still get a ".txt" caption file
+    sep_entries = []
+    for entry in entries:
+        caption = " ".join([entry["caption"], entry["intext_refs_summary"]])
+        txt_path = os.path.join(root_dir, "captions", os.path.splitext(entry["image"])[0] + ".txt")
+        os.makedirs(os.path.dirname(txt_path), exist_ok=True)
+        with open(txt_path, "w", encoding="utf-8") as outfile:  # utf-8 matches the reader
+            outfile.write(caption)
+        sep_entries.append({"image_fullpath": entry["image_fullpath"], "caption_fullpath": txt_path})
+
+    # write sep entries
+    filename = os.path.join(root_dir, "clean", f"{split}_sep.jsonl")
+    with open(filename, "w", encoding="utf-8") as outfile:
+        outfile.writelines(json.dumps(entry) + "\n" for entry in sep_entries)
+    print(f"Saved {len(sep_entries)} entries in {filename}")
+
+
 if __name__ == "__main__":
     root_dir = os.getenv("PMC2M_SUMM_ROOT_DIR", "")
     split = "train"
-    clean_pmc2m_sum(root_dir, split)
+    # clean_pmc2m_sum(root_dir, split)
+    separate_captions(root_dir, f"{split}_clean")