vturrisi · DonkeyShot21 · Jul 14, 2022 · Jul 8, 2022 · Jul 8, 2022 · Jul 8, 2022
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ The library is self-contained, but it is possible to use the models outside of s
 
 ## News
 * **[Jun 26 2022]**: :fire: Added [MoCo V3](https://arxiv.org/abs/2104.02057).
-* **[Jun 10 2022]**: :bomb: Improved LARS and fixed some issues to support [Horovod](https://horovod.readthedocs.io/en/stable/pytorch.html).
+* **[Jun 10 2022]**: :bomb: Improved LARS.
 * **[Jun 09 2022]**: :lollipop: Added support for [WideResnet](https://arxiv.org/abs/1605.07146), multicrop for SwAV and equalization data augmentation.
 * **[May 02 2022]**: :diamond_shape_with_a_dot_inside: Wrapped Dali with a DataModule, added auto resume for linear eval and Wandb run resume.
 * **[Apr 12 2022]**: :rainbow: Improved design of models and added support to train with a fraction of data.

diff --git a/bash_files/pretrain/imagenet-100/mocov3_vit.sh b/bash_files/pretrain/imagenet-100/mocov3_vit.sh
@@ -0,0 +1,43 @@
+python3 main_pretrain.py \
+    --dataset imagenet100 \
+    --backbone vit_small \
+    --data_dir /datasets \
+    --train_dir imagenet-100/train \
+    --val_dir imagenet-100/val \
+    --max_epochs 400 \
+    --warmup_epochs 40 \
+    --devices 0,1,2,3,4,5,6,7 \
+    --accelerator gpu \
+    --strategy ddp \
+    --sync_batchnorm \
+    --precision 16 \
+    --optimizer adamw \
+    --eta_lars 0.02 \
+    --exclude_bias_n_norm \
+    --scheduler warmup_cosine \
+    --lr 2.0e-4  \
+    --classifier_lr 3.0e-4 \
+    --weight_decay 0.1 \
+    --batch_size 64 \
+    --num_workers 4 \
+    --dali \
+    --brightness 0.4 \
+    --contrast 0.4 \
+    --saturation 0.2 \
+    --hue 0.1 \
+    --gaussian_prob 1.0 0.1 \
+    --solarization_prob 0.0 0.2 \
+    --min_scale 0.08 \
+    --num_crops_per_aug 1 1 \
+    --name mocov3-vit-400ep-imagenet100 \
+    --project solo-learn \
+    --entity unitn-mhug \
+    --save_checkpoint \
+    --wandb \
+    --auto_resume \
+    --method mocov3 \
+    --proj_hidden_dim 4096 \
+    --pred_hidden_dim 4096 \
+    --temperature 0.2 \
+    --base_tau_momentum 0.99 \
+    --final_tau_momentum 1.0
diff --git a/bash_files/pretrain/imagenet-100/mocov3_vit_h5.sh b/bash_files/pretrain/imagenet-100/mocov3_vit_h5.sh
@@ -0,0 +1,42 @@
+python3 main_pretrain.py \
+    --dataset imagenet100 \
+    --backbone vit_small \
+    --data_dir $1 \
+    --train_h5_path train.h5 \
+    --val_h5_path val.h5 \
+    --max_epochs 400 \
+    --warmup_epochs 40 \
+    --devices 0,1,2,3,4,5,6,7 \
+    --accelerator gpu \
+    --strategy ddp \
+    --sync_batchnorm \
+    --precision 16 \
+    --optimizer adamw \
+    --eta_lars 0.02 \
+    --exclude_bias_n_norm \
+    --scheduler warmup_cosine \
+    --lr 3.0e-4  \
+    --classifier_lr 3.0e-4 \
+    --weight_decay 0.1 \
+    --batch_size 64 \
+    --num_workers 8 \
+    --brightness 0.4 \
+    --contrast 0.4 \
+    --saturation 0.2 \
+    --hue 0.1 \
+    --gaussian_prob 1.0 0.1 \
+    --solarization_prob 0.0 0.2 \
+    --min_scale 0.08 \
+    --num_crops_per_aug 1 1 \
+    --name mocov3-vit-400ep-imagenet100 \
+    --project solo-learn \
+    --entity unitn-mhug \
+    --save_checkpoint \
+    --wandb \
+    --auto_resume \
+    --method mocov3 \
+    --proj_hidden_dim 4096 \
+    --pred_hidden_dim 4096 \
+    --temperature 0.2 \
+    --base_tau_momentum 0.99 \
+    --final_tau_momentum 1.0
diff --git a/main_knn.py b/main_knn.py
@@ -1,4 +1,4 @@
-# Copyright 2021 solo-learn development team.
+# Copyright 2022 solo-learn development team.
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in

diff --git a/main_linear.py b/main_linear.py
@@ -1,4 +1,4 @@
-# Copyright 2021 solo-learn development team.
+# Copyright 2022 solo-learn development team.
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in
@@ -95,10 +95,13 @@ def main():
         data_dir=args.data_dir,
         train_dir=args.train_dir,
         val_dir=args.val_dir,
+        train_h5_path=args.train_h5_path,
+        val_h5_path=args.val_h5_path,
         batch_size=args.batch_size,
         num_workers=args.num_workers,
         data_fraction=args.data_fraction,
     )
+
     if args.dali:
         assert (
             _dali_avaliable

diff --git a/main_pretrain.py b/main_pretrain.py
@@ -1,4 +1,4 @@
-# Copyright 2021 solo-learn development team.
+# Copyright 2022 solo-learn development team.
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in
@@ -71,14 +71,18 @@ def main():
     # validation dataloader for when it is available
     if args.dataset == "custom" and (args.no_labels or args.val_dir is None):
         val_loader = None
-    elif args.dataset in ["imagenet100", "imagenet"] and args.val_dir is None:
+    elif args.dataset in ["imagenet100", "imagenet"] and (
+        args.val_dir is None and args.val_h5_path is None
+    ):
         val_loader = None
     else:
         _, val_loader = prepare_data_classification(
             args.dataset,
             data_dir=args.data_dir,
             train_dir=args.train_dir,
             val_dir=args.val_dir,
+            train_h5_path=args.train_h5_path,
+            val_h5_path=args.val_h5_path,
             batch_size=args.batch_size,
             num_workers=args.num_workers,
         )
@@ -123,6 +127,7 @@ def main():
             data_dir=args.data_dir,
             train_dir=args.train_dir,
             no_labels=args.no_labels,
+            train_h5_path=args.train_h5_path,
             data_fraction=args.data_fraction,
         )
         train_loader = prepare_dataloader(

diff --git a/main_umap.py b/main_umap.py
@@ -1,4 +1,4 @@
-# Copyright 2021 solo-learn development team.
+# Copyright 2022 solo-learn development team.
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in

diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,4 @@ wandb
 scipy
 timm
 scikit-learn
+h5py
diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-# Copyright 2021 solo-learn development team.
+# Copyright 2022 solo-learn development team.
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in
@@ -55,6 +55,7 @@ def parse_requirements(path):
         "scipy",
         "timm",
         "scikit-learn",
+        "h5py",
     ],
     extras_require=EXTRA_REQUIREMENTS,
     dependency_links=["https://developer.download.nvidia.com/compute/redist"],

diff --git a/solo/args/dataset.py b/solo/args/dataset.py
@@ -44,6 +44,10 @@ def dataset_args(parser: ArgumentParser):
     parser.add_argument("--train_dir", type=Path, default=None)
     parser.add_argument("--val_dir", type=Path, default=None)
 
+    # h5 files
+    parser.add_argument("--train_h5_path", type=str, default=None)
+    parser.add_argument("--val_h5_path", type=str, default=None)
+
     # percentage of data used from training, leave -1.0 to use all data available
     parser.add_argument("--data_fraction", default=-1.0, type=float)
 

diff --git a/solo/args/utils.py b/solo/args/utils.py
@@ -239,18 +239,8 @@ def additional_setup_pretrain(args: Namespace):
         args.devices = [int(device) for device in args.devices.split(",") if device]
 
     # adjust lr according to batch size
-    if args.strategy == "horovod":
-        warnings.warn(
-            "When using horovod, be aware of how the processes are divided. "
-            "The learning rate will only be scaled considering the number of "
-            "devices in each process. "
-            "If each gpu corresponds to each process, you should pass --num_nodes_horovod "
-            "N_GPUS to properly scale the lr. "
-            "You can also manually scale your lr if you are not sure, by checking your logs."
-        )
-
     try:
-        num_nodes = args.num_nodes_horovod or args.num_nodes or 1
+        num_nodes = args.num_nodes or 1
     except AttributeError:
         num_nodes = 1
 
@@ -315,18 +305,8 @@ def additional_setup_linear(args: Namespace):
         args.devices = [int(device) for device in args.devices.split(",") if device]
 
     # adjust lr according to batch size
-    if args.strategy == "horovod":
-        warnings.warn(
-            "When using horovod, be aware of how the processes are divided. "
-            "The learning rate will only be scaled considering the number of "
-            "devices in each process. "
-            "If each gpu corresponds to each process, you should pass --num_nodes_horovod "
-            "N_GPUS to properly scale the lr. "
-            "You can also manually scale your lr if you are not sure, by checking your logs."
-        )
-
     try:
-        num_nodes = args.num_nodes_horovod or args.num_nodes or 1
+        num_nodes = args.num_nodes or 1
     except AttributeError:
         num_nodes = 1
 

diff --git a/solo/backbones/vit/vit_mocov3.py b/solo/backbones/vit/vit_mocov3.py
@@ -73,7 +73,7 @@ def build_2d_sincos_position_embedding(self, temperature=10000.0):
             [torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1
         )[None, :, :]
 
-        assert self.num_tokens == 1, "Assuming one and only one token, [cls]"
+        assert self.num_prefix_tokens == 1, "Assuming one and only one token, [cls]"
         pe_token = torch.zeros([1, 1, self.embed_dim], dtype=torch.float32)
         self.pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1))
         self.pos_embed.requires_grad = False

diff --git a/solo/methods/base.py b/solo/methods/base.py
@@ -336,14 +336,6 @@ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
         # disables channel last optimization
         parser.add_argument("--no_channel_last", action="store_true")
 
-        # When using horovod, be aware of how the processes are divided.
-        # The learning rate will only be scaled considering the number of
-        # devices in each process.
-        # If each gpu corresponds to each process, you should pass --num_nodes_horovod
-        # N_GPUS to properly scale the lr.
-        # You can also manually scale your lr if you are not sure, by checking your logs.
-        parser.add_argument("--num_nodes_horovod", default=None, type=int)
-
         return parent_parser
 
     @property
@@ -356,9 +348,13 @@ def num_training_steps(self) -> int:
                 if dataset not in ["cifar10", "cifar100", "stl10"]:
                     data_dir = self.extra_args.get("data_dir", ".")
                     train_dir = self.extra_args.get("train_dir", "train")
-                    folder = os.path.join(data_dir, train_dir)
+                    folder = os.path.join(data_dir, str(train_dir))
+                    h5py_file = self.extra_args.get("train_h5_path", None)
+                    h5py_file = os.path.join(data_dir, h5py_file)
                 else:
                     folder = None
+                    h5py_file = None
+
                 no_labels = self.extra_args.get("no_labels", False)
                 data_fraction = self.extra_args.get("data_fraction", -1.0)
 
@@ -367,6 +363,7 @@ def num_training_steps(self) -> int:
                     folder=folder,
                     train=True,
                     no_labels=no_labels,
+                    h5py_file=h5py_file,
                     data_fraction=data_fraction,
                 )
             except:
@@ -378,7 +375,7 @@ def num_training_steps(self) -> int:
             dataset_size = self.trainer.limit_train_batches * dataset_size
 
             num_devices = self.trainer.num_devices
-            num_nodes = self.extra_args.get("num_nodes_horovod", 0) or self.trainer.num_nodes or 1
+            num_nodes = self.trainer.num_nodes or 1
             effective_batch_size = (
                 self.batch_size * self.trainer.accumulate_grad_batches * num_devices * num_nodes
             )

diff --git a/solo/methods/linear.py b/solo/methods/linear.py
@@ -183,9 +183,13 @@ def num_training_steps(self) -> int:
                 if dataset not in ["cifar10", "cifar100", "stl10"]:
                     data_dir = self.extra_args.get("data_dir", ".")
                     train_dir = self.extra_args.get("train_dir", "train")
-                    folder = os.path.join(data_dir, train_dir)
+                    folder = os.path.join(data_dir, str(train_dir))
+                    h5py_file = self.extra_args.get("train_h5_path", None)
+                    h5py_file = os.path.join(data_dir, h5py_file)
                 else:
                     folder = None
+                    h5py_file = None
+
                 no_labels = self.extra_args.get("no_labels", False)
                 data_fraction = self.extra_args.get("data_fraction", -1.0)
 
@@ -194,6 +198,7 @@ def num_training_steps(self) -> int:
                     folder=folder,
                     train=True,
                     no_labels=no_labels,
+                    h5py_file=h5py_file,
                     data_fraction=data_fraction,
                 )
             except:
@@ -205,7 +210,7 @@ def num_training_steps(self) -> int:
             dataset_size = self.trainer.limit_train_batches * dataset_size
 
             num_devices = self.trainer.num_devices
-            num_nodes = self.extra_args.get("num_nodes_horovod", 0) or self.trainer.num_nodes or 1
+            num_nodes = self.trainer.num_nodes or 1
             effective_batch_size = (
                 self.batch_size * self.trainer.accumulate_grad_batches * num_devices * num_nodes
             )