Using new parquet in train #104

Draft · wants to merge 27 commits into base: main

Changes from 5 commits

Commits (27)
523014f
removed process_parquet function to utils
Sep 10, 2024
5c74855
added augment parameter
Sep 10, 2024
cb197a3
added function for timeseries subsetting, so that it is centered arou…
Sep 10, 2024
9e9ac9d
added augment parameter; replaced default link to new parquet file; a…
Sep 10, 2024
b3e0284
major rework of process_parquet function; minimal viable functionality
Sep 10, 2024
1d37536
moved MIN_EDGE_BUFFER parameter from utils to dataset.py
Sep 11, 2024
ae27e25
added logger message about enabled augmentation
Sep 11, 2024
8d7e4c1
removed augment=False parameter from evaluate function, since it is a…
Sep 11, 2024
b2f1aa3
rephrased checking if valid_date is too close to the edge without mes…
Sep 11, 2024
ff25509
bugs and typos fixes
Sep 11, 2024
e338c09
moved NODATA and MIN_EDGE parameters to dataops.py to avoid circular …
Sep 11, 2024
c9ffa01
updated test dataset to use new long parquet format
Sep 11, 2024
f6e1a9e
updated tests
Sep 11, 2024
407cec9
created separate test file for process_parquet function
Sep 11, 2024
ad05b3d
an attempt to make time_token shift more general than just for months
Sep 11, 2024
2945e9d
merging changes from main
Sep 20, 2024
ef06f94
black fix
Sep 20, 2024
30ab19c
adding test long parquet file
Sep 23, 2024
96dbc0d
fixed test file path
Sep 23, 2024
55dbbbe
isort fix
Sep 23, 2024
a7eedd8
fixed test and commented lines that will not be needed after merge
Sep 23, 2024
6f7646f
Formatting
kvantricht Sep 23, 2024
203c4ac
Formatting
kvantricht Sep 23, 2024
465d65a
making GT values binary crop/nocrop
Sep 23, 2024
54cc2be
Test with 1 epoch finetuning
kvantricht Sep 23, 2024
e44445a
Merge branch 'using-new-parquet-in-train' of github.com:WorldCereal/p…
kvantricht Sep 23, 2024
1a957a2
Bump einops version
kvantricht Sep 23, 2024
38 changes: 33 additions & 5 deletions paper_eval.py
Contributor:
@cbutsko in the croptype branch this file is removed. What does that mean for the changes here?

Author:
I merged the functionality of paper_eval into train.py. I think in this branch it doesn't really matter. I just wanted to align this branch with main and then merge into croptype, resolving these conflicts there. Also, after discussing self-supervised training with Giorgia, it now seems more feasible to create two separate files with clearer functions: something like train_self_supervised.py and train_finetuned.py.

Contributor:
That sounds like a good plan!

@@ -1,28 +1,32 @@
# presto_pretrain_finetune, but in a notebook
import argparse
import gc
import json
import logging
from glob import glob
from pathlib import Path
from typing import Optional, cast

import pandas as pd
import torch
import xarray as xr

from presto.dataset import WorldCerealBase
from presto.eval import WorldCerealEval
from presto.presto import Presto
from presto.utils import (
DEFAULT_SEED,
NODATAVALUE,
config_dir,
data_dir,
default_model_path,
device,
initialize_logging,
plot_spatial,
process_parquet,
seed_everything,
timestamp_dirname,
)
from tqdm.auto import tqdm

logger = logging.getLogger("__main__")

@@ -41,12 +45,20 @@
argparser.add_argument("--num_workers", type=int, default=4)
argparser.add_argument("--wandb", dest="wandb", action="store_true")
argparser.add_argument("--wandb_org", type=str, default="nasa-harvest")
argparser.add_argument("--parquet_file", type=str, default="rawts-monthly_calval.parquet")
# argparser.add_argument("--parquet_file", type=str, default="rawts-monthly_calval.parquet")
argparser.add_argument(
"--parquet_file",
type=str,
default="/vitodata/worldcereal/features/preprocessedinputs-monthly-nointerp/\
worldcereal_training_data.parquet",
)
argparser.add_argument("--val_samples_file", type=str, default="cropland_test_split_samples.csv")
argparser.add_argument("--train_only_samples_file", type=str, default="train_only_samples.csv")
argparser.add_argument("--warm_start", dest="warm_start", action="store_true")
argparser.add_argument("--augment", dest="augment", action="store_true")
argparser.set_defaults(wandb=False)
argparser.set_defaults(warm_start=True)
argparser.set_defaults(augment=False)
args = argparser.parse_args().__dict__

model_name = args["model_name"]
@@ -79,6 +91,7 @@
parquet_file: str = args["parquet_file"]
val_samples_file: str = args["val_samples_file"]
train_only_samples_file: str = args["train_only_samples_file"]
augment: bool = args["augment"]

dekadal = False
if "10d" in parquet_file:
@@ -89,7 +102,21 @@

logger.info("Setting up dataloaders")

df = pd.read_parquet(data_dir / parquet_file)

logger.info("Reading dataset")
files = sorted(glob(f"{parquet_file}/**/*.parquet"))[:10]
df_list = []
for f in tqdm(files):
_data = pd.read_parquet(f, engine="fastparquet")
_data_pivot = process_parquet(_data)
_data_pivot.reset_index(inplace=True)
df_list.append(_data_pivot)
del _data, _data_pivot
gc.collect()
df = pd.concat(df_list)
df = df.fillna(NODATAVALUE)
del df_list
gc.collect()

logger.info("Setting up model")
if warm_start:
@@ -104,13 +131,14 @@
best_model_path = None
model.to(device)

model_modes = ["Random Forest", "Regression", "CatBoostClassifier"]
# model_modes = ["Random Forest", "Regression", "CatBoostClassifier"]
model_modes = ["CatBoostClassifier"]

# 1. Using the provided split
val_samples_df = pd.read_csv(data_dir / val_samples_file)
train_df, test_df = WorldCerealBase.split_df(df, val_sample_ids=val_samples_df.sample_id.tolist())
full_eval = WorldCerealEval(
train_df, test_df, spatial_inference_savedir=model_logging_dir, dekadal=dekadal
train_df, test_df, spatial_inference_savedir=model_logging_dir, dekadal=dekadal, augment=False
)
results, finetuned_model = full_eval.finetuning_results(model, sklearn_model_modes=model_modes)
logger.info(json.dumps(results, indent=2))
68 changes: 62 additions & 6 deletions presto/dataset.py
@@ -23,7 +23,7 @@
DynamicWorld2020_2021,
)
from .masking import BAND_EXPANSION, MaskedExample, MaskParamsNoDw
from .utils import DEFAULT_SEED, data_dir, load_world_df
from .utils import DEFAULT_SEED, MIN_EDGE_BUFFER, data_dir, load_world_df

logger = logging.getLogger("__main__")

@@ -65,23 +65,73 @@ def target_crop(row_d: Dict) -> int:
# by default, we predict crop vs non crop
return int(row_d["LANDCOVER_LABEL"] == 11)

@classmethod
def get_timestep_positions(cls, row_d: Dict, augment: bool = False) -> List[int]:
available_timesteps = int(row_d["available_timesteps"])
valid_position = int(row_d["valid_position"])
Collaborator:
Could you add a comment describing what valid_position represents?

Collaborator (@gabrieltseng, Sep 27, 2024):
I see it's created in utils.py but it's still not obvious to me what it represents - could you add a comment there instead?


# force moving the center point if it is too close to the edges
if (valid_position < cls.NUM_TIMESTEPS // 2) or (
valid_position > (available_timesteps - cls.NUM_TIMESTEPS // 2)
):
augment = True
Contributor:
In this case we silently fall into augmentation, which might not be what we want. Can't we put this logic inside if not augment and, in that case, not choose valid_position as the center_point but rather the point that keeps valid_position as close as possible to the center? Then it's always deterministic and we don't have to force going through the augmentation part.

Author:
good point. tried to rewrite it here b2f1aa3
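For illustration, a rough numeric check of the reworked clamping logic shown in the diff below; the constants here are hypothetical, not values taken from the repo:

```python
# hypothetical constants: a 12-step window, keeping the valid timestep
# at least 2 steps away from the window edges
NUM_TIMESTEPS, MIN_EDGE_BUFFER = 12, 2
available_timesteps, valid_position = 20, 3  # valid_position too close to the left edge

min_center = max(NUM_TIMESTEPS // 2,
                 valid_position + MIN_EDGE_BUFFER - NUM_TIMESTEPS // 2)  # -> 6
max_center = min(available_timesteps - NUM_TIMESTEPS // 2,
                 valid_position - MIN_EDGE_BUFFER + NUM_TIMESTEPS // 2)  # -> 7

for center_point in range(min_center, max_center + 1):
    last = min(available_timesteps, center_point + NUM_TIMESTEPS // 2)
    first = max(0, last - NUM_TIMESTEPS)
    positions = list(range(first, last))
    # every allowed center yields a full-length window that contains
    # valid_position with at least MIN_EDGE_BUFFER steps of margin
    assert len(positions) == NUM_TIMESTEPS
    assert valid_position in positions
    assert min(valid_position - first, last - 1 - valid_position) >= MIN_EDGE_BUFFER
```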


if not augment:
Collaborator:
Could you also make a note that augment here means temporal jittering? Am I right in understanding MIN_EDGE_BUFFER is just used to determine how much temporal jittering is allowed? In this case maybe it makes sense as a class attribute (or even as a default argument to this function)

# Center the timesteps around the valid position
center_point = valid_position
else:
# Shift the center point but make sure the resulting range
# well includes the valid position

min_center_point = max(
cls.NUM_TIMESTEPS // 2, valid_position + MIN_EDGE_BUFFER - cls.NUM_TIMESTEPS // 2
)
max_center_point = min(
available_timesteps - cls.NUM_TIMESTEPS // 2,
valid_position - MIN_EDGE_BUFFER + cls.NUM_TIMESTEPS // 2,
)

center_point = np.random.randint(
min_center_point, max_center_point + 1
) # max_center_point included

last_timestep = min(available_timesteps, center_point + cls.NUM_TIMESTEPS // 2)
first_timestep = max(0, last_timestep - cls.NUM_TIMESTEPS)
timestep_positions = list(range(first_timestep, last_timestep))

if len(timestep_positions) != cls.NUM_TIMESTEPS:
raise ValueError(
f"Acquired timestep positions do not have correct length: \
required {cls.NUM_TIMESTEPS}, got {len(timestep_positions)}"
)
assert (
valid_position in timestep_positions
), f"Valid position {valid_position} not in timestep positions {timestep_positions}"
return timestep_positions

@classmethod
def row_to_arrays(
cls, row: pd.Series, target_function: Callable[[Dict], int]
cls, row: pd.Series, target_function: Callable[[Dict], int], augment: bool = False
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, int]:
# https://stackoverflow.com/questions/45783891/is-there-a-way-to-speed-up-the-pandas-getitem-getitem-axis-and-get-label
# This is faster than indexing the series every time!
row_d = pd.Series.to_dict(row)

latlon = np.array([row_d["lat"], row_d["lon"]], dtype=np.float32)
month = datetime.strptime(row_d["start_date"], "%Y-%m-%d").month - 1

timestep_positions = cls.get_timestep_positions(row_d, augment=augment)
# make sure that month for encoding gets shifted according to
# the selected timestep positions
month = (
pd.to_datetime(row_d["start_date"]) + pd.DateOffset(months=timestep_positions[0])
Contributor:
You could also just add timestep_positions[0] and avoid the overhead of the pd.DateOffset method. I don't think we're resilient in any case to situations where we're working with dekadal data. This is a potential risk if someone (including ourselves) goes down the dekadal track. Can we make it more universal here? Or, if needed for now, do a check and raise something, so nobody blindly takes this while it won't work as expected.

Author:
good point. tried to account for that here ad05b3d
we will also need to think about renaming this month thing (since it can be something else too), and also making sure that other relative timestep positions (valid_position and timestep_ind) are computed not as months, but in a more generic fashion

Contributor:
You mean this has to be tackled in process_parquet?

Contributor:
I am not sure I understand what initial_start_date_position actually means and why it cannot just be the month inferred from start_date.

Author (@cbutsko, Sep 11, 2024):
well, maybe I'm overthinking it...

here's an example:

  1. we are working on a monthly basis, with start_date in October (hence, month = 10), and we want to shift it 4 timesteps forward. we can just compute (10 + 4) and add the modulo part (% cls.NUM_TIMESTEPS) so we don't get a bad month value.

  2. we are working on a dekadal basis, with the same start date. I assume that our timestep indices are not in month chunks, but in 10-day intervals, so we have observations for ts0, ts1, ..., ts45 (for example). our NUM_TIMESTEPS variable should be set to 36 instead of 12 (like Giorgia did). the valid_date should also translate into a 10-day chunk instead of a month, so that we can select 36 timesteps around the valid_position. now our timestep_positions are returned in dekadal steps, so adding 4 dekadal steps to a month value doesn't make sense. we need to add apples to apples to get the date that accounts for the shift, and only then take its month and pass it to Presto. I realize now that this particular step is missing in my implementation.

am I missing something?

Contributor:
Following most of it. But still, start_date is a real date, from which we can infer what would normally be the start month being fed to Presto. Why do we need to translate it to a position? I agree that what we have to add to it due to the shift should account for the time "resolution". valid_position should be the translation of valid_date (which is irrespective of time resolution) to the position in the timesteps (which depends on the time resolution). I might be overthinking it just as much.

Author:
okay, maybe something like this can work:

step_converted_to_month = np.ceil(timestep_positions[0] * (365 // NUM_TIMESTEPS) / 30)
month = (pd.to_datetime(start_date).month + step_converted_to_month) % 12 - 1

it can be a little imprecise in some cases, not more than 1 month error.
but it's just a one-liner, so probably we can sacrifice this little bit of precision
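A minimal, self-contained sketch of that idea (the helper name and the exact rounding are assumptions for illustration, not what this PR implements):

```python
import numpy as np
import pandas as pd

def shifted_start_month(start_date: str, first_timestep: int, num_timesteps: int) -> int:
    """Zero-based month of the (shifted) window start.

    Assumes the timestep indices span roughly one year: 12 steps for
    monthly data, 36 for dekadal data, i.e. ~365 / num_timesteps days per step.
    """
    days_per_step = 365 / num_timesteps
    shift_in_months = int(np.floor(first_timestep * days_per_step / 30))
    month0 = pd.to_datetime(start_date).month - 1  # zero-based, as Presto expects
    return (month0 + shift_in_months) % 12

# monthly data, start_date in October, window starts 4 steps later -> February (index 1)
assert shifted_start_month("2020-10-01", 4, 12) == 1
# dekadal data, same start, window starts 9 dekads (~3 months) later -> January (index 0)
assert shifted_start_month("2020-10-01", 9, 36) == 0
```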

Contributor:
yeah let's make a note that Giorgia can test this out properly when doing dekadal runs.

).month - 1

eo_data = np.zeros((cls.NUM_TIMESTEPS, len(BANDS)))
# an assumption we make here is that all timesteps for a token
# have the same masking
mask = np.zeros((cls.NUM_TIMESTEPS, len(BANDS_GROUPS_IDX)))
for df_val, presto_val in cls.BAND_MAPPING.items():
values = np.array([float(row_d[df_val.format(t)]) for t in range(cls.NUM_TIMESTEPS)])
values = np.array([float(row_d[df_val.format(t)]) for t in timestep_positions])
# this occurs for the DEM values in one point in Fiji
values = np.nan_to_num(values, nan=cls._NODATAVALUE)
idx_valid = values != cls._NODATAVALUE
@@ -260,6 +310,7 @@ def __init__(
years_to_remove: Optional[List[int]] = None,
target_function: Optional[Callable[[Dict], int]] = None,
balance: bool = False,
augment: bool = False,
):
dataframe = dataframe.loc[~dataframe.LANDCOVER_LABEL.isin(self.FILTER_LABELS)]

@@ -275,6 +326,7 @@
dataframe = dataframe[(~dataframe.end_date.dt.year.isin(years_to_remove))]
self.target_function = target_function if target_function is not None else self.target_crop
self._class_weights: Optional[np.ndarray] = None
self.augment = augment
Contributor:
should we add logging somewhere that shows that augmentation is enabled when initializing a dataset like that?

Author:
good point. addressed that here ae27e25
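A sketch of what such a log line could look like in the dataset constructor (wording and placement are assumptions; the actual change landed in commit ae27e25):

```python
# inside the dataset's __init__, right after the flag is stored (sketch)
self.augment = augment
if augment:
    logger.info("Augmentation (temporal jittering) is enabled for this dataset")
```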


super().__init__(dataframe)
if balance:
@@ -307,7 +359,9 @@ def __getitem__(self, idx):
# Get the sample
df_index = self.indices[idx]
row = self.df.iloc[df_index, :]
eo, mask_per_token, latlon, month, target = self.row_to_arrays(row, self.target_function)
eo, mask_per_token, latlon, month, target = self.row_to_arrays(
row, self.target_function, self.augment
)
mask_per_variable = np.repeat(mask_per_token, BAND_EXPANSION, axis=1)
return (
self.normalize_and_mask(eo),
@@ -354,7 +408,9 @@ def __getitem__(self, idx):
# Get the sample
df_index = self.indices[idx]
row = self.df.iloc[df_index, :]
eo, mask_per_token, latlon, _, target = self.row_to_arrays(row, self.target_function)
eo, mask_per_token, latlon, _, target = self.row_to_arrays(
row, self.target_function, self.augment
)
mask_per_variable = np.repeat(mask_per_token, BAND_EXPANSION, axis=1)
return (
self.normalize_and_mask(eo),
9 changes: 8 additions & 1 deletion presto/eval.py
@@ -65,6 +65,7 @@ def __init__(
name: Optional[str] = None,
val_size: float = 0.2,
dekadal: bool = False,
augment: bool = False,
):
self.seed = seed

@@ -90,6 +91,8 @@
self.dekadal = dekadal
self.ds_class = WorldCerealLabelled10DDataset if dekadal else WorldCerealLabelledDataset

self.augment = augment

def _construct_finetuning_model(self, pretrained_model: Presto) -> PrestoFineTuningModel:
model: PrestoFineTuningModel = cast(Callable, pretrained_model.construct_finetuning_model)(
num_outputs=self.num_outputs
@@ -278,7 +281,7 @@ def evaluate(
pretrained_model: Optional[PrestoFineTuningModel] = None,
) -> Dict:

test_ds = self.ds_class(self.test_df, target_function=self.target_function)
test_ds = self.ds_class(self.test_df, target_function=self.target_function, augment=False)
dl = DataLoader(
test_ds,
batch_size=512,
@@ -387,6 +390,7 @@ def finetune(self, pretrained_model) -> PrestoFineTuningModel:
years_to_remove=self.years_to_remove,
target_function=self.target_function,
balance=True,
augment=self.augment,
)

# should the val set be balanced too?
@@ -395,6 +399,7 @@
countries_to_remove=self.countries_to_remove,
years_to_remove=self.years_to_remove,
target_function=self.target_function,
augment=False, # don't augment the validation set
)

loss_fn = nn.BCEWithLogitsLoss()
@@ -511,6 +516,7 @@ def finetuning_results_sklearn(
countries_to_remove=self.countries_to_remove,
years_to_remove=self.years_to_remove,
target_function=self.target_function,
augment=self.augment,
),
batch_size=2048,
shuffle=False,
@@ -522,6 +528,7 @@
countries_to_remove=self.countries_to_remove,
years_to_remove=self.years_to_remove,
target_function=self.target_function,
augment=False, # don't augment the validation set
),
batch_size=2048,
shuffle=False,