remove unnecessary print statements

liellnima · liellnima · commit 21df1baed9fe · 2025-06-02T16:21:34.000+02:00
diff --git a/emulator/src/data/climate_dataset.py b/emulator/src/data/climate_dataset.py
@@ -1,25 +1,18 @@
-import copy
-import logging
 import os
 import glob
-import pickle
-import shutil
 import zipfile
-from typing import Dict, Optional, List, Callable, Tuple, Union
+from typing import Dict, Optional, List, Tuple, Union
 
 import numpy as np
 import xarray as xr
 import torch
-from torch import Tensor
 
 from emulator.src.utils.utils import get_logger, map_variables_targetmip
 from emulator.src.data.constants import (
     LAT,
     LON,
     SEQ_LEN,
-    INPUT4MIPS_TEMP_RES,
     CMIP6_TEMP_RES,
-    INPUT4MIPS_NOM_RES,
     CMIP6_NOM_RES,
     DATA_DIR,
     OPENBURNING_MODEL_MAPPING,
@@ -130,9 +123,9 @@ def __init__(
             seq_len=seq_len,
         )
         # creates on cmip and on input4mip dataset
-        print("Creating input4mips...")
+        #print("Creating input4mips...")
         self.input4mips_ds = Input4MipsDataset(variables=in_variables_im, **ds_kwargs)
-        print("Creating cmip6...")
+        #print("Creating cmip6...")
         self.cmip6_ds = CMIP6Dataset(
             climate_model=climate_model,
             num_ensembles=num_ensembles,
@@ -151,7 +144,7 @@ def load_into_mem(
     ):  # -> np.ndarray():
         array_list = []
         for vlist in paths:
-            print("Number of files per var:", len(vlist))
+            #print("Number of files per var:", len(vlist))
             temp_data = xr.open_mfdataset(
                 vlist, concat_dim="time", combine="nested"
             ).compute()  # .compute is not necessary but eh, doesn't hurt
@@ -162,9 +155,9 @@ def load_into_mem(
         temp_data = np.concatenate(array_list, axis=0)
 
         if seq_len != SEQ_LEN:
-            print(
-                "Choosing a sequence length greater or lesser than the data sequence length."
-            )
+            # print(
+            #     "Choosing a sequence length greater or lesser than the data sequence length."
+            # )
             new_num_years = int(
                 np.floor(temp_data.shape[1] / seq_len / len(self.scenarios))
             )
@@ -272,10 +265,7 @@ def get_years_list(self, years: str, give_list: Optional[bool] = False):
 
         """
         if len(years) != 9:
-            log.warn(
-                "Years string must be in the format xxxx-yyyy eg. 2015-2100 with string length 9. Please check the year string."
-            )
-            raise ValueError
+            raise ValueError("Years string must be in the format xxxx-yyyy eg. 2015-2100 with string length 9. Please check the year string.")
         splits = years.split("-")
         min_year, max_year = int(splits[0]), int(splits[1])
 
@@ -292,9 +282,9 @@ def get_dataset_statistics(self, data, mode, type="z-norm", mips="cmip6"):
                 min_val, max_val = self.get_min_max(data)
                 return min_val, max_val
             else:
-                print("Normalizing of type {0} has not been implemented!".format(type))
+                raise NotImplementedError(f"Normalizing of type {type} has not been implemented!")
         else:
-            print("In testing mode, skipping statistics calculations.")
+            log.warning("In testing mode, skipping statistics calculations.")
 
     def get_mean_std(self, data):
         # data shape (years*scenarios, seq, vars, lat, lon)
@@ -330,7 +320,7 @@ def normalize_data(self, data, stats, type="z-norm"):
         # z-norm: (data-mean)/(std + eps); eps=1e-9
         # min-max = (v - v.min()) / (v.max() - v.min())
 
-        print("Normalizing data...")
+        #print("Normalizing data...")
         if self.channels_last:
             data = np.moveaxis(
                 data, -1, 0
@@ -340,7 +330,7 @@ def normalize_data(self, data, stats, type="z-norm"):
                 data, 2, 0
             )  # shape (years, seq_len, num_vars, lat, lon) -> (num_vars, years, seq_len, lat, lon)
 
-        print("mean", stats["mean"].shape, "std", stats["std"].shape)
+        #print("mean", stats["mean"].shape, "std", stats["std"].shape)
         norm_data = (data - stats["mean"]) / (stats["std"])
 
         if self.channels_last:
@@ -395,9 +385,9 @@ def __str__(self):
         return s
 
     def __len__(self):
-        print(
-            "Input4mips", self.input4mips_ds.length, "CMIP6 data", self.cmip6_ds.length
-        )
+        # print(
+        #     "Input4mips", self.input4mips_ds.length, "CMIP6 data", self.cmip6_ds.length
+        # )
         # cmip must be num_ensemble members times input4mips
         assert (
             self.input4mips_ds.length * self.num_ensembles == self.cmip6_ds.length
@@ -453,18 +443,15 @@ def __init__(  # inherits all the stuff from Base
         if isinstance(climate_model, str):
             self.root_dir = os.path.join(self.root_dir, climate_model)
         else:
-            log.warn(
-                "For loading multiple climate models, please make sure to use the Super Climate Dataset Class."
-            )
-            raise NotImplementedError
+            raise NotImplementedError("For loading multiple climate models, please make sure to use the Super Climate Dataset Class.")
 
         if num_ensembles == 1:
             ensembles = os.listdir(self.root_dir)
             self.ensemble_dir = [
                 os.path.join(self.root_dir, ensembles[0])
             ]  # Taking first ensemble member
         else:
-            print("Multiple ensembles", num_ensembles)
+            #print("Multiple ensembles", num_ensembles)
             self.ensemble_dir = []
             ensembles = os.listdir(self.root_dir)
             for i, folder in enumerate(ensembles):
@@ -484,7 +471,7 @@ def __init__(  # inherits all the stuff from Base
             os.path.join(output_save_dir, fname)
         ):  # we first need to get the name here to test that...
             self.data_path = os.path.join(output_save_dir, fname)
-            print("path exists, reloading")
+            #print("path exists, reloading")
             self.Data = self._reload_data(self.data_path)
 
             # Load stats and normalize
@@ -516,13 +503,7 @@ def __init__(  # inherits all the stuff from Base
                             )
                             files = glob.glob(var_dir + f"/*.nc", recursive=True)
                             if len(files) == 0:
-                                print(
-                                    "No files for this scenario, year, ensemble member pairing:",
-                                    exp,
-                                    y,
-                                    em,
-                                )
-                                exit(0)
+                                raise FileNotFoundError(f"No files could be found for scenario {exp}, year {y}, and ensemble member {em}. Check if climate model runs for that pairing actually exist.")
                             # loads all years!
                             output_nc_files += files
                 files_per_var.append(output_nc_files)
@@ -540,7 +521,6 @@ def __init__(  # inherits all the stuff from Base
                 )
 
                 if os.path.isfile(stats_fname):
-                    print("Stats file already exists! Loading from memory.")
                     stats = self.load_statistics_data(stats_fname)
                     self.norm_data = self.normalize_data(self.raw_data, stats)
 
@@ -552,7 +532,7 @@ def __init__(  # inherits all the stuff from Base
                     self.norm_data = self.normalize_data(self.raw_data, stats)
 
                     save_file_name = self.write_dataset_statistics(stats_fname, stats)
-                    print("WROTE STATISTICS", save_file_name)
+                    #print("WROTE STATISTICS", save_file_name)
 
                 self.norm_data = self.normalize_data(self.raw_data, stats)
 
@@ -630,14 +610,14 @@ def __init__(
             os.path.join(output_save_dir, fname)
         ):  # we first need to get the name here to test that...
             self.data_path = os.path.join(output_save_dir, fname)
-            print("path exists, reloading")
+            #print("path exists, reloading")
             self.Data = self._reload_data(self.data_path)
 
             # Load stats and normalize
             stats_fname = self.get_save_name_from_kwargs(
                 mode=mode, file="statistics", kwargs=fname_kwargs
             )
-            print(stats_fname)
+            #print(stats_fname)
             stats = self.load_dataset_statistics(
                 os.path.join(self.output_save_dir, stats_fname),
                 mode=self.mode,
@@ -705,7 +685,7 @@ def __init__(
                 )
 
                 if os.path.isfile(stats_fname):
-                    print("Stats file already exists! Loading from mempory.")
+                    #print("Stats file already exists! Loading from mempory.")
                     stats = self.load_statistics_data(stats_fname)
                     self.norm_data = self.normalize_data(self.raw_data, stats)
 
diff --git a/emulator/src/data/super_climate_dataset.py b/emulator/src/data/super_climate_dataset.py
@@ -1,32 +1,21 @@
-import copy
-import logging
 import os
 import glob
-import pickle
-import shutil
 import zipfile
-from typing import Dict, Optional, List, Callable, Tuple, Union
-import copy
+from typing import Dict, Optional, List, Tuple, Union
 import numpy as np
 import xarray as xr
 import torch
-from torch import Tensor
-import threading
 
 
 from emulator.src.utils.utils import get_logger, all_equal, map_variables_targetmip
 from emulator.src.data.constants import (
     LON,
     LAT,
     SEQ_LEN,
-    INPUT4MIPS_TEMP_RES,
     CMIP6_TEMP_RES,
-    INPUT4MIPS_NOM_RES,
     CMIP6_NOM_RES,
     DATA_DIR,
-    OPENBURNING_MODEL_MAPPING,
     NO_OPENBURNING_VARS,
-    AVAILABLE_MODELS_FIRETYPE,
 )
 log = get_logger()
 from abc import ABC, abstractmethod
@@ -267,9 +256,9 @@ def get_dataset_statistics(self, data: np.ndarray, mode: str, norm_type: str = "
             elif norm_type == "minmax":
                 return self.get_min_max(data)
             else:
-                print(f"Normalization of type {norm_type} has not been implemented!")
+                raise NotImplementedError(f"Normalization of type {norm_type} has not been implemented!")
         else:
-            print("In testing mode, skipping statistics calculations.")
+            log.warning("In testing mode, skipping statistics calculations.")
 
     def get_mean_std(self, data: np.ndarray):
         """
@@ -329,8 +318,7 @@ def __len__(self) -> int:
         elif self.mode == 'val':
             return len(self.index_manager.val_indexes)
         else:
-            print(f"Unknown mode: {self.mode}")
-            raise ValueError
+            raise ValueError(f"Unknown mode: {self.mode}")
     
 
 class SuperClimateDataset(ABC_Climate_Dataset):
@@ -473,10 +461,7 @@ def get_years_list(self, years: str, give_list: Optional[bool] = False):
 
         """
         if len(years) != 9:
-            log.warn(
-                "Years string must be in the format xxxx-yyyy eg. 2015-2100 with string length 9. Please check the year string."
-            )
-            raise ValueError
+            raise ValueError("Years string must be in the format xxxx-yyyy eg. 2015-2100 with string length 9. Please check the year string.")
         splits = years.split("-")
         min_year, max_year = int(splits[0]), int(splits[1])
 
@@ -519,8 +504,7 @@ def __getitem__(self, index):  # Dict[str, Tensor]):
         return X, Y, model_id
 
     def __str__(self):
-        s = f" Super Emulator dataset: {len(self.index_manager.climate_models)} climate models with {self.index_manager.num_ensembles} ensemble members and {self.n_years} years used, with a total size of {len(self)} examples (in, out)."
-        return s
+        return f" Super Emulator dataset: {len(self.index_manager.climate_models)} climate models with {self.index_manager.num_ensembles} ensemble members and {self.n_years} years used, with a total size of {len(self)} examples (in, out)."
 
     
     def __len__(self):
@@ -531,8 +515,7 @@ def __len__(self):
         # elif self.mode=='train+val':
         #     return self.get_initial_length()
         else:
-            print("Unknown mode.", self.mode)
-            raise ValueError
+            raise ValueError(f"Unknown mode: {self.mode}")
 
 
 
@@ -588,7 +571,7 @@ def __init__(  # inherits all the stuff from Base
             os.path.join(output_save_dir, fname)
         ):  # we first need to get the name here to test that...
             self.data_path = os.path.join(output_save_dir, fname)
-            print("path exists, reloading")
+            #print("path exists, reloading")
             self.Data = self._reload_data(self.data_path)
 
             # Load stats and normalize
@@ -621,16 +604,7 @@ def __init__(  # inherits all the stuff from Base
                         )
                         files = glob.glob(var_dir + f"/*.nc", recursive=True)
                         if len(files) == 0:
-                            print(
-                                "No files for this climate model, ensemble member, var, year ,scenario:",
-                                climate_model,
-                                data_dir.split("/")[-1],
-                                var,
-                                y,
-                                exp,
-                            )
-                            print("Exiting! Please fix the data issue.")
-                            exit(0)
+                            raise FileNotFoundError(f"No files for climate model {climate_model}, ensemble member {data_dir.split("/")[-1]}, var {var}, year {y}, scenario {exp}. Please check if climate model runs for this exact pairing actually exist.")
                         # loads all years! implement splitting
                         output_nc_files += files
                 files_per_var.append(output_nc_files)
@@ -648,7 +622,7 @@ def __init__(  # inherits all the stuff from Base
                 )
 
                 if os.path.isfile(fname):
-                    print("Stats file already exists! Loading from memory.")
+                    #print("Stats file already exists! Loading from memory.")
                     stats = self.load_statistics_data(stats_fname)
                     self.norm_data = self.normalize_data(self.raw_data, stats)
 
@@ -659,7 +633,7 @@ def __init__(  # inherits all the stuff from Base
                     stats = {"mean": stat1, "std": stat2}
                     self.norm_data = self.normalize_data(self.raw_data, stats)
                     save_file_name = self.write_dataset_statistics(stats_fname, stats)
-                    print("WROTE STATISTICS", save_file_name)
+                    #print("WROTE STATISTICS", save_file_name)
 
                 self.norm_data = self.normalize_data(self.raw_data, stats)
 
@@ -742,7 +716,7 @@ def __init__(  # inherits all the stuff from Base
             os.path.join(output_save_dir, fname)
         ):  # we first need to get the name here to test that...
             self.data_path = os.path.join(output_save_dir, fname)
-            print("path exists, reloading")
+            #print("path exists, reloading")
             self.Data = self._reload_data(self.data_path)
 
             # Load stats and normalize
@@ -813,7 +787,7 @@ def __init__(  # inherits all the stuff from Base
                 )
 
                 if os.path.isfile(stats_fname):
-                    print("Stats file already exists! Loading from mempory.")
+                    #print("Stats file already exists! Loading from mempory.")
                     stats = self.load_statistics_data(stats_fname)
                     self.norm_data = self.normalize_data(self.raw_data, stats)
 
diff --git a/emulator/src/datamodules/climate_datamodule.py b/emulator/src/datamodules/climate_datamodule.py
@@ -104,7 +104,7 @@ def __init__(
             for model in self.test_models
         ]
         self.emissions_tracker = self.hparams.emissions_tracker
-        print("Test Sets: ", self.test_set_names)
+        #print("Test Sets: ", self.test_set_names)
 
         self._data_train = None
         self._data_val = None
diff --git a/emulator/src/utils/interface.py b/emulator/src/utils/interface.py
@@ -112,6 +112,8 @@ def get_datamodule(config: DictConfig) -> DummyDataModule:
     #    _recursive_=False
     # )
 
+    # hydra automatically instantiates the right class type (specified in the config)
+    # to test this you can run print(type(data_module).__name__)
     data_module: DummyDataModule = hydra.utils.instantiate(
         config.datamodule,
         # input_transform=config.model.get("input_transform"),

Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ def __init__(`
`104`	`104`	`for model in self.test_models`
`105`	`105`	`]`
`106`	`106`	`self.emissions_tracker = self.hparams.emissions_tracker`
`107`		`- print("Test Sets: ", self.test_set_names)`
	`107`	`+ #print("Test Sets: ", self.test_set_names)`
`108`	`108`
`109`	`109`	`self._data_train = None`
`110`	`110`	`self._data_val = None`