Merge pull request #115 from mila-iqia/add_aim

Add aimlogger
mila-iqia · Mar 15, 2024 · 9614da9 · 9614da9
2 parents cb36f93 + d5a9a68
commit 9614da9
Show file tree

Hide file tree

Showing 12 changed files with 171 additions and 12 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 .idea
 mlruns
+.aim
 
 examples/data/
 examples/*/output*/

diff --git a/amlrt_project/data/constants.py b/amlrt_project/data/constants.py
@@ -0,0 +1,4 @@
+TENSORBOARD = 'tensorboard'  # logger name, matched against the keys found under EXP_LOGGERS
+AIM = 'aim'  # logger name, matched against the keys found under EXP_LOGGERS
+LOG_FOLDER = 'log_folder'  # option key (required by the aim logger) giving the aim repo folder
+EXP_LOGGERS = 'experiment_loggers'  # hyper-parameter key holding the loggers and their options
diff --git a/amlrt_project/train.py b/amlrt_project/train.py
@@ -16,7 +16,10 @@
 from amlrt_project.models.model_loader import load_model
 from amlrt_project.utils.file_utils import rsync_folder
 from amlrt_project.utils.hp_utils import check_and_log_hp
-from amlrt_project.utils.logging_utils import LoggerWriter, log_exp_details
+from amlrt_project.utils.logging_utils import (LoggerWriter,
+                                               load_experiment_loggers,
+                                               log_exp_details,
+                                               log_hyper_parameters)
 from amlrt_project.utils.reproducibility_utils import set_seed
 
 logger = logging.getLogger(__name__)
@@ -34,7 +37,6 @@ def main():
 
     """
     parser = argparse.ArgumentParser()
-    # __TODO__ check you need all the following CLI parameters
     parser.add_argument('--log', help='log to this file (in addition to stdout/err)')
     parser.add_argument('--config',
                         help='config file with generic hyper-parameters,  such as optimizer, '
@@ -48,7 +50,7 @@ def main():
     parser.add_argument('--disable-progressbar', action='store_true',
                         help='will disable the progressbar while going over the mini-batch')
     parser.add_argument('--start-from-scratch', action='store_true',
-                        help='will not load any existing saved model - even if present')
+                        help='will delete the output folder before starting the experiment')
     parser.add_argument('--gpus', default=None,
                         help='list of GPUs to use. If not specified, runs on CPU.'
                              'Example of GPU usage: 1 means run on GPU 1, 0 on GPU 0.')
@@ -58,7 +60,7 @@ def main():
     logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
     if os.path.exists(args.output) and args.start_from_scratch:
-        logger.info('Starting from scratch, removing any previous experiments.')
+        logger.info('Starting from scratch, deleting the previous output folder.')
         shutil.rmtree(args.output)
 
     if os.path.exists(args.output):
@@ -193,25 +195,24 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar, gpus):
         patience=early_stopping_params['patience'],
         verbose=use_progress_bar)
 
-    logger = pl.loggers.TensorBoardLogger(
-        save_dir=output,
-        default_hp_metric=False,
-        version=0,  # Necessary to resume tensorboard logging
-    )
+    name2loggers = load_experiment_loggers(hyper_params, output)
+    log_hyper_parameters(name2loggers, hyper_params)
 
     trainer = pl.Trainer(
         callbacks=[early_stopping, best_checkpoint_callback, last_checkpoint_callback],
         max_epochs=hyper_params['max_epoch'],
         resume_from_checkpoint=resume_from_checkpoint,
         gpus=gpus,
-        logger=logger,
+        logger=name2loggers.values()
     )
 
     trainer.fit(model, datamodule=datamodule)
 
     # Log the best result and associated hyper parameters
     best_dev_result = float(early_stopping.best_score.cpu().numpy())
-    logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result})
+    # logging hyper-parameters again - this time also passing the final result
+    log_hyper_parameters(name2loggers, hyper_params, best_dev_result)
+    # logging to file
     with open(os.path.join(output, 'results.txt'), 'w') as stream_out:
         stream_out.write(f'final best_dev_metric: {best_dev_result}\n')
 

diff --git a/amlrt_project/utils/aim_logger_utils.py b/amlrt_project/utils/aim_logger_utils.py
@@ -0,0 +1,83 @@
+import os
+
+import yaml
+
+if os.name != 'nt':
+    # not using AIM on Windows
+    from aim.pytorch_lightning import AimLogger
+
+from amlrt_project.data.constants import LOG_FOLDER
+
+AIM_INFO_FILE_NAME = "aim_info.yaml"
+
+
+def prepare_aim_logger(hyper_params, options, output):
+    """Create the aim logger - make sure to track on the same experiment if resuming one.
+
+    :param hyper_params: the experiment hyper-parameters (the "exp_name" entry is used)
+    :param options: the aim options from the config file (must contain log_folder)
+    :param output: the output folder, where the aim run info file is read and written
+    :return: the aim logger
+    :raises ValueError: if log_folder is not set in the aim options
+    """
+    # options is None when the config file has an `aim:` entry with no content -
+    # report it the same way as a missing log_folder instead of failing with a TypeError
+    if not options or LOG_FOLDER not in options:
+        raise ValueError('please set log_folder in config file to use aim')
+    exp_name = hyper_params["exp_name"]
+    repo = options[LOG_FOLDER]
+    # None when there is no previous (compatible) aim run to resume
+    aim_run_info_dict = retrieve_aim_run_info(output, exp_name, repo)
+    aim_logger = AimLogger(
+        run_name=aim_run_info_dict["run_name"] if aim_run_info_dict else None,
+        run_hash=aim_run_info_dict["run_hash"] if aim_run_info_dict else None,
+        experiment=exp_name,
+        repo=repo,
+        train_metric_prefix="train__",
+        val_metric_prefix="val__",
+    )
+    # get orion trial id if using orion - if yes, this will be used as the run name
+    orion_trial_id = os.environ.get("ORION_TRIAL_ID")
+    if orion_trial_id:
+        aim_logger.experiment.name = orion_trial_id
+    # save the run name/hash in the output folder, so that they can be found when resuming
+    save_aim_run_info(
+        aim_logger.experiment.name,
+        aim_logger.experiment.hash,
+        output,
+        exp_name,
+        repo,
+    )
+    return aim_logger
+
+
+def save_aim_run_info(
+    run_name: str,
+    run_hash: str,
+    output: str,
+    experiment: str,
+    repo: str,
+):
+    """Write the aim run info (experiment, aim dir, run name, run hash) as yaml under output.
+
+    :param run_name: the name of the aim run
+    :param run_hash: the hash of the aim run
+    :param output: the output folder where the info file is written
+    :param experiment: the experiment name
+    :param repo: the aim logging directory
+    """
+    info_path = os.path.join(output, AIM_INFO_FILE_NAME)
+    run_info = {
+        "run_name": run_name,
+        "run_hash": run_hash,
+        "experiment": experiment,
+        "aim_dir": repo,
+    }
+    with open(info_path, "w") as stream:
+        yaml.dump(run_info, stream)
+
+
+def retrieve_aim_run_info(
+    output: str,
+    experiment: str,
+    repo: str,
+):
+    """Retrieve aim_run_info_dict from previous run's output dir.
+
+    :param output: the output folder where the aim info file may have been saved
+    :param experiment: the experiment name of the current run
+    :param repo: the aim logging directory of the current run
+    :return: the saved aim run info dict, or None if there is no previous run to resume
+    """
+    aim_info_path = os.path.join(output, AIM_INFO_FILE_NAME)
+    if not os.path.exists(aim_info_path):
+        # no aim info file under output - nothing to resume
+        return None
+
+    # output exists and the aim info file exists under output
+    # this means current run is not starting from scratch
+    # so we will try to load the aim info file to resume the previous run
+    with open(aim_info_path, "r") as file:
+        aim_run_info_dict = yaml.load(file, Loader=yaml.FullLoader)
+
+    if not isinstance(aim_run_info_dict, dict):
+        # empty or malformed info file (e.g., an interrupted write) - nothing usable to resume
+        return None
+
+    if (experiment != aim_run_info_dict.get("experiment")) or (
+        repo != aim_run_info_dict.get("aim_dir")
+    ):
+        # if the experiment changes or the aim logging directory changes
+        # either of these means the run is different
+        # so we will not resume the previous run for aim
+        return None
+
+    return aim_run_info_dict
diff --git a/amlrt_project/utils/logging_utils.py b/amlrt_project/utils/logging_utils.py
@@ -2,9 +2,13 @@
 import os
 import socket
 
+import pytorch_lightning as pl
 from git import InvalidGitRepositoryError, Repo
 from pip._internal.operations import freeze
 
+from amlrt_project.data.constants import AIM, EXP_LOGGERS, TENSORBOARD
+from amlrt_project.utils.aim_logger_utils import prepare_aim_logger
+
 logger = logging.getLogger(__name__)
 
 
@@ -69,3 +73,42 @@ def log_exp_details(script_location, args):  # pragma: no cover
                   hostname, git_hash, args.data, os.path.abspath(args.data),
                   '\n'.join(dependencies))
     logger.info('Experiment info:' + details + '\n')
+
+
+def load_experiment_loggers(
+        hyper_params: dict,
+        output: str):
+    """Build every experiment logger requested in the hyper-parameters.
+
+    :param hyper_params: the experiment hyper-parameters
+    :param output: the output folder
+    :return: a dict mapping each logger name to the associated logger
+    :raises NotImplementedError: if a requested logger is not supported
+    """
+    experiment_loggers = {}
+    for logger_name, options in hyper_params[EXP_LOGGERS].items():
+        if logger_name not in (TENSORBOARD, AIM):
+            raise NotImplementedError(f"logger {logger_name} is not supported")
+
+        if logger_name == AIM:
+            if os.name == 'nt':
+                logger.warning("AIM logger is not supported on Windows, skipped")
+            else:
+                experiment_loggers[AIM] = prepare_aim_logger(hyper_params, options, output)
+            continue
+
+        # only tensorboard is left at this point
+        experiment_loggers[TENSORBOARD] = pl.loggers.TensorBoardLogger(
+            save_dir=output,
+            default_hp_metric=False,
+            version=0,  # Necessary to resume tensorboard logging
+        )
+    return experiment_loggers
+
+
+def log_hyper_parameters(name2loggers, hyper_params, best_dev_result=None):
+    """Send the experiment hyper-parameters to each of the given loggers.
+
+    :param name2loggers: a dict mapping each logger name to the associated logger
+    :param hyper_params: the experiment hyper-parameters
+    :param best_dev_result: the best dev result, if available (only passed to tensorboard)
+    """
+    for logger_name, exp_logger in name2loggers.items():
+        if logger_name == TENSORBOARD:
+            # tensorboard is only logged to once the final result is available
+            if best_dev_result is None:
+                continue
+            exp_logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result})
+        elif logger_name == AIM:
+            exp_logger.log_hyperparams(hyper_params)
diff --git a/examples/local/config.yaml b/examples/local/config.yaml
@@ -9,6 +9,12 @@ num_workers: 0
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 256
 num_classes: 10
@@ -18,4 +24,4 @@ architecture: simple_mlp
 early_stopping:
   metric: val_loss
   mode: min
-  patience: 3
+  patience: 3
diff --git a/examples/local/run.sh b/examples/local/run.sh
@@ -1,2 +1,3 @@
+set -e
 amlrt-train --data ../data --output output --config config.yaml --start-from-scratch
 amlrt-eval --data ../data --config config.yaml --ckpt-path output/best_model/model.ckpt
diff --git a/examples/local_orion/config.yaml b/examples/local_orion/config.yaml
@@ -9,6 +9,12 @@ num_workers: 0
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 'orion~uniform(32,256,discrete=True)'
 num_classes: 10

diff --git a/examples/local_orion/run.sh b/examples/local_orion/run.sh
@@ -1,3 +1,4 @@
+set -e
 export ORION_DB_ADDRESS='orion_db.pkl'
 export ORION_DB_TYPE='pickleddb'
 

diff --git a/examples/slurm/config.yaml b/examples/slurm/config.yaml
@@ -8,6 +8,12 @@ exp_name: my_exp_1
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 256
 num_classes: 10

diff --git a/examples/slurm_orion/config.yaml b/examples/slurm_orion/config.yaml
@@ -8,6 +8,12 @@ exp_name: my_exp_1
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 num_classes: 10
 architecture: simple_mlp

diff --git a/setup.py b/setup.py
@@ -6,6 +6,7 @@
     packages=find_packages(include=['amlrt_project', 'amlrt_project.*']),
     python_requires='>=3.9',
     install_requires=[
+        'aim==3.18.1; os_name!="nt"',
         'flake8==4.0.1',
         'flake8-docstrings==1.6.0',
         'gitpython==3.1.27',