diff --git a/.gitignore b/.gitignore
index ce45a93..8b8bfc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .idea
 mlruns
+.aim
 
 examples/data/
 examples/*/output*/
diff --git a/amlrt_project/data/constants.py b/amlrt_project/data/constants.py
new file mode 100644
index 0000000..f6845bd
--- /dev/null
+++ b/amlrt_project/data/constants.py
@@ -0,0 +1,4 @@
+TENSORBOARD = 'tensorboard'
+AIM = 'aim'
+LOG_FOLDER = 'log_folder'
+EXP_LOGGERS = 'experiment_loggers'
diff --git a/amlrt_project/train.py b/amlrt_project/train.py
index 501c63e..e7d6275 100644
--- a/amlrt_project/train.py
+++ b/amlrt_project/train.py
@@ -16,7 +16,10 @@
 from amlrt_project.models.model_loader import load_model
 from amlrt_project.utils.file_utils import rsync_folder
 from amlrt_project.utils.hp_utils import check_and_log_hp
-from amlrt_project.utils.logging_utils import LoggerWriter, log_exp_details
+from amlrt_project.utils.logging_utils import (LoggerWriter,
+                                               load_experiment_loggers,
+                                               log_exp_details,
+                                               log_hyper_parameters)
 from amlrt_project.utils.reproducibility_utils import set_seed
 
 logger = logging.getLogger(__name__)
@@ -34,7 +37,6 @@ def main():
 
     """
     parser = argparse.ArgumentParser()
-    # __TODO__ check you need all the following CLI parameters
     parser.add_argument('--log', help='log to this file (in addition to stdout/err)')
     parser.add_argument('--config',
                         help='config file with generic hyper-parameters, such as optimizer, '
@@ -48,7 +50,7 @@ def main():
     parser.add_argument('--disable-progressbar', action='store_true',
                         help='will disable the progressbar while going over the mini-batch')
     parser.add_argument('--start-from-scratch', action='store_true',
-                        help='will not load any existing saved model - even if present')
+                        help='will delete the output folder before starting the experiment')
     parser.add_argument('--gpus', default=None,
                         help='list of GPUs to use. If not specified, runs on CPU.'
                              'Example of GPU usage: 1 means run on GPU 1, 0 on GPU 0.')
@@ -58,7 +60,7 @@ def main():
     logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
     if os.path.exists(args.output) and args.start_from_scratch:
-        logger.info('Starting from scratch, removing any previous experiments.')
+        logger.info('Starting from scratch, deleting the previous output folder.')
         shutil.rmtree(args.output)
 
     if os.path.exists(args.output):
@@ -193,25 +195,24 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar, gpus):
         patience=early_stopping_params['patience'],
         verbose=use_progress_bar)
 
-    logger = pl.loggers.TensorBoardLogger(
-        save_dir=output,
-        default_hp_metric=False,
-        version=0,  # Necessary to resume tensorboard logging
-    )
+    name2loggers = load_experiment_loggers(hyper_params, output)
+    log_hyper_parameters(name2loggers, hyper_params)
 
     trainer = pl.Trainer(
         callbacks=[early_stopping, best_checkpoint_callback, last_checkpoint_callback],
         max_epochs=hyper_params['max_epoch'],
         resume_from_checkpoint=resume_from_checkpoint,
         gpus=gpus,
-        logger=logger,
+        logger=name2loggers.values()
     )
 
     trainer.fit(model, datamodule=datamodule)
 
     # Log the best result and associated hyper parameters
     best_dev_result = float(early_stopping.best_score.cpu().numpy())
 
-    logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result})
+    # logging hyper-parameters again - this time also passing the final result
+    log_hyper_parameters(name2loggers, hyper_params, best_dev_result)
+    # logging to file
     with open(os.path.join(output, 'results.txt'), 'w') as stream_out:
         stream_out.write(f'final best_dev_metric: {best_dev_result}\n')
diff --git a/amlrt_project/utils/aim_logger_utils.py b/amlrt_project/utils/aim_logger_utils.py
new file mode 100644
index 0000000..6a3ac54
--- /dev/null
+++ b/amlrt_project/utils/aim_logger_utils.py
@@ -0,0 +1,83 @@
+import os
+
+import yaml
+
+if os.name != 'nt':
+    # not using AIM on Windows
+    from aim.pytorch_lightning import AimLogger
+
+from amlrt_project.data.constants import LOG_FOLDER
+
+AIM_INFO_FILE_NAME = "aim_info.yaml"
+
+
+def prepare_aim_logger(hyper_params, options, output):
+    """Create the aim logger - make sure to track on the same experiment if resuming one."""
+    if LOG_FOLDER not in options:
+        raise ValueError('please set log_folder in the config file to use aim')
+    aim_run_info_dict = retrieve_aim_run_info(
+        output, hyper_params["exp_name"], options[LOG_FOLDER],
+    )
+    aim_logger = AimLogger(
+        run_name=aim_run_info_dict["run_name"] if aim_run_info_dict else None,
+        run_hash=aim_run_info_dict["run_hash"] if aim_run_info_dict else None,
+        experiment=hyper_params["exp_name"],
+        repo=options[LOG_FOLDER],
+        train_metric_prefix="train__",
+        val_metric_prefix="val__",
+    )
+    # get the orion trial id if using orion - if set, it will be used as the run name
+    orion_trial_id = os.environ.get("ORION_TRIAL_ID")
+    if orion_trial_id:
+        aim_logger.experiment.name = orion_trial_id
+    save_aim_run_info(
+        aim_logger.experiment.name,
+        aim_logger.experiment.hash,
+        output,
+        hyper_params["exp_name"],
+        options[LOG_FOLDER],
+    )
+    return aim_logger
+
+
+def save_aim_run_info(
+    run_name: str,
+    run_hash: str,
+    output: str,
+    experiment: str,
+    repo: str,
+):
+    """Save aim_run_info_dict to the output dir."""
+    aim_run_info_dict = {
+        "experiment": experiment,
+        "aim_dir": repo,
+        "run_name": run_name,
+        "run_hash": run_hash,
+    }
+    with open(os.path.join(output, AIM_INFO_FILE_NAME), "w") as file:
+        yaml.dump(aim_run_info_dict, file)
+
+
+def retrieve_aim_run_info(
+    output: str,
+    experiment: str,
+    repo: str,
+):
+    """Retrieve aim_run_info_dict from the previous run's output dir."""
+    if os.path.exists(os.path.join(output, AIM_INFO_FILE_NAME)):
+        # the output folder exists and aim_info.yaml exists under it:
+        # this means the current run is not starting from scratch,
+        # so we try to load aim_info.yaml to resume the previous run
+        with open(os.path.join(output, AIM_INFO_FILE_NAME), "r") as file:
+            aim_run_info_dict = yaml.load(file, Loader=yaml.FullLoader)
+        if (experiment != aim_run_info_dict["experiment"]) or (
+            repo != aim_run_info_dict["aim_dir"]
+        ):
+            # the experiment name or the aim logging directory changed:
+            # either way this is a different run,
+            # so do not resume the previous aim run
+            aim_run_info_dict = None
+    else:
+        aim_run_info_dict = None
+
+    return aim_run_info_dict
diff --git a/amlrt_project/utils/logging_utils.py b/amlrt_project/utils/logging_utils.py
index 8b4367b..b7d6958 100644
--- a/amlrt_project/utils/logging_utils.py
+++ b/amlrt_project/utils/logging_utils.py
@@ -2,9 +2,13 @@
 import os
 import socket
 
+import pytorch_lightning as pl
 from git import InvalidGitRepositoryError, Repo
 from pip._internal.operations import freeze
 
+from amlrt_project.data.constants import AIM, EXP_LOGGERS, TENSORBOARD
+from amlrt_project.utils.aim_logger_utils import prepare_aim_logger
+
 logger = logging.getLogger(__name__)
 
 
@@ -69,3 +73,42 @@ def log_exp_details(script_location, args):  # pragma: no cover
         hostname, git_hash, args.data, os.path.abspath(args.data),
         '\n'.join(dependencies))
     logger.info('Experiment info:' + details + '\n')
+
+
+def load_experiment_loggers(
+        hyper_params: dict,
+        output: str):
+    """Prepare and load the loggers for this experiment.
+
+    :param hyper_params: the experiment hyper-parameters
+    :param output: the output folder
+    :return: a dict mapping each logger name to the associated logger
+    """
+    name2loggers = {}
+    for logger_name, options in hyper_params[EXP_LOGGERS].items():
+        if logger_name == TENSORBOARD:
+            tb_logger = pl.loggers.TensorBoardLogger(
+                save_dir=output,
+                default_hp_metric=False,
+                version=0,  # Necessary to resume tensorboard logging
+            )
+            name2loggers[TENSORBOARD] = tb_logger
+        elif logger_name == AIM:
+            if os.name == 'nt':
+                logger.warning("AIM logger is not supported on Windows - skipping it")
+                continue
+            aim_logger = prepare_aim_logger(hyper_params, options, output)
+            name2loggers[AIM] = aim_logger
+        else:
+            raise NotImplementedError(f"logger {logger_name} is not supported")
+    return name2loggers
+
+
+def log_hyper_parameters(name2loggers, hyper_params, best_dev_result=None):
+    """Log the experiment hyper-parameters to all the loggers."""
+    for name, exp_logger in name2loggers.items():
+        if name == AIM:
+            exp_logger.log_hyperparams(hyper_params)
+        elif name == TENSORBOARD:
+            if best_dev_result is not None:
+                exp_logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result})
diff --git a/examples/local/config.yaml b/examples/local/config.yaml
index 6fd9834..96c72e7 100644
--- a/examples/local/config.yaml
+++ b/examples/local/config.yaml
@@ -9,6 +9,12 @@ num_workers: 0
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 256
 num_classes: 10
@@ -18,4 +24,4 @@ architecture: simple_mlp
 early_stopping:
   metric: val_loss
   mode: min
-  patience: 3
\ No newline at end of file
+  patience: 3
diff --git a/examples/local/run.sh b/examples/local/run.sh
index feb7704..ae13f36 100644
--- a/examples/local/run.sh
+++ b/examples/local/run.sh
@@ -1,2 +1,3 @@
+set -e
 amlrt-train --data ../data --output output --config config.yaml --start-from-scratch
 amlrt-eval --data ../data --config config.yaml --ckpt-path output/best_model/model.ckpt
diff --git a/examples/local_orion/config.yaml b/examples/local_orion/config.yaml
index 86a9181..7f54b1d 100644
--- a/examples/local_orion/config.yaml
+++ b/examples/local_orion/config.yaml
@@ -9,6 +9,12 @@ num_workers: 0
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 'orion~uniform(32,256,discrete=True)'
 num_classes: 10
diff --git a/examples/local_orion/run.sh b/examples/local_orion/run.sh
index 4d5e0a1..a6ef96d 100644
--- a/examples/local_orion/run.sh
+++ b/examples/local_orion/run.sh
@@ -1,3 +1,4 @@
+set -e
 export ORION_DB_ADDRESS='orion_db.pkl'
 export ORION_DB_TYPE='pickleddb'
 
diff --git a/examples/slurm/config.yaml b/examples/slurm/config.yaml
index 71a5b57..96c05d7 100644
--- a/examples/slurm/config.yaml
+++ b/examples/slurm/config.yaml
@@ -8,6 +8,12 @@ exp_name: my_exp_1
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 hidden_dim: 256
 num_classes: 10
diff --git a/examples/slurm_orion/config.yaml b/examples/slurm_orion/config.yaml
index 6c9346c..6250dea 100644
--- a/examples/slurm_orion/config.yaml
+++ b/examples/slurm_orion/config.yaml
@@ -8,6 +8,12 @@ exp_name: my_exp_1
 # results will not be reproducible)
 seed: 1234
 
+experiment_loggers:
+  tensorboard: null  # no parameters for tensorboard
+  aim:
+    # change this to an absolute path to always use the same aim db file
+    log_folder: ./
+
 # architecture
 num_classes: 10
 architecture: simple_mlp
diff --git a/setup.py b/setup.py
index 72bfe90..4936102 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,7 @@
     packages=find_packages(include=['amlrt_project', 'amlrt_project.*']),
     python_requires='>=3.9',
    install_requires=[
+        'aim==3.18.1; os_name!="nt"',
         'flake8==4.0.1',
         'flake8-docstrings==1.6.0',
         'gitpython==3.1.27',
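
For context, here is a minimal sketch (not part of the diff) of how the new `experiment_loggers` config section is consumed by `load_experiment_loggers` and `log_hyper_parameters`. It only exercises the tensorboard branch, so no aim repo is needed; the temp folder is a stand-in for the usual `--output` directory:

    import tempfile

    from amlrt_project.utils.logging_utils import (load_experiment_loggers,
                                                   log_hyper_parameters)

    # mirrors the `experiment_loggers` section added to examples/*/config.yaml
    hyper_params = {
        'exp_name': 'my_exp_1',
        'experiment_loggers': {
            'tensorboard': None,  # no parameters for tensorboard
        },
    }

    output = tempfile.mkdtemp()  # stand-in for the experiment output folder
    name2loggers = load_experiment_loggers(hyper_params, output)

    # before training: logs hyper-parameters to aim only (tensorboard waits for the metric)
    log_hyper_parameters(name2loggers, hyper_params)
    # after training: also passes the final result, so tensorboard logs the hyper-parameters too
    log_hyper_parameters(name2loggers, hyper_params, best_dev_result=0.123)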
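
Similarly, a small sketch of the resume bookkeeping in `aim_logger_utils.py` (again not part of the diff; the `output` folder name and run name/hash values are illustrative):

    import os

    from amlrt_project.utils.aim_logger_utils import (retrieve_aim_run_info,
                                                      save_aim_run_info)

    os.makedirs('output', exist_ok=True)
    # first run: persist the aim run name/hash next to the checkpoints
    save_aim_run_info(run_name='trial-123', run_hash='abc123', output='output',
                      experiment='my_exp_1', repo='./')

    # resumed run: same output folder, experiment and repo, so the saved
    # name/hash come back and AimLogger picks up the same run
    info = retrieve_aim_run_info(output='output', experiment='my_exp_1', repo='./')
    assert info == {'experiment': 'my_exp_1', 'aim_dir': './',
                    'run_name': 'trial-123', 'run_hash': 'abc123'}

    # a different repo means a different run: nothing to resume
    assert retrieve_aim_run_info('output', 'my_exp_1', '/tmp/aim') is None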