Skip to content

Commit

Permalink
Merge pull request #115 from mila-iqia/add_aim
Browse files Browse the repository at this point in the history
Add aimlogger
  • Loading branch information
mirkobronzi authored Mar 15, 2024
2 parents cb36f93 + d5a9a68 commit 9614da9
Show file tree
Hide file tree
Showing 12 changed files with 171 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.idea
mlruns
.aim

examples/data/
examples/*/output*/
Expand Down
4 changes: 4 additions & 0 deletions amlrt_project/data/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
TENSORBOARD = 'tensorboard'
AIM = 'aim'
LOG_FOLDER = 'log_folder'
EXP_LOGGERS = 'experiment_loggers'
23 changes: 12 additions & 11 deletions amlrt_project/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from amlrt_project.models.model_loader import load_model
from amlrt_project.utils.file_utils import rsync_folder
from amlrt_project.utils.hp_utils import check_and_log_hp
from amlrt_project.utils.logging_utils import LoggerWriter, log_exp_details
from amlrt_project.utils.logging_utils import (LoggerWriter,
load_experiment_loggers,
log_exp_details,
log_hyper_parameters)
from amlrt_project.utils.reproducibility_utils import set_seed

logger = logging.getLogger(__name__)
Expand All @@ -34,7 +37,6 @@ def main():
"""
parser = argparse.ArgumentParser()
# __TODO__ check you need all the following CLI parameters
parser.add_argument('--log', help='log to this file (in addition to stdout/err)')
parser.add_argument('--config',
help='config file with generic hyper-parameters, such as optimizer, '
Expand All @@ -48,7 +50,7 @@ def main():
parser.add_argument('--disable-progressbar', action='store_true',
help='will disable the progressbar while going over the mini-batch')
parser.add_argument('--start-from-scratch', action='store_true',
help='will not load any existing saved model - even if present')
help='will delete the output folder before starting the experiment')
parser.add_argument('--gpus', default=None,
help='list of GPUs to use. If not specified, runs on CPU.'
'Example of GPU usage: 1 means run on GPU 1, 0 on GPU 0.')
Expand All @@ -58,7 +60,7 @@ def main():
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

if os.path.exists(args.output) and args.start_from_scratch:
logger.info('Starting from scratch, removing any previous experiments.')
logger.info('Starting from scratch, deleting the previous output folder.')
shutil.rmtree(args.output)

if os.path.exists(args.output):
Expand Down Expand Up @@ -193,25 +195,24 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar, gpus):
patience=early_stopping_params['patience'],
verbose=use_progress_bar)

logger = pl.loggers.TensorBoardLogger(
save_dir=output,
default_hp_metric=False,
version=0, # Necessary to resume tensorboard logging
)
name2loggers = load_experiment_loggers(hyper_params, output)
log_hyper_parameters(name2loggers, hyper_params)

trainer = pl.Trainer(
callbacks=[early_stopping, best_checkpoint_callback, last_checkpoint_callback],
max_epochs=hyper_params['max_epoch'],
resume_from_checkpoint=resume_from_checkpoint,
gpus=gpus,
logger=logger,
logger=name2loggers.values()
)

trainer.fit(model, datamodule=datamodule)

# Log the best result and associated hyper parameters
best_dev_result = float(early_stopping.best_score.cpu().numpy())
logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result})
# logging hyper-parameters again - this time also passing the final result
log_hyper_parameters(name2loggers, hyper_params, best_dev_result)
# logging to file
with open(os.path.join(output, 'results.txt'), 'w') as stream_out:
stream_out.write(f'final best_dev_metric: {best_dev_result}\n')

Expand Down
83 changes: 83 additions & 0 deletions amlrt_project/utils/aim_logger_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os

import yaml

if os.name != 'nt':
# not using AIM on Windows
from aim.pytorch_lightning import AimLogger

from amlrt_project.data.constants import LOG_FOLDER

AIM_INFO_FILE_NAME = "aim_info.yaml"


def prepare_aim_logger(hyper_params, options, output):
    """Create the aim logger - make sure to track on the same experiment if resuming one.

    :param hyper_params: the experiment hyper-parameters; "exp_name" is read from it
    :param options: the aim section of the config; must contain the log folder entry
    :param output: the output folder, where the aim run info file is read and written
    :return: the AimLogger instance
    :raises ValueError: if options is empty/None or does not define the log folder
    """
    # `aim: null` (or an empty `aim:` entry) in the config gives options=None:
    # report it with the same explicit message instead of failing on `in None`.
    if not options or LOG_FOLDER not in options:
        raise ValueError('please set log_folder in config file to use aim')

    experiment = hyper_params["exp_name"]
    repo = options[LOG_FOLDER]

    # None when there is no compatible previous run to resume
    previous_run = retrieve_aim_run_info(output, experiment, repo) or {}
    aim_logger = AimLogger(
        run_name=previous_run.get("run_name"),
        run_hash=previous_run.get("run_hash"),
        experiment=experiment,
        repo=repo,
        train_metric_prefix="train__",
        val_metric_prefix="val__",
    )

    # get orion trial id if using orion - if yes, this will be used as the run name
    orion_trial_id = os.environ.get("ORION_TRIAL_ID")
    if orion_trial_id:
        aim_logger.experiment.name = orion_trial_id

    # persist the run identifiers so that a later call can resume this run
    save_aim_run_info(
        aim_logger.experiment.name,
        aim_logger.experiment.hash,
        output,
        experiment,
        repo,
    )
    return aim_logger


def save_aim_run_info(
    run_name: str,
    run_hash: str,
    output: str,
    experiment: str,
    repo: str,
):
    """Write the aim run identifiers to a YAML file inside the output folder.

    The file is named after AIM_INFO_FILE_NAME and stores the experiment name,
    the aim repo location, the run name and the run hash.
    """
    info_path = os.path.join(output, AIM_INFO_FILE_NAME)
    run_info = dict(
        experiment=experiment,
        aim_dir=repo,
        run_name=run_name,
        run_hash=run_hash,
    )
    with open(info_path, "w") as stream_out:
        yaml.dump(run_info, stream_out)


def retrieve_aim_run_info(
    output: str,
    experiment: str,
    repo: str,
):
    """Retrieve aim_run_info_dict from previous run's output dir.

    :param output: the output folder where the aim info file may have been saved
    :param experiment: the experiment name of the current run
    :param repo: the aim logging directory of the current run
    :return: the saved info dict if it exists and matches both the experiment
        and the repo; None otherwise (nothing to resume)
    """
    info_path = os.path.join(output, AIM_INFO_FILE_NAME)
    if not os.path.exists(info_path):
        # no info file: the current run is starting from scratch
        return None

    # the info file exists under output, so the current run is not starting
    # from scratch: try to load it to resume the previous run
    with open(info_path, "r") as file:
        aim_run_info_dict = yaml.load(file, Loader=yaml.FullLoader)

    if not isinstance(aim_run_info_dict, dict):
        # an empty (or otherwise malformed) file is loaded as None / a non-dict:
        # there is nothing usable to resume from
        return None

    try:
        saved_experiment = aim_run_info_dict["experiment"]
        saved_repo = aim_run_info_dict["aim_dir"]
    except KeyError:
        # incomplete info file - cannot tell whether it is the same run
        return None

    if experiment != saved_experiment or repo != saved_repo:
        # if the experiment or the aim logging directory changed, this is a
        # different run, so the previous aim run must not be resumed
        return None

    return aim_run_info_dict
43 changes: 43 additions & 0 deletions amlrt_project/utils/logging_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
import os
import socket

import pytorch_lightning as pl
from git import InvalidGitRepositoryError, Repo
from pip._internal.operations import freeze

from amlrt_project.data.constants import AIM, EXP_LOGGERS, TENSORBOARD
from amlrt_project.utils.aim_logger_utils import prepare_aim_logger

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -69,3 +73,42 @@ def log_exp_details(script_location, args): # pragma: no cover
hostname, git_hash, args.data, os.path.abspath(args.data),
'\n'.join(dependencies))
logger.info('Experiment info:' + details + '\n')


def load_experiment_loggers(
        hyper_params: dict,
        output: str):
    """Prepare and load the loggers for this experiment.

    :param hyper_params: the experiment hyper-parameters
    :param output: the output folder
    :return: a dict containing the name and the associated logger
    :raises NotImplementedError: if a configured logger name is not supported
    """
    # An `experiment_loggers:` key left empty in the YAML config is parsed as
    # None: treat it as "no logger configured" instead of crashing on `.items()`.
    logger_configs = hyper_params[EXP_LOGGERS] or {}

    name2loggers = {}
    for logger_name, options in logger_configs.items():
        if logger_name == TENSORBOARD:
            tb_logger = pl.loggers.TensorBoardLogger(
                save_dir=output,
                default_hp_metric=False,
                version=0,  # Necessary to resume tensorboard logging
            )
            name2loggers[TENSORBOARD] = tb_logger
        elif logger_name == AIM:
            if os.name == 'nt':
                logger.warning("AIM logger is not supported on Windows, skipped")
                continue
            aim_logger = prepare_aim_logger(hyper_params, options, output)
            name2loggers[AIM] = aim_logger
        else:
            raise NotImplementedError(f"logger {logger_name} is not supported")
    return name2loggers


def log_hyper_parameters(name2loggers, hyper_params, best_dev_result=None):
    """Send the experiment hyper-parameters to every configured logger.

    The aim logger always receives the hyper-parameters. The tensorboard logger
    only receives them when best_dev_result is given, together with that value
    as the 'best_dev_metric' metric.
    """
    has_final_result = best_dev_result is not None
    for logger_name in name2loggers:
        exp_logger = name2loggers[logger_name]
        if logger_name == TENSORBOARD:
            if has_final_result:
                final_metrics = {'best_dev_metric': best_dev_result}
                exp_logger.log_hyperparams(hyper_params, metrics=final_metrics)
        elif logger_name == AIM:
            exp_logger.log_hyperparams(hyper_params)
8 changes: 7 additions & 1 deletion examples/local/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ num_workers: 0
# results will not be reproducible)
seed: 1234

experiment_loggers:
tensorboard: null # no parameters for tensorboard
aim:
# change this to an absolute path to always use the same aim db file
log_folder: ./

# architecture
hidden_dim: 256
num_classes: 10
Expand All @@ -18,4 +24,4 @@ architecture: simple_mlp
early_stopping:
metric: val_loss
mode: min
patience: 3
patience: 3
1 change: 1 addition & 0 deletions examples/local/run.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
set -e
amlrt-train --data ../data --output output --config config.yaml --start-from-scratch
amlrt-eval --data ../data --config config.yaml --ckpt-path output/best_model/model.ckpt
6 changes: 6 additions & 0 deletions examples/local_orion/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ num_workers: 0
# results will not be reproducible)
seed: 1234

experiment_loggers:
tensorboard: null # no parameters for tensorboard
aim:
# change this to an absolute path to always use the same aim db file
log_folder: ./

# architecture
hidden_dim: 'orion~uniform(32,256,discrete=True)'
num_classes: 10
Expand Down
1 change: 1 addition & 0 deletions examples/local_orion/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
set -e
export ORION_DB_ADDRESS='orion_db.pkl'
export ORION_DB_TYPE='pickleddb'

Expand Down
6 changes: 6 additions & 0 deletions examples/slurm/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ exp_name: my_exp_1
# results will not be reproducible)
seed: 1234

experiment_loggers:
tensorboard: null # no parameters for tensorboard
aim:
# change this to an absolute path to always use the same aim db file
log_folder: ./

# architecture
hidden_dim: 256
num_classes: 10
Expand Down
6 changes: 6 additions & 0 deletions examples/slurm_orion/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ exp_name: my_exp_1
# results will not be reproducible)
seed: 1234

experiment_loggers:
tensorboard: null # no parameters for tensorboard
aim:
# change this to an absolute path to always use the same aim db file
log_folder: ./

# architecture
num_classes: 10
architecture: simple_mlp
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
packages=find_packages(include=['amlrt_project', 'amlrt_project.*']),
python_requires='>=3.9',
install_requires=[
'aim==3.18.1; os_name!="nt"',
'flake8==4.0.1',
'flake8-docstrings==1.6.0',
'gitpython==3.1.27',
Expand Down

0 comments on commit 9614da9

Please sign in to comment.