Merge pull request #1283 from bghira/debug/multigpu-init-stuck-logging
add debug logging for factory initialisation in multigpu systems where it seems to get stuck, and format some files
bghira authored Jan 18, 2025
2 parents 5c1fbe5 + dfaefab commit 2248fd0
Showing 4 changed files with 45 additions and 13 deletions.
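The thrust of the change is in helpers/data_backend/factory.py: the rank-sensitive steps of text-embed cache setup are bracketed with per-rank debug messages, so a hung multi-GPU run shows which rank is stuck at which barrier. A minimal, self-contained sketch of that pattern follows; the standalone get_rank() helper and DummyEmbedCache are illustrative stand-ins for the project's own helpers, not the real implementations.

```python
import logging

import torch.distributed as dist
from accelerate import Accelerator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def get_rank() -> int:
    # Illustrative stand-in for the project's get_rank() helper:
    # the distributed rank, or 0 when not running distributed.
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0


class DummyEmbedCache:
    # Placeholder standing in for TextEmbeddingCache in this sketch.
    def discover_all_files(self) -> None:
        logger.debug(f"rank {get_rank()} scanned the cache directory")


def discover_text_embeds(accelerator: Accelerator, cache: DummyEmbedCache) -> None:
    # Bracket every rank-sensitive step with per-rank debug lines so a hung
    # multi-GPU run shows which rank stalled at which barrier.
    logger.debug(f"rank {get_rank()} might skip discovery..")
    with accelerator.main_process_first():
        logger.debug(f"rank {get_rank()} is discovering all files")
        cache.discover_all_files()
    logger.debug(f"rank {get_rank()} is waiting for other processes")
    accelerator.wait_for_everyone()
    logger.debug(f"rank {get_rank()} is continuing")


if __name__ == "__main__":
    discover_text_embeds(Accelerator(), DummyEmbedCache())
```

Run under `accelerate launch` with more than one process, the interleaved rank-tagged lines show exactly where a stalled rank stops printing.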
5 changes: 4 additions & 1 deletion helpers/data_backend/aws.py
@@ -106,7 +106,10 @@ def exists(self, s3_key):
 except (NoCredentialsError, PartialCredentialsError) as e:
     raise e  # Raise credential errors to the caller
 except Exception as e:
-    if "An error occurred (404) when calling the HeadObject operation: Not Found" in str(e):
+    if (
+        "An error occurred (404) when calling the HeadObject operation: Not Found"
+        in str(e)
+    ):
         return False
     logger.error(f'Error checking existence of S3 key "{s3_key}": {e}')
     if i == self.read_retry_limit - 1:
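For context, the hunk above is the existence check that treats a 404 from HeadObject as "key missing" rather than a failure, retrying other errors. A hedged standalone sketch of the same idea, assuming a boto3 client; the bucket name and retry count are placeholders, and the real method additionally re-raises credential errors before this generic handler.

```python
import logging

import boto3

logger = logging.getLogger(__name__)

s3_client = boto3.client("s3")
bucket_name = "my-example-bucket"  # placeholder bucket
read_retry_limit = 3  # placeholder for self.read_retry_limit


def s3_key_exists(s3_key: str) -> bool:
    for i in range(read_retry_limit):
        try:
            s3_client.head_object(Bucket=bucket_name, Key=s3_key)
            return True
        except Exception as e:
            # A 404 from HeadObject only means the key is absent, not an error.
            if "404" in str(e) and "Not Found" in str(e):
                return False
            logger.error(f'Error checking existence of S3 key "{s3_key}": {e}')
            if i == read_retry_limit - 1:
                raise
    return False
```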
19 changes: 18 additions & 1 deletion helpers/data_backend/factory.py
@@ -487,6 +487,7 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
     )

     # Generate a TextEmbeddingCache object
+    logger.debug(f"rank {get_rank()} is creating TextEmbeddingCache")
     init_backend["text_embed_cache"] = TextEmbeddingCache(
         id=init_backend["id"],
         data_backend=init_backend["data_backend"],
@@ -497,11 +498,15 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
         model_type=StateTracker.get_model_family(),
         write_batch_size=backend.get("write_batch_size", args.write_batch_size),
     )
+    logger.debug(f"rank {get_rank()} completed creation of TextEmbeddingCache")
     init_backend["text_embed_cache"].set_webhook_handler(
         StateTracker.get_webhook_handler()
     )
+    logger.debug(f"rank {get_rank()} might skip discovery..")
     with accelerator.main_process_first():
+        logger.debug(f"rank {get_rank()} is discovering all files")
         init_backend["text_embed_cache"].discover_all_files()
+    logger.debug(f"rank {get_rank()} is waiting for other processes")
     accelerator.wait_for_everyone()

     if backend.get("default", False):
@@ -510,12 +515,19 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
logger.debug(f"Set the default text embed cache to {init_backend['id']}.")
# We will compute the null embedding for caption dropout here.
info_log("Pre-computing null embedding")
logger.debug(f"rank {get_rank()} may skip computing the embedding..")
with accelerator.main_process_first():
logger.debug(f"rank {get_rank()} is computing the null embed")
init_backend["text_embed_cache"].compute_embeddings_for_prompts(
[""], return_concat=False, load_from_cache=False
)
time.sleep(5)
logger.debug(
f"rank {get_rank()} has completed computing the null embed"
)

logger.debug(f"rank {get_rank()} is waiting for other processes")
accelerator.wait_for_everyone()
logger.debug(f"rank {get_rank()} is continuing")
if args.caption_dropout_probability == 0.0:
logger.warning(
"Not using caption dropout will potentially lead to overfitting on captions, eg. CFG will not work very well. Set --caption_dropout_probability=0.1 as a recommended value."
@@ -1013,6 +1025,11 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
         f"Pre-computing text embeds / updating cache. We have {len(captions)} captions to process, though these will be filtered next."
     )
     logger.debug(f"Data missing captions: {images_missing_captions}")
+    if len(images_missing_captions) > 0 and hasattr(
+        init_backend["metadata_backend"], "remove_images"
+    ):
+        # we'll tell the aspect bucket manager to remove these images.
+        init_backend["metadata_backend"].remove_images(images_missing_captions)
     caption_strategy = backend.get("caption_strategy", args.caption_strategy)
     info_log(
         f"(id={init_backend['id']}) Initialise text embed pre-computation using the {caption_strategy} caption strategy. We have {len(captions)} captions to process."
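The other functional change in the factory is the guard in the last hunk: images whose captions are missing are only removed when the metadata backend actually implements remove_images. A small sketch of that hasattr guard; both backend classes here are invented purely for illustration.

```python
from typing import List


class AspectBucketMetadataBackend:
    """Illustrative backend that supports dropping images from its buckets."""

    def __init__(self) -> None:
        self.images = {"a.png", "b.png", "c.png"}

    def remove_images(self, image_paths: List[str]) -> None:
        # Tell the aspect bucket manager to forget these images.
        self.images.difference_update(image_paths)


class MinimalMetadataBackend:
    """Illustrative backend without remove_images support."""


def drop_uncaptioned(metadata_backend, images_missing_captions: List[str]) -> None:
    # Mirror the factory's guard: only call remove_images when the backend has it.
    if len(images_missing_captions) > 0 and hasattr(metadata_backend, "remove_images"):
        metadata_backend.remove_images(images_missing_captions)


drop_uncaptioned(AspectBucketMetadataBackend(), ["b.png"])  # b.png is dropped
drop_uncaptioned(MinimalMetadataBackend(), ["b.png"])  # silently skipped
```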
27 changes: 19 additions & 8 deletions helpers/training/evaluation.py
@@ -13,9 +13,12 @@
     "clip": "CLIPModelEvaluator",
 }

+
 class ModelEvaluator:
     def __init__(self, pretrained_model_name_or_path):
-        raise NotImplementedError("Subclasses is incomplete, no __init__ method was found.")
+        raise NotImplementedError(
+            "Subclasses is incomplete, no __init__ method was found."
+        )

     def evaluate(self, images, prompts):
         raise NotImplementedError("Subclasses should implement the evaluate() method.")
@@ -25,19 +28,27 @@ def from_config(args):
         """Instantiate a ModelEvaluator from the training config, if set to do so."""
         if not StateTracker.get_accelerator().is_main_process:
             return None
-        if args.evaluation_type is not None and args.evaluation_type.lower() != "" and args.evaluation_type.lower() != "none":
+        if (
+            args.evaluation_type is not None
+            and args.evaluation_type.lower() != ""
+            and args.evaluation_type.lower() != "none"
+        ):
             model_evaluator = model_evaluator_map[args.evaluation_type]
-            return globals()[model_evaluator](args.pretrained_evaluation_model_name_or_path)
+            return globals()[model_evaluator](
+                args.pretrained_evaluation_model_name_or_path
+            )

         return None


 class CLIPModelEvaluator(ModelEvaluator):
-    def __init__(self, pretrained_model_name_or_path='openai/clip-vit-large-patch14-336'):
-        self.clip_score_fn = partial(clip_score, model_name_or_path=pretrained_model_name_or_path)
-        self.preprocess = transforms.Compose([
-            transforms.ToTensor()
-        ])
+    def __init__(
+        self, pretrained_model_name_or_path="openai/clip-vit-large-patch14-336"
+    ):
+        self.clip_score_fn = partial(
+            clip_score, model_name_or_path=pretrained_model_name_or_path
+        )
+        self.preprocess = transforms.Compose([transforms.ToTensor()])

     def evaluate(self, images, prompts):
         # Preprocess images
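The reformatted from_config resolves the evaluator by name: the --evaluation_type value is looked up in model_evaluator_map and the matching class is pulled from the module globals, with None returned when evaluation is disabled (the real method also returns None on non-main processes). A self-contained sketch of that dispatch under those assumptions; the mini CLIPModelEvaluator here is a stand-in, not the torchmetrics-backed original.

```python
from types import SimpleNamespace

model_evaluator_map = {"clip": "CLIPModelEvaluator"}


class CLIPModelEvaluator:
    # Stand-in: only records which checkpoint it would load for CLIP scoring.
    def __init__(
        self, pretrained_model_name_or_path="openai/clip-vit-large-patch14-336"
    ):
        self.model_name = pretrained_model_name_or_path


def evaluator_from_config(args):
    # Mirror ModelEvaluator.from_config: skip when evaluation is unset or "none",
    # otherwise resolve the class named in the map from the module globals.
    if (
        args.evaluation_type is not None
        and args.evaluation_type.lower() != ""
        and args.evaluation_type.lower() != "none"
    ):
        class_name = model_evaluator_map[args.evaluation_type]
        return globals()[class_name](args.pretrained_evaluation_model_name_or_path)
    return None


args = SimpleNamespace(
    evaluation_type="clip",
    pretrained_evaluation_model_name_or_path="openai/clip-vit-large-patch14-336",
)
print(type(evaluator_from_config(args)).__name__)  # -> CLIPModelEvaluator
```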
7 changes: 4 additions & 3 deletions helpers/training/optimizer_param.py
@@ -58,7 +58,7 @@
 # Some optimizers are not available in multibackend bitsandbytes as of January 2025.
 is_ademamix_available = False
 if is_bitsandbytes_available:
-    if 'AdEMAMix' in dir(bitsandbytes.optim):
+    if "AdEMAMix" in dir(bitsandbytes.optim):
         is_ademamix_available = True

 optimizer_choices = {
@@ -395,7 +395,8 @@
 },
 "class": bitsandbytes.optim.PagedLion8bit,
 },
-})
+}
+)

 if is_ademamix_available:
     optimizer_choices.update(
@@ -451,7 +452,7 @@
 "min_8bit_size": 4096,
 },
 "class": bitsandbytes.optim.PagedAdEMAMix8bit,
-}
+},
 }
 )
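The optimizer_param.py hunks are Black formatting around an existing feature-detection pattern: the paged AdEMAMix classes are only registered when the installed bitsandbytes build exposes them. A minimal sketch of that probe; the "ademamix8bit-paged" key and its settings are illustrative, not the real defaults from the table.

```python
# bitsandbytes is optional, and multi-backend builds may ship without some optimizers,
# so probe the module instead of assuming the attribute exists.
try:
    import bitsandbytes
    import bitsandbytes.optim

    is_bitsandbytes_available = True
except ImportError:
    is_bitsandbytes_available = False

is_ademamix_available = False
if is_bitsandbytes_available:
    if "AdEMAMix" in dir(bitsandbytes.optim):
        is_ademamix_available = True

optimizer_choices = {}
if is_ademamix_available:
    optimizer_choices.update(
        {
            # Hypothetical key and trimmed settings for illustration only.
            "ademamix8bit-paged": {
                "default_settings": {"min_8bit_size": 4096},
                "class": bitsandbytes.optim.PagedAdEMAMix8bit,
            },
        }
    )

print(sorted(optimizer_choices) or "AdEMAMix not available in this bitsandbytes build")
```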
