Merge pull request #1283 from bghira/debug/multigpu-init-stuck-logging
add debug logging for factory initialisation in multigpu systems where it seems to get stuck, and format some files
bghira authored Jan 18, 2025
2 parents 5c1fbe5 + dfaefab commit 2248fd0
Showing 4 changed files with 45 additions and 13 deletions.
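The thrust of the change is in helpers/data_backend/factory.py: the rank-sensitive steps of text-embed cache setup are bracketed with per-rank debug messages, so a hung multi-GPU run shows which rank is stuck at which barrier. A minimal, self-contained sketch of that pattern follows; the standalone get_rank() helper and DummyEmbedCache are illustrative stand-ins for the project's own helpers, not the real implementations.

```python
import logging

import torch.distributed as dist
from accelerate import Accelerator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def get_rank() -> int:
    # Illustrative stand-in for the project's get_rank() helper:
    # the distributed rank, or 0 when not running distributed.
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0


class DummyEmbedCache:
    # Placeholder standing in for TextEmbeddingCache in this sketch.
    def discover_all_files(self) -> None:
        logger.debug(f"rank {get_rank()} scanned the cache directory")


def discover_text_embeds(accelerator: Accelerator, cache: DummyEmbedCache) -> None:
    # Bracket every rank-sensitive step with per-rank debug lines so a hung
    # multi-GPU run shows which rank stalled at which barrier.
    logger.debug(f"rank {get_rank()} might skip discovery..")
    with accelerator.main_process_first():
        logger.debug(f"rank {get_rank()} is discovering all files")
        cache.discover_all_files()
    logger.debug(f"rank {get_rank()} is waiting for other processes")
    accelerator.wait_for_everyone()
    logger.debug(f"rank {get_rank()} is continuing")


if __name__ == "__main__":
    discover_text_embeds(Accelerator(), DummyEmbedCache())
```

Run under `accelerate launch` with more than one process, the interleaved rank-tagged lines show exactly where a stalled rank stops printing.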
5 changes: 4 additions & 1 deletion helpers/data_backend/aws.py
@@ -106,7 +106,10 @@ def exists(self, s3_key):
 except (NoCredentialsError, PartialCredentialsError) as e:
     raise e  # Raise credential errors to the caller
 except Exception as e:
-    if "An error occurred (404) when calling the HeadObject operation: Not Found" in str(e):
+    if (
+        "An error occurred (404) when calling the HeadObject operation: Not Found"
+        in str(e)
+    ):
         return False
     logger.error(f'Error checking existence of S3 key "{s3_key}": {e}')
     if i == self.read_retry_limit - 1:
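For context, the hunk above is the existence check that treats a 404 from HeadObject as "key missing" rather than a failure, retrying other errors. A hedged standalone sketch of the same idea, assuming a boto3 client; the bucket name and retry count are placeholders, and the real method additionally re-raises credential errors before this generic handler.

```python
import logging

import boto3

logger = logging.getLogger(__name__)

s3_client = boto3.client("s3")
bucket_name = "my-example-bucket"  # placeholder bucket
read_retry_limit = 3  # placeholder for self.read_retry_limit


def s3_key_exists(s3_key: str) -> bool:
    for i in range(read_retry_limit):
        try:
            s3_client.head_object(Bucket=bucket_name, Key=s3_key)
            return True
        except Exception as e:
            # A 404 from HeadObject only means the key is absent, not an error.
            if "404" in str(e) and "Not Found" in str(e):
                return False
            logger.error(f'Error checking existence of S3 key "{s3_key}": {e}')
            if i == read_retry_limit - 1:
                raise
    return False
```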
19 changes: 18 additions & 1 deletion helpers/data_backend/factory.py
@@ -487,6 +487,7 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
     )

     # Generate a TextEmbeddingCache object
+    logger.debug(f"rank {get_rank()} is creating TextEmbeddingCache")
     init_backend["text_embed_cache"] = TextEmbeddingCache(
         id=init_backend["id"],
         data_backend=init_backend["data_backend"],
@@ -497,11 +498,15 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
         model_type=StateTracker.get_model_family(),
         write_batch_size=backend.get("write_batch_size", args.write_batch_size),
     )
+    logger.debug(f"rank {get_rank()} completed creation of TextEmbeddingCache")
     init_backend["text_embed_cache"].set_webhook_handler(
         StateTracker.get_webhook_handler()
     )
+    logger.debug(f"rank {get_rank()} might skip discovery..")
     with accelerator.main_process_first():
+        logger.debug(f"rank {get_rank()} is discovering all files")
         init_backend["text_embed_cache"].discover_all_files()
+    logger.debug(f"rank {get_rank()} is waiting for other processes")
     accelerator.wait_for_everyone()

     if backend.get("default", False):
@@ -510,12 +515,19 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
logger.debug(f"Set the default text embed cache to {init_backend['id']}.")
# We will compute the null embedding for caption dropout here.
info_log("Pre-computing null embedding")
logger.debug(f"rank {get_rank()} may skip computing the embedding..")
with accelerator.main_process_first():
logger.debug(f"rank {get_rank()} is computing the null embed")
init_backend["text_embed_cache"].compute_embeddings_for_prompts(
[""], return_concat=False, load_from_cache=False
)
time.sleep(5)
logger.debug(
f"rank {get_rank()} has completed computing the null embed"
)

logger.debug(f"rank {get_rank()} is waiting for other processes")
accelerator.wait_for_everyone()
logger.debug(f"rank {get_rank()} is continuing")
if args.caption_dropout_probability == 0.0:
logger.warning(
"Not using caption dropout will potentially lead to overfitting on captions, eg. CFG will not work very well. Set --caption_dropout_probability=0.1 as a recommended value."
@@ -1013,6 +1025,11 @@ def configure_multi_databackend(args: dict, accelerator, text_encoders, tokenize
         f"Pre-computing text embeds / updating cache. We have {len(captions)} captions to process, though these will be filtered next."
     )
     logger.debug(f"Data missing captions: {images_missing_captions}")
+    if len(images_missing_captions) > 0 and hasattr(
+        init_backend["metadata_backend"], "remove_images"
+    ):
+        # we'll tell the aspect bucket manager to remove these images.
+        init_backend["metadata_backend"].remove_images(images_missing_captions)
     caption_strategy = backend.get("caption_strategy", args.caption_strategy)
     info_log(
         f"(id={init_backend['id']}) Initialise text embed pre-computation using the {caption_strategy} caption strategy. We have {len(captions)} captions to process."
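The other functional change in the factory is the guard in the last hunk: images whose captions are missing are only removed when the metadata backend actually implements remove_images. A small sketch of that hasattr guard; both backend classes here are invented purely for illustration.

```python
from typing import List


class AspectBucketMetadataBackend:
    """Illustrative backend that supports dropping images from its buckets."""

    def __init__(self) -> None:
        self.images = {"a.png", "b.png", "c.png"}

    def remove_images(self, image_paths: List[str]) -> None:
        # Tell the aspect bucket manager to forget these images.
        self.images.difference_update(image_paths)


class MinimalMetadataBackend:
    """Illustrative backend without remove_images support."""


def drop_uncaptioned(metadata_backend, images_missing_captions: List[str]) -> None:
    # Mirror the factory's guard: only call remove_images when the backend has it.
    if len(images_missing_captions) > 0 and hasattr(metadata_backend, "remove_images"):
        metadata_backend.remove_images(images_missing_captions)


drop_uncaptioned(AspectBucketMetadataBackend(), ["b.png"])  # b.png is dropped
drop_uncaptioned(MinimalMetadataBackend(), ["b.png"])  # silently skipped
```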
27 changes: 19 additions & 8 deletions helpers/training/evaluation.py
@@ -13,9 +13,12 @@
     "clip": "CLIPModelEvaluator",
 }

+
 class ModelEvaluator:
     def __init__(self, pretrained_model_name_or_path):
-        raise NotImplementedError("Subclasses is incomplete, no __init__ method was found.")
+        raise NotImplementedError(
+            "Subclasses is incomplete, no __init__ method was found."
+        )

     def evaluate(self, images, prompts):
         raise NotImplementedError("Subclasses should implement the evaluate() method.")
@@ -25,19 +28,27 @@ def from_config(args):
         """Instantiate a ModelEvaluator from the training config, if set to do so."""
         if not StateTracker.get_accelerator().is_main_process:
             return None
-        if args.evaluation_type is not None and args.evaluation_type.lower() != "" and args.evaluation_type.lower() != "none":
+        if (
+            args.evaluation_type is not None
+            and args.evaluation_type.lower() != ""
+            and args.evaluation_type.lower() != "none"
+        ):
             model_evaluator = model_evaluator_map[args.evaluation_type]
-            return globals()[model_evaluator](args.pretrained_evaluation_model_name_or_path)
+            return globals()[model_evaluator](
+                args.pretrained_evaluation_model_name_or_path
+            )

         return None


 class CLIPModelEvaluator(ModelEvaluator):
-    def __init__(self, pretrained_model_name_or_path='openai/clip-vit-large-patch14-336'):
-        self.clip_score_fn = partial(clip_score, model_name_or_path=pretrained_model_name_or_path)
-        self.preprocess = transforms.Compose([
-            transforms.ToTensor()
-        ])
+    def __init__(
+        self, pretrained_model_name_or_path="openai/clip-vit-large-patch14-336"
+    ):
+        self.clip_score_fn = partial(
+            clip_score, model_name_or_path=pretrained_model_name_or_path
+        )
+        self.preprocess = transforms.Compose([transforms.ToTensor()])

     def evaluate(self, images, prompts):
         # Preprocess images
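The reformatted from_config resolves the evaluator by name: the --evaluation_type value is looked up in model_evaluator_map and the matching class is pulled from the module globals, with None returned when evaluation is disabled (the real method also returns None on non-main processes). A self-contained sketch of that dispatch under those assumptions; the mini CLIPModelEvaluator here is a stand-in, not the torchmetrics-backed original.

```python
from types import SimpleNamespace

model_evaluator_map = {"clip": "CLIPModelEvaluator"}


class CLIPModelEvaluator:
    # Stand-in: only records which checkpoint it would load for CLIP scoring.
    def __init__(
        self, pretrained_model_name_or_path="openai/clip-vit-large-patch14-336"
    ):
        self.model_name = pretrained_model_name_or_path


def evaluator_from_config(args):
    # Mirror ModelEvaluator.from_config: skip when evaluation is unset or "none",
    # otherwise resolve the class named in the map from the module globals.
    if (
        args.evaluation_type is not None
        and args.evaluation_type.lower() != ""
        and args.evaluation_type.lower() != "none"
    ):
        class_name = model_evaluator_map[args.evaluation_type]
        return globals()[class_name](args.pretrained_evaluation_model_name_or_path)
    return None


args = SimpleNamespace(
    evaluation_type="clip",
    pretrained_evaluation_model_name_or_path="openai/clip-vit-large-patch14-336",
)
print(type(evaluator_from_config(args)).__name__)  # -> CLIPModelEvaluator
```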
7 changes: 4 additions & 3 deletions helpers/training/optimizer_param.py
@@ -58,7 +58,7 @@
 # Some optimizers are not available in multibackend bitsandbytes as of January 2025.
 is_ademamix_available = False
 if is_bitsandbytes_available:
-    if 'AdEMAMix' in dir(bitsandbytes.optim):
+    if "AdEMAMix" in dir(bitsandbytes.optim):
         is_ademamix_available = True

 optimizer_choices = {
@@ -395,7 +395,8 @@
 },
 "class": bitsandbytes.optim.PagedLion8bit,
 },
-})
+}
+)

 if is_ademamix_available:
     optimizer_choices.update(
@@ -451,7 +452,7 @@
 "min_8bit_size": 4096,
 },
 "class": bitsandbytes.optim.PagedAdEMAMix8bit,
-}
+},
 }
 )
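The optimizer_param.py hunks are Black formatting around an existing feature-detection pattern: the paged AdEMAMix classes are only registered when the installed bitsandbytes build exposes them. A minimal sketch of that probe; the "ademamix8bit-paged" key and its settings are illustrative, not the real defaults from the table.

```python
# bitsandbytes is optional, and multi-backend builds may ship without some optimizers,
# so probe the module instead of assuming the attribute exists.
try:
    import bitsandbytes
    import bitsandbytes.optim

    is_bitsandbytes_available = True
except ImportError:
    is_bitsandbytes_available = False

is_ademamix_available = False
if is_bitsandbytes_available:
    if "AdEMAMix" in dir(bitsandbytes.optim):
        is_ademamix_available = True

optimizer_choices = {}
if is_ademamix_available:
    optimizer_choices.update(
        {
            # Hypothetical key and trimmed settings for illustration only.
            "ademamix8bit-paged": {
                "default_settings": {"min_8bit_size": 4096},
                "class": bitsandbytes.optim.PagedAdEMAMix8bit,
            },
        }
    )

print(sorted(optimizer_choices) or "AdEMAMix not available in this bitsandbytes build")
```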
