Remove in-house FSDP and ShardedFlatParameter #25

Merged
3 commits merged on Jun 12, 2024
10 changes: 1 addition & 9 deletions .github/workflows/main.yml
@@ -37,17 +37,12 @@ jobs:
- name: Test
run: |
pytest -v --color=yes --durations=3 src/test/ \
--ignore-glob='src/test/distributed/fsdp*' \
--ignore-glob='src/test/distributed/checkpoint*'

- name: Test checkpoint
run: |
pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

- name: Test FSDP
run: |
pytest -v --color=yes --durations=3 src/test/distributed/fsdp/

- name: Type check
run: make type-check

@@ -113,13 +108,10 @@ jobs:
matrix:
task:
- name: Test (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/fsdp*' --ignore-glob='src/test/distributed/checkpoint*'
run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/checkpoint*'

- name: Test checkpoint (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/checkpoint*

- name: Test FSDP (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/fsdp/
steps:
- name: Determine current commit SHA (pull request)
if: github.event_name == 'pull_request'
6 changes: 0 additions & 6 deletions docs/source/distributed/fsdp.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/index.rst
@@ -23,7 +23,6 @@ To get started, install OLMo-core from PyPI with:
io.rst
utils.rst
distributed/checkpoint.rst
distributed/fsdp.rst
distributed/tensors.rst

.. toctree::
22 changes: 4 additions & 18 deletions src/benchmarks/fsdp/common.py
@@ -151,10 +151,9 @@ def build_components(
config: TransformerConfig,
batch_size: int,
num_batches: int = 100,
fsdp_wrapper: Literal["torch", "olmo_core", "ddp"] = "olmo_core",
model_wrapper: Literal["fsdp", "ddp"] = "fsdp",
wrap_blocks: bool = True,
mixed_precision: bool = True,
max_prefetch_count: int = 1,
learning_rate: float = 1e-4,
seed: int = 4634534,
) -> Tuple[nn.Module, torch.optim.Optimizer, Dataloader]:
@@ -163,20 +162,7 @@
model = Transformer(config)

print_rank0("Wrapping model...")
if fsdp_wrapper == "olmo_core":
from olmo_core.distributed.fsdp import FSDP, FSDPPrecision

model = FSDP.auto_wrap(
model,
[nn.TransformerEncoderLayer] if wrap_blocks else [],
precision=FSDPPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
if mixed_precision
else None,
max_prefetch_count=max_prefetch_count,
)

model.apply(init_function)
elif fsdp_wrapper == "torch":
if model_wrapper == "fsdp":
from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision

def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
@@ -204,13 +190,13 @@ def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
)

model.apply(init_function) # just in case
elif fsdp_wrapper == "ddp":
elif model_wrapper == "ddp":
from torch.nn.parallel import DistributedDataParallel as DDP

model = DDP(model.cuda(), device_ids=[dist.get_rank()])
model.apply(init_function)
else:
raise NotImplementedError(fsdp_wrapper)
raise NotImplementedError(model_wrapper)

model.train()
print_rank0(model)
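
For reference, the torch-native wrapping path that remains after this change looks roughly like the sketch below. It is a minimal illustration, not the benchmark's exact code: the TransformerEncoderLayer-based policy and the bf16/fp32 mixed-precision settings follow the hunk above, while the helper name wrap_with_torch_fsdp and the device handling are assumptions, and it presumes torch.distributed is already initialized with a GPU available.

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision


def wrap_with_torch_fsdp(model: nn.Module, mixed_precision: bool = True) -> nn.Module:
    # Sketch only: assumes a process group is already initialized (e.g. via torchrun).
    def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
        # Keep recursing into children; wrap each transformer block in its own FSDP unit.
        if recurse:
            return True
        return isinstance(module, nn.TransformerEncoderLayer)

    return FullyShardedDataParallel(
        model,
        auto_wrap_policy=auto_wrap_policy,
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
        if mixed_precision
        else None,
        device_id=torch.cuda.current_device(),
        use_orig_params=True,
    )
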
197 changes: 0 additions & 197 deletions src/benchmarks/fsdp/test.py

This file was deleted.

12 changes: 6 additions & 6 deletions src/benchmarks/fsdp/train.py
@@ -29,7 +29,7 @@ def main(
config: TransformerConfig,
batch_size: int,
num_batches: int = 100,
fsdp_wrapper: Literal["torch", "olmo_core", "ddp"] = "olmo_core",
model_wrapper: Literal["fsdp", "ddp"] = "fsdp",
dry_run: bool = False,
save_path: Optional[str] = None,
load_path: Optional[str] = None,
@@ -43,7 +43,7 @@
config,
batch_size,
num_batches=num_batches,
fsdp_wrapper=fsdp_wrapper,
model_wrapper=model_wrapper,
mixed_precision=mixed_precision,
**kwargs,
)
@@ -140,10 +140,10 @@ def on_trace_ready(p):


if __name__ == "__main__":
parser = argparse.ArgumentParser(prog="train.py", description="Train an FSDP model")
parser = argparse.ArgumentParser(prog="train.py", description="Train a distributed model")
parser.add_argument(
"--fsdp",
choices=["torch", "olmo_core", "ddp"],
"--wrapper",
choices=["fsdp", "ddp"],
default="olmo_core",
help="""The FSDP implementation.""",
)
@@ -241,7 +241,7 @@ def on_trace_ready(p):
config,
args.batch_size,
num_batches=args.num_batches,
fsdp_wrapper=args.fsdp,
model_wrapper=args.fsdp,
dry_run=args.dry_run,
save_path=args.save_path,
load_path=args.load_path,
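
As a usage note, the renamed flag wires into main() roughly as sketched below; the flag name and choices follow the hunk above, while the default shown here is an assumption made for the sketch.

import argparse

parser = argparse.ArgumentParser(prog="train.py", description="Train a distributed model")
parser.add_argument(
    "--wrapper",
    choices=["fsdp", "ddp"],
    default="fsdp",  # assumed default for this sketch
    help="The model wrapper implementation.",
)
args = parser.parse_args()
# The parsed choice is then forwarded to main() as the `model_wrapper` keyword argument.
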
2 changes: 1 addition & 1 deletion src/olmo_core/distributed/checkpoint.py
@@ -7,7 +7,7 @@
Features
--------

- Sharded distributed models, such as OLMo-core's :class:`~olmo_core.distributed.fsdp.FSDP` or PyTorch's
- Sharded distributed models, such as PyTorch's
:class:`~torch.distributed.fsdp.FullyShardedDataParallel` (with ``use_orig_params=True``)
are supported out-of-the-box.
- Utilizes `safetensors <https://huggingface.co/docs/safetensors/>`_ under the hood for fast, efficient, and
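
To illustrate the docstring's claim, a minimal sketch of the supported setup is shown below, assuming torch.distributed is already initialized; the commented checkpoint helper names are assumptions and are not verified against this module's API.

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Wrap with use_orig_params=True so checkpointing sees the original parameter names.
model = FSDP(nn.Linear(16, 16).cuda(), use_orig_params=True)

# Hypothetical usage with this module (helper names are assumptions):
# from olmo_core.distributed import checkpoint
# checkpoint.save_model_and_optim_state("/tmp/ckpt", model, optim)
# checkpoint.load_model_and_optim_state("/tmp/ckpt", model, optim)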