Remove in-house FSDP and ShardedFlatParameter #25

Merged
3 commits merged on Jun 12, 2024
10 changes: 1 addition & 9 deletions .github/workflows/main.yml
@@ -37,17 +37,12 @@ jobs:
- name: Test
run: |
pytest -v --color=yes --durations=3 src/test/ \
--ignore-glob='src/test/distributed/fsdp*' \
--ignore-glob='src/test/distributed/checkpoint*'

- name: Test checkpoint
run: |
pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

- name: Test FSDP
run: |
pytest -v --color=yes --durations=3 src/test/distributed/fsdp/

- name: Type check
run: make type-check

@@ -113,13 +108,10 @@ jobs:
matrix:
task:
- name: Test (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/fsdp*' --ignore-glob='src/test/distributed/checkpoint*'
run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/checkpoint*'

- name: Test checkpoint (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/checkpoint*

- name: Test FSDP (GPU)
run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/fsdp/
steps:
- name: Determine current commit SHA (pull request)
if: github.event_name == 'pull_request'
6 changes: 0 additions & 6 deletions docs/source/distributed/fsdp.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/index.rst
@@ -23,7 +23,6 @@ To get started, install OLMo-core from PyPI with:
io.rst
utils.rst
distributed/checkpoint.rst
distributed/fsdp.rst
distributed/tensors.rst

.. toctree::
22 changes: 4 additions & 18 deletions src/benchmarks/fsdp/common.py
@@ -151,10 +151,9 @@ def build_components(
config: TransformerConfig,
batch_size: int,
num_batches: int = 100,
fsdp_wrapper: Literal["torch", "olmo_core", "ddp"] = "olmo_core",
model_wrapper: Literal["fsdp", "ddp"] = "fsdp",
wrap_blocks: bool = True,
mixed_precision: bool = True,
max_prefetch_count: int = 1,
learning_rate: float = 1e-4,
seed: int = 4634534,
) -> Tuple[nn.Module, torch.optim.Optimizer, Dataloader]:
@@ -163,20 +162,7 @@
model = Transformer(config)

print_rank0("Wrapping model...")
if fsdp_wrapper == "olmo_core":
from olmo_core.distributed.fsdp import FSDP, FSDPPrecision

model = FSDP.auto_wrap(
model,
[nn.TransformerEncoderLayer] if wrap_blocks else [],
precision=FSDPPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
if mixed_precision
else None,
max_prefetch_count=max_prefetch_count,
)

model.apply(init_function)
elif fsdp_wrapper == "torch":
if model_wrapper == "fsdp":
from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision

def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
@@ -204,13 +190,13 @@ def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
)

model.apply(init_function) # just in case
elif fsdp_wrapper == "ddp":
elif model_wrapper == "ddp":
from torch.nn.parallel import DistributedDataParallel as DDP

model = DDP(model.cuda(), device_ids=[dist.get_rank()])
model.apply(init_function)
else:
raise NotImplementedError(fsdp_wrapper)
raise NotImplementedError(model_wrapper)

model.train()
print_rank0(model)
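
For reference, the torch-native wrapping path that remains after this change looks roughly like the sketch below. It is a minimal illustration, not the benchmark's exact code: the TransformerEncoderLayer-based policy and the bf16/fp32 mixed-precision settings follow the hunk above, while the helper name wrap_with_torch_fsdp and the device handling are assumptions, and it presumes torch.distributed is already initialized with a GPU available.

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision


def wrap_with_torch_fsdp(model: nn.Module, mixed_precision: bool = True) -> nn.Module:
    # Sketch only: assumes a process group is already initialized (e.g. via torchrun).
    def auto_wrap_policy(module: nn.Module, recurse: bool, *args, **kwargs) -> bool:
        # Keep recursing into children; wrap each transformer block in its own FSDP unit.
        if recurse:
            return True
        return isinstance(module, nn.TransformerEncoderLayer)

    return FullyShardedDataParallel(
        model,
        auto_wrap_policy=auto_wrap_policy,
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
        if mixed_precision
        else None,
        device_id=torch.cuda.current_device(),
        use_orig_params=True,
    )
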
197 changes: 0 additions & 197 deletions src/benchmarks/fsdp/test.py

This file was deleted.

12 changes: 6 additions & 6 deletions src/benchmarks/fsdp/train.py
@@ -29,7 +29,7 @@ def main(
config: TransformerConfig,
batch_size: int,
num_batches: int = 100,
fsdp_wrapper: Literal["torch", "olmo_core", "ddp"] = "olmo_core",
model_wrapper: Literal["fsdp", "ddp"] = "fsdp",
dry_run: bool = False,
save_path: Optional[str] = None,
load_path: Optional[str] = None,
@@ -43,7 +43,7 @@
config,
batch_size,
num_batches=num_batches,
fsdp_wrapper=fsdp_wrapper,
model_wrapper=model_wrapper,
mixed_precision=mixed_precision,
**kwargs,
)
@@ -140,10 +140,10 @@ def on_trace_ready(p):


if __name__ == "__main__":
parser = argparse.ArgumentParser(prog="train.py", description="Train an FSDP model")
parser = argparse.ArgumentParser(prog="train.py", description="Train a distributed model")
parser.add_argument(
"--fsdp",
choices=["torch", "olmo_core", "ddp"],
"--wrapper",
choices=["fsdp", "ddp"],
default="olmo_core",
help="""The FSDP implementation.""",
)
@@ -241,7 +241,7 @@ def on_trace_ready(p):
config,
args.batch_size,
num_batches=args.num_batches,
fsdp_wrapper=args.fsdp,
model_wrapper=args.fsdp,
dry_run=args.dry_run,
save_path=args.save_path,
load_path=args.load_path,
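
As a usage note, the renamed flag wires into main() roughly as sketched below; the flag name and choices follow the hunk above, while the default shown here is an assumption made for the sketch.

import argparse

parser = argparse.ArgumentParser(prog="train.py", description="Train a distributed model")
parser.add_argument(
    "--wrapper",
    choices=["fsdp", "ddp"],
    default="fsdp",  # assumed default for this sketch
    help="The model wrapper implementation.",
)
args = parser.parse_args()
# The parsed choice is then forwarded to main() as the `model_wrapper` keyword argument.
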
2 changes: 1 addition & 1 deletion src/olmo_core/distributed/checkpoint.py
@@ -7,7 +7,7 @@
Features
--------

- Sharded distributed models, such as OLMo-core's :class:`~olmo_core.distributed.fsdp.FSDP` or PyTorch's
- Sharded distributed models, such as PyTorch's
:class:`~torch.distributed.fsdp.FullyShardedDataParallel` (with ``use_orig_params=True``)
are supported out-of-the-box.
- Utilizes `safetensors <https://huggingface.co/docs/safetensors/>`_ under the hood for fast, efficient, and
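
To illustrate the docstring's claim, a minimal sketch of the supported setup is shown below, assuming torch.distributed is already initialized; the commented checkpoint helper names are assumptions and are not verified against this module's API.

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Wrap with use_orig_params=True so checkpointing sees the original parameter names.
model = FSDP(nn.Linear(16, 16).cuda(), use_orig_params=True)

# Hypothetical usage with this module (helper names are assumptions):
# from olmo_core.distributed import checkpoint
# checkpoint.save_model_and_optim_state("/tmp/ckpt", model, optim)
# checkpoint.load_model_and_optim_state("/tmp/ckpt", model, optim)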