From b94b8d13113b51bc2b275ac143ca192cb07ddacd Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Wed, 27 Nov 2024 07:23:51 -0800 Subject: [PATCH] Update external documentation PiperOrigin-RevId: 700694146 --- README.md | 23 +++++++++++ docs/conf.py | 33 ++-------------- docs/data.md | 71 +--------------------------------- docs/index.md | 52 ++++++++++++++++--------- kauldron/data/__init__.py | 3 -- kauldron/metrics/auto_state.py | 44 +++++++++++---------- 6 files changed, 84 insertions(+), 142 deletions(-) diff --git a/README.md b/README.md index 63db35c0..5c12e807 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,27 @@ [![PyPI version](https://badge.fury.io/py/kauldron.svg)](https://badge.fury.io/py/kauldron) [![Documentation Status](https://readthedocs.org/projects/kauldron/badge/?version=latest)](https://kauldron.readthedocs.io/en/latest/?badge=latest) +Kauldron is a library for training machine learning models, optimized for +**research velocity** and **modularity**. + +**Modularity**: + +* All parts of Kauldron are self-contained, so they can be used independently + outside Kauldron. +* Use any dataset (TFDS, Grain, SeqIO, your custom pipeline), + any (flax) model, any optimizer,... Kauldron provides the + glue that links everything together. +* Everything can be customized and overwritten (e.g. sweep over model + architectures, overwrite any inner layer parameter,...) + +**Research velocity**: + +* Everything should work out of the box. The + [example configs](https://github.com/google-research/kauldron/tree/HEAD/kauldron/examples/mnist_autoencoder.py) + can be used and customized as a starting point. +* Colab-first workflow for easy prototyping and fast iteration +* Polished user experience (integrated XM plots, profiler, + post-mortem debugging on borg, runtime shape checking, and many others...). +[Open an issue](https://github.com/google-research/kauldron/issues). 
+ *This is not an officially supported Google product.* diff --git a/docs/conf.py b/docs/conf.py index 834f2c8a..5be88853 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,39 +23,9 @@ ``` """ -import sys -from unittest import mock - import apitree -# TODO(epot): Delete once `grain` can be imported -sys.modules['grain'] = mock.MagicMock() -sys.modules['grain._src'] = mock.MagicMock() -sys.modules['grain._src.core'] = mock.MagicMock() -sys.modules['grain._src.core.constants'] = mock.MagicMock() -sys.modules['grain._src.tensorflow'] = mock.MagicMock() -sys.modules['grain._src.tensorflow.transforms'] = mock.MagicMock() -sys.modules['grain.tensorflow'] = mock.MagicMock() - -import grain.tensorflow as _mocked_grain # pylint: disable=g-import-not-at-top - - -class _MockedTransform: - pass - - -# Required for inheritance `class MyTransform(grain.MapTransform)` -_mocked_grain.MapTransform = _MockedTransform -_mocked_grain.RandomMapTransform = _MockedTransform - - -# Early failure if kauldron cannot be imported -# Read-the-doc install kauldron not in `-e` edit mode, so should only import -# kauldron after `apitree` import kauldron from the right path. 
-# from kauldron import kd # pylint: disable=g-import-not-at-top - - apitree.make_project( modules=apitree.ModuleInfo( api='kauldron.kd', @@ -64,7 +34,10 @@ class _MockedTransform: ), includes_paths={ 'kauldron/konfig/docs/demo.ipynb': 'konfig.ipynb', + 'kauldron/kontext/README.md': 'kontext.md', + 'kauldron/data/py/README.md': 'data_py.md', 'kauldron/klinen/README.md': 'klinen.md', + 'kauldron/random/README.md': 'random.md', }, globals=globals(), ) diff --git a/docs/data.md b/docs/data.md index 0af665b0..476a1515 100644 --- a/docs/data.md +++ b/docs/data.md @@ -1,4 +1,4 @@ -# Train, eval, randomness +# Data pipelines ## Pipelines options @@ -20,78 +20,9 @@ By default, Kauldron provides two main pipelines implementations: ) ``` -* `tf.data` based: `kd.data.TFDataPipeline` base class which itself implements - multiple sub-classes (see next section). For example: - - ```python - cfg.train_ds = kd.data.Tfds( - # TFDS specific args - name='mnist', - split='train', - shuffle=True, - - # `kd.data.TFDataPipeline` args (common to all TFDataPipeline) - batch_size=256, - transforms=[ - kd.data.Elements(keep=["image"]), - kd.data.ValueRange(key="image", vrange=(0, 1)), - ], - ) - ``` - While it's easy to implement your custom pipeline, please contact us if the existing pipelines do not fit your use-case. -## TFDataPipeline - -The following `tf.data` sources are available: - -* `kd.data.Tfds`: TFDS dataset (note that this requires the dataset to be in - ArrayRecord format) -* `kd.data.TfdsLegacy`: TFDS dataset for datasets not supporting random access - ( e.g. in `tfrecord` format) -* `kd.data.SeqIOTask`: SeqIO task -* `kd.data.SeqIOMixture`: SeqIO mixture -* Your custom `tf.data` pipeline. See: https://kauldron.rtfd.io/en/latest-kmix#implement-your-own - -Additionally, any of those sources dataset can be combined using: - -* `kd.data.SampleFromDatasets`: Sample from a combination of datasets. - -Other sources will be added in the future. 
If your dataset is not yet supported, -please [contact us](https://kauldron.rtfd.io/en/latest-help#bugs-feedback). - -See https://kauldron.rtfd.io/en/latest-kmix for details on how to implement a custom `tf.data` source. - -Example of dataset mixture with nested transforms: - -```python -cfg.train_ds = kd.data.SampleFromDatasets( - datasets=[ - kd.data.Tfds( - name='cifar100', - split='train', - transforms=[ - kd.data.Elements(keep=["image", "label"]), - ], - ), - kd.data.Tfds( - name='imagenet2012', - split='train', - transforms=[ - kd.data.Elements(keep=["image", "label"]), - kd.data.Resize(key='image', height=32, width=32), - ], - ), - ], - seed=0, - batch_size=256, - transforms=[ - kd.data.RandomCrop(shape=(15, 15, None)), - ], -) -``` - ## Transformations Both `kd.data.py.PyGrainPipeline` and `kd.data.TFDataPipeline` can be customized diff --git a/docs/index.md b/docs/index.md index 94b7e72e..a9ac7b0d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,29 +1,43 @@ ```{include} ../README.md ``` -```{toctree} -:hidden: -:caption: Guides - -train -eval -checkpoint -konfig -klinen -``` +```{eval-rst} +.. toctree:: + :hidden: + :caption: Guides -```{toctree} -:hidden: -:caption: Links + intro + eval + sharding + checkpoint + data + + +.. toctree:: + :hidden: + :caption: Modules + + konfig + kontext + data_py + metrics + klinen + random + + +.. toctree:: + :hidden: + :caption: Links + + GitHub + Issues -GitHub -``` -```{toctree} -:hidden: -:caption: API +.. toctree:: + :hidden: + :caption: API -api/kd/index + api/kd/index ```