Refactor(Optimizers): Use ask() instead of two-stage `load_optimization_state()` and `get_config_and_ids()` (#146)
eddiebergman authored Oct 9, 2024
1 parent 5ed2bf3 commit 38b91d5
Showing 61 changed files with 918 additions and 1,607 deletions.
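
The core of this refactor is an interface change in neps/optimizers/base_optimizer.py (diffed below): the two-stage protocol of `load_optimization_state()` followed by `get_config_and_ids()` is replaced by a single `ask()` method that receives every known trial and returns a `SampledConfig`. As a rough, non-authoritative sketch condensed from the signatures visible in this commit (bodies elided, class names invented for illustration):

```python
# Sketch only -- condensed from the signatures shown in the diffs below.
from collections.abc import Mapping
from typing import Any


class OldStyleOptimizer:
    # Before: the runtime loaded optimizer state and asked for a config in two steps.
    def load_optimization_state(
        self, previous_results, pending_evaluations, budget_info, optimizer_state
    ) -> None: ...

    def get_config_and_ids(self) -> tuple[Any, str, str | None]: ...


class NewStyleOptimizer:
    # After: a single ask() that sees all known trials and returns a
    # SampledConfig (id, config, previous_config_id).
    def ask(
        self, trials: Mapping[str, Any], budget_info, optimizer_state: dict[str, Any]
    ): ...
```

A concrete (hypothetical) subclass sketch is included after the base_optimizer.py diff further down.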
7 changes: 4 additions & 3 deletions .pre-commit-config.yaml
@@ -7,7 +7,7 @@ files: |
)/.*\.py$
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: check-added-large-files
files: ".*"
@@ -36,13 +36,14 @@ repos:
)/.*\.py$
additional_dependencies:
- "types-pyyaml"
- "types-requests"
args:
- "--no-warn-return-any" # Disable this because it doesn't know about 3rd party imports
- "--ignore-missing-imports"
- "--show-traceback"

- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.29.2
rev: 0.29.3
hooks:
- id: check-github-workflows
files: '^github/workflows/.*\.ya?ml$'
@@ -51,7 +52,7 @@ repos:
files: '^\.github/dependabot\.ya?ml$'

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.6.5
rev: v0.6.9
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -174,7 +174,7 @@ traceback and the environment in which you are running, i.e. python version, OS,

Regression tests are run on each push to the repository to assure the performance of the optimizers don't degrade.

Currently, regression runs are recorded on JAHS-Bench-201 data for 2 tasks: `cifar10` and `fashion_mnist` and only for optimizers: `random_search`, `bayesian_optimization`, `mf_bayesian_optimization`, `regularized_evolution`.
Currently, regression runs are recorded on JAHS-Bench-201 data for 2 tasks: `cifar10` and `fashion_mnist` and only for optimizers: `random_search`, `bayesian_optimization`, `mf_bayesian_optimization`.
This information is stored in the `tests/regression_runner.py` as two lists: `TASKS`, `OPTIMIZERS`.
The recorded results are stored as a json dictionary in the `tests/losses.json` file.
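
For illustration, the two lists described above would have roughly the following shape after this commit (a hypothetical sketch, not the actual contents of `tests/regression_runner.py`):

```python
# Hypothetical shape of the lists described in CONTRIBUTING.md above;
# the real values live in tests/regression_runner.py.
TASKS = ["cifar10", "fashion_mnist"]
OPTIMIZERS = ["random_search", "bayesian_optimization", "mf_bayesian_optimization"]
```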

2 changes: 0 additions & 2 deletions neps/api.py
@@ -53,7 +53,6 @@ def run(
"priorband",
"mobster",
"asha",
"regularized_evolution",
]
| BaseOptimizer
| Path
@@ -278,7 +277,6 @@ def _run_args(
"priorband",
"mobster",
"asha",
"regularized_evolution",
]
| BaseOptimizer
) = "default",
30 changes: 13 additions & 17 deletions neps/optimizers/__init__.py
@@ -1,35 +1,31 @@
from collections.abc import Callable, Mapping
from functools import partial
from typing import TYPE_CHECKING

from .base_optimizer import BaseOptimizer
from .bayesian_optimization.optimizer import BayesianOptimization
from .grid_search.optimizer import GridSearch
from .multi_fidelity.hyperband import (
from neps.optimizers.base_optimizer import BaseOptimizer
from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization
from neps.optimizers.grid_search.optimizer import GridSearch
from neps.optimizers.multi_fidelity import (
IFBO,
MOBSTER,
AsynchronousHyperband,
Hyperband,
HyperbandCustomDefault,
)
from .multi_fidelity.ifbo import IFBO
from .multi_fidelity.successive_halving import (
AsynchronousSuccessiveHalving,
AsynchronousSuccessiveHalvingWithPriors,
Hyperband,
HyperbandCustomDefault,
SuccessiveHalving,
SuccessiveHalvingWithPriors,
)
from .multi_fidelity_prior.async_priorband import PriorBandAsha, PriorBandAshaHB
from .multi_fidelity_prior.priorband import PriorBand
from .random_search.optimizer import RandomSearch
from .regularized_evolution.optimizer import RegularizedEvolution
from neps.optimizers.multi_fidelity_prior import (
PriorBand,
PriorBandAsha,
PriorBandAshaHB,
)
from neps.optimizers.random_search.optimizer import RandomSearch

# TODO: Rename Searcher to Optimizer...
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = {
"bayesian_optimization": partial(BayesianOptimization, use_priors=False),
"pibo": partial(BayesianOptimization, use_priors=True),
"random_search": RandomSearch,
"regularized_evolution": RegularizedEvolution,
"assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True),
"grid_search": GridSearch,
"successive_halving": SuccessiveHalving,
"successive_halving_prior": SuccessiveHalvingWithPriors,
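
`SearcherMapping` above is a name-to-constructor registry. As a hedged illustration of how such a registry is typically consumed (the actual NePS call site may differ; `build_optimizer` is a hypothetical helper, not part of this commit):

```python
from neps.optimizers import SearcherMapping


def build_optimizer(name: str, pipeline_space, **kwargs):
    """Hypothetical helper: look up a searcher by name and construct it."""
    try:
        constructor = SearcherMapping[name]  # a class or functools.partial
    except KeyError as e:
        raise ValueError(
            f"Unknown searcher {name!r}; available: {sorted(SearcherMapping)}"
        ) from e
    # BaseOptimizer subclasses take the search space as a keyword-only argument.
    return constructor(pipeline_space=pipeline_space, **kwargs)
```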
108 changes: 18 additions & 90 deletions neps/optimizers/base_optimizer.py
@@ -3,38 +3,42 @@
import logging
from abc import abstractmethod
from collections.abc import Mapping
from dataclasses import asdict, dataclass
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from neps.state.trial import Report, Trial
from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss
from neps.utils.types import ERROR, ConfigResult, RawConfig, ResultDict

if TYPE_CHECKING:
from neps.search_spaces.search_space import SearchSpace
from neps.state.optimizer import BudgetInfo
from neps.utils.types import ERROR, ResultDict


@dataclass
class SampledConfig:
id: Trial.ID
id: str
config: Mapping[str, Any]
previous_config_id: Trial.ID | None = None
previous_config_id: str | None = None


class BaseOptimizer:
"""Base sampler class. Implements all the low-level work."""

# TODO: Remove a lot of these init params
# Ideally we just make this a `Protocol`, i.e. an interface
# and it has no functionality
def __init__(
self,
*,
pipeline_space: SearchSpace,
patience: int = 50,
logger: logging.Logger | None = None,
budget: int | float | None = None,
loss_value_on_error: float | None = None,
cost_value_on_error: float | None = None,
learning_curve_on_error: float | list[float] | None = None,
ignore_errors=False,
ignore_errors: bool = False,
) -> None:
if patience < 1:
raise ValueError("Patience should be at least 1")
@@ -50,107 +54,31 @@ def __init__(
self.ignore_errors = ignore_errors

@abstractmethod
def load_optimization_state(
self,
previous_results: dict[str, ConfigResult],
pending_evaluations: dict[str, SearchSpace],
budget_info: BudgetInfo | None,
optimizer_state: dict[str, Any],
) -> None:
raise NotImplementedError

@abstractmethod
def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]:
"""Sample a new configuration.
Returns:
config: serializable object representing the configuration
config_id: unique identifier for the configuration
previous_config_id: if provided, id of a previous on which this
configuration is based
"""
raise NotImplementedError

def ask(
self,
trials: Mapping[str, Trial],
budget_info: BudgetInfo | None,
optimizer_state: dict[str, Any],
) -> SampledConfig | tuple[SampledConfig, dict[str, Any]]:
) -> SampledConfig:
"""Sample a new configuration.
!!! note
The plan is this method replaces the two-step procedure of `load_optimization_state`
and `get_config_and_ids` in the future, replacing both with a single method `ask`
which would be easier for developer of NePS optimizers to implement.
!!! note
The `optimizer_state` right now is just a `dict` that optimizers are free to mutate
as desired. A `dict` is not ideal as its _stringly_ typed but this was the least
invasive way to add this at the moment. It's actually an existing feature no
optimizer uses except _cost-cooling_ which basically just took a value from
`budget_info`.
Ideally an optimizer overwriting this can decide what to return instead of having
to rely on them mutating it, however this is the best work-around I could come up with
for now.
Args:
trials: All of the trials that are known about.
budget_info: information about the budget
optimizer_state: extra state the optimizer would like to keep between calls
Returns:
SampledConfig: a sampled configuration
dict: state the optimizer would like to keep between calls
"""
completed: dict[Trial.ID, ConfigResult] = {}
pending: dict[Trial.ID, SearchSpace] = {}
for trial_id, trial in trials.items():
if trial.report is not None:
completed[trial_id] = ConfigResult(
id=trial_id,
config=self.pipeline_space.from_dict(trial.config),
result=trial.report,
# TODO: Better if we could just pass around this metadata
# object instead of converting to a dict each time.
metadata=asdict(trial.metadata),
)
elif trial.state in (
Trial.State.PENDING,
Trial.State.SUBMITTED,
Trial.State.EVALUATING,
):
pending[trial_id] = self.pipeline_space.from_dict(trial.config)

self.load_optimization_state(
previous_results=completed,
pending_evaluations=pending,
budget_info=budget_info,
optimizer_state=optimizer_state,
)
config, config_id, previous_config_id = self.get_config_and_ids()
return SampledConfig(
id=config_id, config=config, previous_config_id=previous_config_id
)

def update_state_post_evaluation(
self, state: dict[str, Any], report: Trial.Report
) -> dict[str, Any]:
# TODO: There's a slot in `OptimizerState` to store extra things
# required for the optimizer but is currently not used
# state["key"] = "value"
return state
...

def get_loss(self, result: ERROR | ResultDict | float | Report) -> float | ERROR:
"""Calls result.utils.get_loss() and passes the error handling through.
Please use self.get_loss() instead of get_loss() in all optimizer classes.
"""
# TODO(eddiebergman): This is a forward change for whenever we can have optimizers
# use `Trial` and `Report`, they already take care of this and save having to do this
# `_get_loss` at every call. We can also then just use `None` instead of the string `"error"`
# use `Trial` and `Report`, they already take care of this and save having to do
# this `_get_loss` at every call. We can also then just use `None` instead of
# the string `"error"`
if isinstance(result, Report):
return result.loss if result.loss is not None else "error"

@@ -165,8 +93,8 @@ def get_cost(self, result: ERROR | ResultDict | float | Report) -> float | ERROR
Please use self.get_cost() instead of get_cost() in all optimizer classes.
"""
# TODO(eddiebergman): This is a forward change for whenever we can have optimizers
# use `Trial` and `Report`, they already take care of this and save having to do this
# `_get_loss` at every call
# use `Trial` and `Report`, they already take care of this and save having to do
# this `_get_loss` at every call
if isinstance(result, Report):
return result.loss if result.loss is not None else "error"

@@ -183,8 +111,8 @@ def get_learning_curve(
Please use self.get_loss() instead of get_loss() in all optimizer classes.
"""
# TODO(eddiebergman): This is a forward change for whenever we can have optimizers
# use `Trial` and `Report`, they already take care of this and save having to do this
# `_get_loss` at every call
# use `Trial` and `Report`, they already take care of this and save having to do
# this `_get_loss` at every call
if isinstance(result, Report):
return result.learning_curve

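
With the two-stage bridge removed from `BaseOptimizer`, subclasses now implement `ask()` directly. Below is a minimal, hypothetical sketch of such a subclass under the signatures shown above; `SketchRandomOptimizer` is not part of this commit, and the `SearchSpace.sample()` / `hp_values()` calls are assumptions about the surrounding API rather than guaranteed interfaces:

```python
from __future__ import annotations

from collections.abc import Mapping
from typing import Any

from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig
from neps.state.optimizer import BudgetInfo
from neps.state.trial import Trial


class SketchRandomOptimizer(BaseOptimizer):
    """Hypothetical example: draws an independent random config on every ask()."""

    def ask(
        self,
        trials: Mapping[str, Trial],
        budget_info: BudgetInfo | None,
        optimizer_state: dict[str, Any],
    ) -> SampledConfig:
        # Use one past the number of known trials as the next config id.
        config_id = str(len(trials) + 1)
        # Assumed API: sample a configuration from the search space.
        config = self.pipeline_space.sample(
            patience=self.patience, ignore_fidelity=False
        )
        return SampledConfig(
            id=config_id,
            config=config.hp_values(),
            previous_config_id=None,
        )
```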
neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py
@@ -1,6 +1,9 @@
from collections.abc import Callable
from functools import partial

from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import (
BaseAcquisition,
)
from neps.optimizers.bayesian_optimization.acquisition_functions.ei import (
ComprehensiveExpectedImprovement,
)
@@ -23,8 +26,9 @@
augmented_ei=False,
log_ei=True,
),
## Uses the augmented EI heuristic and changed the in-fill criterion to the best test location with
## the highest *posterior mean*, which are preferred when the optimisation is noisy.
## Uses the augmented EI heuristic and changed the in-fill criterion to the best test
## location with the highest *posterior mean*, which are preferred when the
## optimisation is noisy.
"AEI": partial(
ComprehensiveExpectedImprovement,
in_fill="posterior",
Expand All @@ -41,4 +45,5 @@
"ComprehensiveExpectedImprovement",
"UpperConfidenceBound",
"DecayingPriorWeightedAcquisition",
"BaseAcquisition",
]