Acquisition values always ZERO #1846

Arcadianlee · 2023-05-30T05:32:13Z

Arcadianlee
May 30, 2023

Hi,
I've been using botorch to optimize an 8-input and 5-output objective function, essentially following the code in this tutorial: https://botorch.org/tutorials/constrained_multi_objective_bo. The parallel Noisy Expected Hypervolume Improvement (𝑞NEHVI) is chosen as the acquisition function as in the tutorial. The only difference is that my optimization problem doesn't have a constraint.

Now my issue is that the acquisition function values are constantly zero, not matter how I adjust my model and hyper-parameters. Here is my code:

def generate_initial_data():
      # generate training data
      #input variable: state
    train_X = torch.zeros(5, 8, device=device, dtype=torch.float64)  #fixed zero initialization
      
    train_Y = torch.zeros(5,5, device=device, dtype=torch.float64)
    for i in range(0, 5):
        obj, _ = env.step(train_X[i, :].tolist())  #objective function
        # output variable: score
        train_Y[i, :] = torch.tensor(obj, device=device)

    return train_X, train_Y

def initialize_model(train_X, train_Y):
    # define models for objective and constraint

    d = 8  #No. of input parameters
    m = 5  #No. of objectives
    theta = 3   #lengthscale
    #define the matern kernel with custom lengthscale. Default lengthscale = 1.
    covariance = ScaleKernel(
        base_kernel=MaternKernel(
        nu=2.5,
        ard_num_dims=d,
        lengthscale_prior=GammaPrior(6.0, 9.0),
        )
    )

    #use model list for improved performance
    models = []
    for i in range(m):
        models.append(
            SingleTaskGP(train_X, train_Y[..., i : i + 1], covar_module= covariance,
                      outcome_transform=Standardize(m=1), input_transform=Normalize(d=d)
            )
        )
    model = ModelListGP(*models)
    mll = SumMarginalLogLikelihood(model.likelihood, model)
    
    return mll, model

def optimize_acqf_and_get_observation(model, train_x):
    """Optimizes the acquisition function, and returns a new candidate and a noisy observation."""

    #bounds for the x input
    high = [1000.0, 200., 200., 200., 0.15, 0.15, 0.3, 100.]
    low = [-1000.0, -200., -200., -200., -0.15, -0.15, -0.3, -100.]
    bounds = torch.tensor([low, high], dtype=torch.float64)

    train_x = normalize(train_x, bounds)

    #acquisition function (qNEHVI for multi-objective)
    acq_func = qNEHVI(
            model=model,
            ref_point= torch.zeros(5),   #lower bound for objectives
            X_baseline=train_x,
            prune_baseline=True,
            alpha=1.0,
    )

    print('\nacq func')
    print(acq_func(train_x))

    #optimize
    candidate, acq_value = optimize_acqf(
        acq_function = acq_func, bounds=bounds, q=1, num_restarts=5, raw_samples=20,
    )

    # observe new values
    new_x = candidate.detach()
    #new_x = torch.round(new_x)
    obj, score = env.step(new_x.tolist()[0])  #objective function 
    new_y = torch.tensor(obj, device=device).unsqueeze(0)
    return new_x, new_y, acq_value, score


N_TRIALS = 60
N_BATCH = 300

verbose = True

#save overall best solutions
best_observed_all_ei = []

#save data
input = torch.tensor([])
objective = torch.tensor([])

steps_done = 0

#average over multiple trials
for trial in range(1, N_TRIALS + 1):
    utc_dt = datetime.now(timezone.utc)

    print(f"\nTrial {trial:>2} of {N_TRIALS} \n", end="")

    print("\nLocal time {}".format(utc_dt.astimezone().isoformat()))
    
    best_observed_ei = []

    #define the hypervolume
    hv = Hypervolume(ref_point=torch.zeros(5))
    hvs_qnehvi = []

    # call helper functions to generate initial training data and initialize model
    (
        train_x_ei,
        train_obj_ei,
    ) = generate_initial_data()

    #print('\nx_initial: {}, f_initial: {:.5f}\n'.format(train_x_ei.tolist()[0], train_obj_ei.tolist()[0]))
    print('f_initial:')
    print(train_obj_ei.tolist())
    print('\nx_initial:')
    print(train_x_ei.tolist())

    mll_ei, model_ei = initialize_model(train_x_ei, train_obj_ei)

    best_observed_ei.append(train_obj_ei)

    # compute pareto front
    feas_train_obj = train_obj_ei
    if feas_train_obj.shape[0] > 0:
        pareto_mask = is_non_dominated(feas_train_obj)
        pareto_y = feas_train_obj[pareto_mask]
        # compute hypervolume
        volume = hv.compute(pareto_y)
    else:
        volume = 0.0

    hvs_qnehvi.append(volume)

    # run N_BATCH rounds of BayesOpt after the initial random batch
    for iteration in range(1, N_BATCH + 1):
        print('\nStarting step No.{}'.format(iteration))

        steps_done += 1

        # fit the models
        fit_gpytorch_mll(mll_ei)

        # print('\n lengthscale')
        # print(model_ei.covar_module.base_kernel.lengthscale)

        # optimize and get new observation
        new_x_ei, new_obj_ei, acq, score = optimize_acqf_and_get_observation(model_ei, train_x_ei)

        print('f_obj, f_score:')
        print(new_obj_ei.tolist(), score)
        print('\nx_candidate:')
        print(new_x_ei.tolist())
        print('\nacquisition:')
        print(acq.tolist())

        posterior = model_ei.posterior(new_x_ei)
        mean = posterior.mean  #posterior mean
        # Get upper and lower confidence bounds (2 std dev from the mean)
        lower, upper = posterior.mvn.confidence_region()

        print('\nPosterior mean and uncertainty:')
        print(mean.tolist(), upper.tolist())

        #writer.add_scalar('BO/scores', new_obj_ei.item(), steps_done)
        writer.add_scalar('BO/acq value', acq.item(), steps_done)
        writer.add_scalar('BO/f score', score, steps_done)

        # update training points
        train_x_ei = torch.cat([train_x_ei, new_x_ei])
        train_obj_ei = torch.cat([train_obj_ei, new_obj_ei])

        # update progress
        for hvs_list, train_obj in zip(
            (hvs_qnehvi,),
            (train_obj_ei,),
        ):
            # compute pareto front
            feas_train_obj = train_obj.unsqueeze(0)
            if feas_train_obj.shape[0] > 0:
                pareto_mask = is_non_dominated(feas_train_obj)
                pareto_y = feas_train_obj[pareto_mask]
                # compute feasible hypervolume
                volume = hv.compute(pareto_y)
            else:
                volume = 0.0
            hvs_list.append(volume)

        print('\nhypervolume:')
        print(volume)
        writer.add_scalar('BO/hypervolume', volume, steps_done)

        # reinitialize the models so they are ready for fitting on next iteration
        # use the current state dict to speed up fitting
        mll_ei, model_ei = initialize_model(
            train_x_ei,
            train_obj_ei,
        )


        if iteration % 50 == 0:
            #save data
            input = torch.cat([input, train_x_ei])
            objective = torch.cat([objective, train_obj_ei])
            torch.save(input, 'input.pt')
            torch.save(objective, 'objective.pt')

        if verbose:
            print(
                f"Batch {iteration:>2}: Hypervolume (qNEHVI) = "
                f"({hvs_qnehvi[-1]:>4.5f}), ",
                end="\n",
            )
        else:
            print(".", end="\n")

Here is a sample output from running the above code:
BadInitialCandidatesWarning: Unable to find non-zero acquisition function values - initial conditions are being selected randomly.

Starting step No.1

acq func
tensor([0.], dtype=torch.float64, grad_fn=)

Q factor: 14740.106, resonance lambda: 1406.38, area: 8.5639e-14, power: 0.0000, divergence: 1.1614

f_obj, f_score:
[[0.0073700529412966675, 0.9264244075447232, 0.23788713235941283, 5.672858516270196e-09, 0.8385756450397841]] 41.49402116450884

x_candidate:
[[-79.38002981245518, 32.086195796728134, -148.4838206321001, -35.77237389981747, 0.10972788669168951, 0.05081366663798689, -0.05658666919916869, -52.36530061811209]]

acquisition:
0.0

Posterior mean and uncertainty:
[[0.05059579170492109, 0.9492523232296569, 0.2618673535516844, 0.3117851135315177, 0.838471422626162]] [[0.050653100885836064, 0.9492723232296569, 0.2618873535516844, 0.3119949052921522, 0.838491422626162]]

hypervolume:
0.017218729481101036
Batch 1: Hypervolume (qNEHVI) = (0.01722),

Starting step No.2
BadInitialCandidatesWarning: Unable to find non-zero acquisition function values - initial conditions are being selected randomly.

acq func
tensor([0.], dtype=torch.float64, grad_fn=)

Q factor: 133590.653, resonance lambda: 1362.14, area: 8.4514e-14, power: 0.0003, divergence: 1.1615

f_obj, f_score:
[[0.06679532659907061, 0.9601957026595896, 0.2347601551320646, 0.00095828431855427, 0.8384761202505044]] 41.36835737933064

x_candidate:
[[-4.69064898788929, -147.21730574965477, -81.92704394459724, -79.19720783829689, -0.07687873654067516, 0.044364464934915304, 0.10664636492729185, -97.85456620156765]]

acquisition:
0.0

Posterior mean and uncertainty:
[[0.04339396979224573, 0.94544767050204, 0.2578706497836072, 0.25982092815776137, 0.8384887930284166]] [[0.04341396979224573, 0.9454676705020401, 0.2578906497836072, 0.2598667985236072, 0.8385087930284166]]

hypervolume:
0.01721883937716484
Batch 2: Hypervolume (qNEHVI) = (0.01722),

As you can see above, the acquisition values are zero, and I get the BadInitialCandidatesWarning warning each time.
On a side note, the hypervolumes and posterior means seem correct.
Any idea what could have gone wrong? Help is really appreciated !!

Answered by Balandat

May 30, 2023

Hmm so my first guess here is that this is b/c of the choice of alpha=1.0 - this is a parameter controlling the coarseness of the hypervolume computation, and typically chosen rather small since it expresses a fraction. From https://github.com/pytorch/botorch/blob/main/botorch/utils/multi_objective/box_decompositions/non_dominated.py#L48-L51 setting it to 1.0 suggests the HV will always be trivially zero. Try setting it to a small value, such as 0.05 or 0.1.

But note that with 5 outcomes the complexity of the HV computation can be quite high, so this could take a while. At that point if you really do care about the tradeoffs between all outcomes you could consider some other scalarization…

View full answer

Balandat · 2023-05-30T14:21:14Z

Balandat
May 30, 2023
Collaborator

Hmm so my first guess here is that this is b/c of the choice of alpha=1.0 - this is a parameter controlling the coarseness of the hypervolume computation, and typically chosen rather small since it expresses a fraction. From https://github.com/pytorch/botorch/blob/main/botorch/utils/multi_objective/box_decompositions/non_dominated.py#L48-L51 setting it to 1.0 suggests the HV will always be trivially zero. Try setting it to a small value, such as 0.05 or 0.1.

But note that with 5 outcomes the complexity of the HV computation can be quite high, so this could take a while. At that point if you really do care about the tradeoffs between all outcomes you could consider some other scalarization-based methods. But I'm curious how you're expecting to process / use the Pareto frontier in 5 dimensions, and whether this is indeed a full multi-objective problem or whether a MOO formulation with fewer objectives but some constraints would make sense.

Finally, we have some work that improves the numerical behavior of the improvement-based acquisition functions, we hope to open source that soon. This seems to be a problem that could also benefit quite a bit from this.

cc @sdaulton, @SebastianAment

I don't see anything obviously wrong with this, assuming that the reference point at zero here is reasonably conservative / that all outcomes are non-negative as the comment suggests.

1 reply

Arcadianlee May 31, 2023
Author

Hi there,
Appreciate your timely reply. Here is the output after I changed alpha from 1 to 0:

Starting step No.10

acq func
C:\Users\haihuang\Miniconda3\envs\fdtd\lib\site-packages\linear_operator\utils\cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal
warnings.warn(
tensor([7.4032685780e-06], dtype=torch.float64, grad_fn=)
starting new FDTD session...

Q factor: -21585.623, resonance lambda: 1327.53, area: 8.7055e-14, power: 0.0014, divergence: 1.1615

f_obj, f_score:
[[-0.010792811724270823, 0.986620588725405, 0.24182011735822706, 0.004585623923119342, 0.8385247021414537]] 42.386895947964845

x_candidate:
[[931.7723251879215, 117.38879606127739, 24.297846481204033, 14.8428525775671, -0.07457528114318847, -0.1238873055204749, 0.29943717867136005, -48.539046943187714]]

acquisition:
2.1387586746517376e-07
C:\Users\haihuang\Miniconda3\envs\fdtd\lib\site-packages\gpytorch\distributions\multivariate_normal.py:319: NumericalWarning: Negative variance values detected. This is likely
due to numerical instabilities. Rounding negative variances up to 1e-10.
warnings.warn(

Posterior mean and uncertainty:
[[0.1135903302831784, 0.9532514332899666, 0.2368104948095983, 0.16570317373789678, 0.838484256372174]] [[0.11383293991903519, 0.9532714332899667, 0.23684833479370568, 0.16585578187913383, 0.8385042563721741]]

hypervolume:
0.019333185628056526
Batch 10: Hypervolume (qNEHVI) = (0.01933),

Starting step No.11

acq func
C:\Users\haihuang\Miniconda3\envs\fdtd\lib\site-packages\linear_operator\utils\cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal
warnings.warn(
tensor([6.3484794429e-06], dtype=torch.float64, grad_fn=)
starting new FDTD session...

Q factor: 1655471.972, resonance lambda: 1348.44, area: 8.9185e-14, power: 0.0098, divergence: 1.1615

f_obj, f_score:
[[0.8277359858800497, 0.9706574848168468, 0.24773579372723298, 0.032501778725411934, 0.8385261951941789]] 46.59267461984497

x_candidate:
[[-753.9003789424896, 137.26902455091476, 24.798031523823738, 155.31849153339863, -0.12768304729834198, -0.05982640692964196, 0.26400913037359713, 31.833583675324917]]

acquisition:
2.1065067677286072e-07
C:\Users\haihuang\Miniconda3\envs\fdtd\lib\site-packages\gpytorch\distributions\multivariate_normal.py:319: NumericalWarning: Negative variance values detected. This is likely
due to numerical instabilities. Rounding negative variances up to 1e-10.
warnings.warn(

Posterior mean and uncertainty:
[[0.1053008990645664, 0.9554760433997954, 0.23714447045978354, 0.15496200862309312, 0.8384869527564519]] [[0.10556030090144049, 0.9554960433997954, 0.23718471349038067, 0.1551281692321272, 0.8385069527564519]]

hypervolume:
0.022425130009651184
Batch 11: Hypervolume (qNEHVI) = (0.02243),

As you can see above, acquisition values are indeed no longer zero, albeit rather small (e.g., on the order of ~ 2.0e-07). Are acquisition values that small something we would normally expect?

Besides, I noticed that my posterior mean and 2 standard deviation confidence are almost identical, as seen in the output. I don't think this is a coincidence, and if not, what could have been wrong here?

In addition, in the output you can see two types of numerical warnings: a "not p.d., added jitter" warning and a "Negative variance values" warning. Any idea how to prevent those from occurring?

Also,

But I'm curious how you're expecting to process / use the Pareto frontier in 5 dimensions, and whether this is indeed a full multi-objective problem or whether a MOO formulation with fewer objectives but some constraints would make sense.

In the quote above, are you suggesting that 5 objectives is too much for Botorch to handle? I am asking cuz I've read in the literature that MOBO can generally handle upto 15-ish objectives.

Also, I don't believe my problem has a constraint, other than a simple box bound imposed on the input parameters that has been explicitly written out in the optimize_acqf_and_get_observation(model, train_x) method in my code :

high = [1000.0, 200., 200., 200., 0.15, 0.15, 0.3, 100.]
low = [-1000.0, -200., -200., -200., -0.15, -0.15, -0.3, -100.]
bounds = torch.tensor([low, high], dtype=torch.float64)

Thanks again.

Balandat · 2023-05-31T06:19:35Z

Balandat
May 31, 2023
Collaborator

As you can see above, acquisition values are indeed no longer zero, albeit rather small (e.g., on the order of ~ 2.0e-07). Are acquisition values that small something we would normally expect?

Yes it's possible - you're integrating an area in 5 dimensions w.r.t. to a MVN with light tails, this can produce rather small numbers. The versions that @SebastianAment has implemented should be helpful here.

In addition, in the output you can see two types of numerical warnings: a "not p.d., added jitter" warning and a "Negative variance values" warning. Any idea how to prevent those from occurring?

Hard to say how to avoid these in general. Some of the covariance matrices that appear during the course of BO, especially in later iterations, can be quite ill conditioned (and potentially numerically not p.d.). These ill conditioned covariances are likely also the cause of the negative variances.

In the quote above, are you suggesting that 5 objectives is too much for Botorch to handle? I am asking cuz I've read in the literature that MOBO can generally handle upto 15-ish objectives.

What literature suggests that? With that many outputs hypervolume approaches are typically not easily feasible, and pretty significant approximations need to be made or scalarization-based methods need to be used. Do you maybe mean that MOBO can handle 15 input (rather than outcome) dimensions? That has been a ballpark number for Bayesian Optimization for a while (though it's not true anymore with more modern methods that scale better).

Also, I don't believe my problem has a constraint, other than a simple box bound imposed on the input parameters that has been explicitly written out in the optimize_acqf_and_get_observation(model, train_x) method in my code :

What I was suggesting is that considering some of the outcomes as things you may constrain on, rather than adding them to the list of objectives to explore the Pareto Frontier for. This will increase scalability of the hypervolume computations.

17 replies

Balandat Jun 14, 2023
Collaborator

So you're optimizing (as one of 3 objectives) the fourth element of obj, but you're also imposing a constraint on it (to be less than 1.98). Instead of doing that, you should encode the fact that you want it to be less than 1.98 in the respective dimension of the reference point.

I thought the ref_point argument only includes objectives and not constraints

That is right, but it looks like you're also constraining one of your objectives. And that should be done via the reference point. Otherwise you'd be building two GP models for the same outcome, and the algorithm probably also wouldn't explore the pareto frontier near 1.98 as well since with a constraint it'd be discouraged to explore there.

Arcadianlee Jun 15, 2023
Author

According to the tutorial at https://botorch.org/tutorials/constrained_multi_objective_bo, the reference point is defined as:

"qNEHVI requires specifying a reference point, which is the lower bound on the objectives used for computing hypervolume. In this tutorial, we assume the reference point is known. In practice the reference point can be set 1) using domain knowledge to be slightly worse than the lower bound of objective values, where the lower bound is the minimum acceptable value of interest for each objective, or 2) using a dynamic reference point selection strategy."

Since my constraint is to be less than 1.98, which is sort of an upper bound, how should I transform the problem so it becomes a lower bound to be used in the reference point?

Balandat Jun 16, 2023
Collaborator

Interesting - what's the reason you're providing this as an upper bound? This seems odd if you're maximizing this objective. Is this encoding some domain knowledge that values >= 1.98 are bad in some way (e.g. b/c they negatively affect some other outcome that you're not tracking explicitly)?

Arcadianlee Jun 16, 2023
Author

Yeah this a domain knowledge where values >= 1.98 are physically impossible and thus should always be avoided. I am basically maximizing this objective such that it's as large as possible while being less than 1.98.

Balandat Jun 19, 2023
Collaborator

Hmm I see. This is a somewhat atypical situation so it's not supported super well out of the box, but you should be able to apply the outcome constraint as you initially suggested. Let me know how that goes.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Acquisition values always ZERO #1846

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 2 comments 18 replies

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

Select a reply

Acquisition values always ZERO #1846

Arcadianlee May 30, 2023

Replies: 2 comments · 18 replies

Balandat May 30, 2023 Collaborator

Arcadianlee May 31, 2023 Author

Balandat May 31, 2023 Collaborator

Balandat Jun 14, 2023 Collaborator

Arcadianlee Jun 15, 2023 Author

Balandat Jun 16, 2023 Collaborator

Arcadianlee Jun 16, 2023 Author

Balandat Jun 19, 2023 Collaborator

Arcadianlee
May 30, 2023

Replies: 2 comments 18 replies

Balandat
May 30, 2023
Collaborator

Arcadianlee May 31, 2023
Author

Balandat
May 31, 2023
Collaborator

Balandat Jun 14, 2023
Collaborator

Arcadianlee Jun 15, 2023
Author

Balandat Jun 16, 2023
Collaborator

Arcadianlee Jun 16, 2023
Author

Balandat Jun 19, 2023
Collaborator