diff --git a/flaml/autogen/math/README.md b/flaml/autogen/math/README.md
new file mode 100644
index 0000000000..76ee5a5383
--- /dev/null
+++ b/flaml/autogen/math/README.md
@@ -0,0 +1,81 @@
+# MathChat: A conversational framework for math problem solving with GPT-4
+
+## Introduction
+
+Employing Large Language Models (LLMs) to address mathematical problems is an intriguing research endeavor, as LLMs have demonstrated remarkable proficiency across tasks spanning diverse domains. We propose *MathChat*, a framework that simulates a mock conversation between an LLM assistant (GPT-4 in our case) and a user proxy agent. Here a user proxy agent is an agent playing the user's role in conversations with the LLM assistant. In *MathChat*, the assistant and the user proxy agent work together to solve the math problem (see the figure below). The user proxy agent takes a math problem as input and initiates a conversation with the LLM assistant using an initial prompt. With proper modifications, effective prompting methods from existing research, such as CoT and tool use, can be integrated into the *MathChat* framework.
+
+More details are provided in our paper [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337).
+
+![Example Image](./mathchat.png)
+
+## Environment Setup
+
+1. You can set up the environment using the following commands:
+
+```
+cd flaml/autogen/math
+conda env create -f environment.yml
+conda activate mathchat
+```
+
+2. Create a `key.txt` file in `flaml/autogen/math` and put your OpenAI key in it. The key should allow GPT-4 usage.
+
+```
+echo "your_openai_key" > key.txt
+```
+
+3. If you want to try out the Wolfram prompt, you need to register a Wolfram Alpha app ID and put it in `wolfram.txt`, which will be read in `main.py`.
+
+```
+echo "your_wolfram_key" > wolfram.txt
+```
+
+## Run MathChat
+
+- Use `--categories` to select the categories to run and `--samples_per_category` to set the number of samples per category. The problems are randomly selected from level-5 difficulty problems. Here are the category names and IDs:
+
+| ID | Category Name |
+|----|--------------------------|
+| 0 | Algebra |
+| 1 | Counting & Probability |
+| 2 | Geometry |
+| 3 | Intermediate Algebra |
+| 4 | Number Theory |
+| 5 | Prealgebra |
+| 6 | Precalculus |
+
+
+- Test on 1 level-5 problem from Algebra (`--categories 0`):
+
+```
+python main.py -ptype default --folder ./default --categories 0 --samples_per_category 1
+```
+You can find the output in the folder `./default/`.
+
+- Test on 1 level-5 problem from each category (except Geometry):
+```
+python main.py -ptype default --folder ./default --categories 0 1 3 4 5 6 --samples_per_category 1
+```
+
+Note: `default` is the default prompt for *MathChat*; other choices are `python` and `two_tools`.
+
+- Test on all problems from each category (except Geometry):
+
+```
+python main.py -ptype default --folder ./default --categories 0 1 3 4 5 6 --samples_per_category 400
+```
+
+Note that no category has more than 400 problems, so setting `--samples_per_category 400` will include all of them.
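+
+- You can also drive *MathChat* programmatically instead of through `main.py`. The snippet below is a minimal sketch rather than an official entry point; it mirrors the setup performed in `main.py`/`pseudo_main.py` and assumes the environment and `key.txt` above are in place (run it from inside `flaml/autogen/math` so that the local `utils` module resolves):
+
+```python
+import openai
+from flaml import oai
+from flaml.autogen.math.math_chat import MathChat
+from utils import load_level5_math_test_each_category  # local helper module in this folder
+
+# Same setup that main.py performs before handing off to pseudo_main.
+openai.api_key = open("key.txt").read().strip()
+oai.ChatCompletion.request_timeout = 60 * 10  # 10 minutes, as in pseudo_main.py
+oai.ChatCompletion.set_cache(seed=41, cache_path_root=".cache")
+
+# Load 1 level-5 problem from Algebra (category 0).
+problem_sets = load_level5_math_test_each_category(samples_per_category=1, category_to_load=[0])
+
+solver = MathChat(model="gpt-4", prompt_type="default", max_round=15)
+for problem_set in problem_sets:
+    for i, problem in enumerate(problem_set):
+        problem["problem_id"] = str(i)  # solve_one_category expects a string id per problem
+    solver.solve_one_category(problem_set, saving_folder="./default")
+```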
+ +## Citation + +If you find this work helpful, please cite: + +```bibtex +@inproceedings{wu2023empirical, + title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, + author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, + year={2023}, + booktitle={ArXiv preprint arXiv:2306.01337}, +} +``` diff --git a/flaml/autogen/extensions/__init__.py b/flaml/autogen/math/__init__.py similarity index 100% rename from flaml/autogen/extensions/__init__.py rename to flaml/autogen/math/__init__.py diff --git a/flaml/autogen/math/baseline_PS.py b/flaml/autogen/math/baseline_PS.py new file mode 100644 index 0000000000..11a27b1be0 --- /dev/null +++ b/flaml/autogen/math/baseline_PS.py @@ -0,0 +1,228 @@ +# adapted from https://github.com/wenhuchen/Program-of-Thoughts/blob/main/run_gsm8k_zs.py +import openai +from time import sleep +from tool import synthesize_program +from collections import Counter +from datetime import datetime +from tqdm import tqdm +import os +import json +import argparse +from flaml import oai +import datasets + +# Caution: distinguish between the two types imports +from flaml.autogen.math_utils import eval_math_responses, get_answer +from utils import ( + load_level5_math_test_each_category, + math_type_mapping, + write_json, + remove_asy_sections, + mylogger, + random_sample_MATH, + load_fixed, +) +from flaml.autogen.code_utils import execute_code +from flaml.autogen.math.user_proxy_agent import UserProxyAgent + + +parser = argparse.ArgumentParser() +# parser.add_argument("--key", default='OPENAI_KEY', type=str) +parser.add_argument("--dry_run", default=False, action="store_true") +parser.add_argument("--folder", "-f", dest="folder", help="saving folder", default="./pnas", type=str) +parser.add_argument("--cache_folder", "-c", dest="cache_folder", default=".cache/pnas", help="cache folder") +parser.add_argument("--samples_per_category", "-s", help="samples per category", default=20, type=int) +parser.add_argument("--temperature", "-t", dest="temperature", help="temperature", default=1, type=float) +parser.add_argument("--seed", dest="seed", help="seed", default=41, type=int) +parser.add_argument("--categories", dest="categories", help="categories", default=[0, 1], nargs="+") +parser.add_argument("--sample_all", help="samples per category", default=0, type=int) +parser.add_argument("--select", action="store_true") +args = parser.parse_args() +args.folder = args.folder + "_baseline_pnas" "_t" + str(args.temperature) + "_seed" + str(args.seed) +if args.sample_all != 0: + args.folder += "_random_sample" +# key = os.getenv(args.key) +# print(key) + + +def pnas_solve(model, problem, max_tokens=None): + problem = remove_asy_sections(problem["problem"]) + docstring_front = '''"""\n''' + docstring_back = '''\n"""\n''' + context_array = ["write a python program", "using sympy", "using simulations"] + prompt_prefix = "that answers the following question:" + codex_input = docstring_front + context_array[0] + " " + prompt_prefix + " " + problem + docstring_back + # print(codex_input) + config = { + "model": model, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": codex_input}, + ], + "n": 1, + } + if max_tokens is not None: + config["max_tokens"] = max_tokens + + if config_list is None: + raw_responses = oai.ChatCompletion.create(None, **config, use_cache=True) + else: + raw_responses = 
oai.ChatCompletion.create(config_list=config_list, **config) + responses = oai.ChatCompletion.extract_text(raw_responses) + # print(responses[0]) + + proxyagent = UserProxyAgent() + query_response, is_query_sucess = proxyagent.handle_query(responses[0]) + + config["messages"].append({"role": "assistant", "content": responses[0]}) + if is_query_sucess: + config["messages"].append( + {"role": "user", "content": "Return: " + query_response + "\nPlease put the final answer in \\boxed{}."} + ) + if config_list is None: + raw_responses = oai.ChatCompletion.create(None, **config, use_cache=True) + else: + raw_responses = oai.ChatCompletion.create(config_list=config_list, **config) + answer_response = oai.ChatCompletion.extract_text(raw_responses) + response_with_ans = answer_response[0] + if get_answer(answer_response[0]) is None: + response_with_ans = "\\boxed{N/A}" + else: + response_with_ans = "\\boxed{Error}" + + try: + cost = oai.ChatCompletion.cost(raw_responses) + except TypeError: + cost = oai.ChatCompletion.cost(model, raw_responses) + return { + "cost": cost, + "response_with_ans": response_with_ans, + "program": responses[0], + } + + +if __name__ == "__main__": + config_list = None + try: + openai.api_key = open("key.txt").read().strip() + print(openai.api_key) + except Exception: + from azure.identity import DefaultAzureCredential + + SCOPE = "https://ml.azure.com" + credential = DefaultAzureCredential() + token = credential.get_token(SCOPE).token + headers = { + "azureml-model-deployment": "gpt4", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + **json.load(open("headers.json")), + } + config_list = [ + { + "api_key": open("key.txt").read().strip(), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + }, + { + "api_key": open("key_flaml.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_flaml.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + { + "api_key": open("key_aoai.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_aoai.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + # { + # "api_key": open("key_gcr.txt").read().strip(), + # "api_type": "azure", + # "api_base": open("base_gcr.txt").read().strip(), + # "api_version": "2023-03-15-preview", + # }, + # { + # "api_key": "nokey", + # "headers": headers, + # "api_base": open("base_azure.txt").read().strip(), + # }, + ] + oai.ChatCompletion.request_timeout = 60 * 10 # 10 minutes + oai.ChatCompletion.set_cache(seed=args.seed, cache_path_root=args.cache_folder) + os.makedirs(args.folder, exist_ok=True) + logger = mylogger(os.path.join(args.folder, "log.txt")) + + engine = "gpt-4" + aggre_correct = 0 + problem_sets = load_level5_math_test_each_category( + samples_per_category=args.samples_per_category, category_to_load=args.categories + ) + if args.sample_all != 0: + problem_sets = random_sample_MATH(args.sample_all) + + if args.select: + problem_sets = load_fixed() + + logger.log("problem id: is_correct $ ans $ correct_ans $ accum_acc", verbose=True) + + for problem_set in problem_sets: # one problem_set is one category + if len(problem_set) == 0: + continue + for i in range(len(problem_set)): + problem_set[i]["problem_id"] = str(i) # assign problem id + + logger.log("Solving " + problem_set[0]["type"], verbose=True) + saving_folder = os.path.join(args.folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + done_problems = set([int(f.split(".")[0]) for f in 
os.listdir(saving_folder) if "json" in f]) + + correct_counts = 0 + for count, problem in enumerate(problem_set): + problem_path = os.path.join(saving_folder, problem["problem_id"] + ".json") + + # 1. if problem already solved, continue + if int(problem["problem_id"]) in done_problems: + problem = json.load(open(problem_path, "r")) + aggre_correct += problem["is_correct"] + correct_counts += problem["is_correct"] + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']} $ {round(correct_counts / (count + 1), 4)} (loaded from previous run)", + verbose=True, + ) + continue + + results = pnas_solve(engine, problem) + metrics = eval_math_responses([results["response_with_ans"]], problem["solution"]) + aggre_correct += metrics["success_vote"] + correct_counts += metrics["success_vote"] + + problem.update( + { + "cost": results["cost"], + "is_correct": bool(metrics["success_vote"]), + "correct_ans": get_answer(problem["solution"]), + "voted_answer": get_answer(metrics["voted_answer"]), + "response_with_ans": results["response_with_ans"], + "program": results["program"], + } + ) + write_json(problem, problem_path) + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']}", + verbose=True, + ) + if args.dry_run: + break + logger.log( + f"{problem_set[0]['type']} acc: {correct_counts}/{len(problem_set)}= {round(correct_counts / len(problem_set), 4)}", + ) + logger.log("-----------------------------------") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + total_num_problem = sum([len(problem_set) for problem_set in problem_sets]) + logger.log( + f"Total accuracy: {aggre_correct}/{total_num_problem}={round(aggre_correct / total_num_problem, 4)}", + ) + logger.log("****************************\n\n\n\n") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) diff --git a/flaml/autogen/math/baseline_PoT.py b/flaml/autogen/math/baseline_PoT.py new file mode 100644 index 0000000000..a4ea37bcb0 --- /dev/null +++ b/flaml/autogen/math/baseline_PoT.py @@ -0,0 +1,227 @@ +# adapted from https://github.com/wenhuchen/Program-of-Thoughts/blob/main/run_gsm8k_zs.py +import openai +from time import sleep +from tool import synthesize_program +from collections import Counter +from datetime import datetime +from tqdm import tqdm +import os +import json +import argparse +from flaml import oai +import datasets + +# Caution: distinguish between the two types imports +from flaml.autogen.math_utils import eval_math_responses, get_answer +from utils import ( + load_level5_math_test_each_category, + math_type_mapping, + write_json, + remove_asy_sections, + mylogger, + random_sample_MATH, +) +from flaml.autogen.code_utils import execute_code + + +parser = argparse.ArgumentParser() +# parser.add_argument("--key", default='OPENAI_KEY', type=str) +parser.add_argument("--dry_run", default=False, action="store_true") +parser.add_argument("--folder", "-f", dest="folder", help="saving folder", default="./PoT", type=str) +parser.add_argument("--cache_folder", "-c", dest="cache_folder", default=".cache/PoT", help="cache folder") +parser.add_argument("--samples_per_category", "-s", help="samples per category", default=20, type=int) +parser.add_argument("--temperature", "-t", dest="temperature", help="temperature", default=1, type=float) +parser.add_argument("--seed", dest="seed", help="seed", default=41, type=int) +parser.add_argument("--categories", dest="categories", help="categories", default=[0, 1], 
nargs="+") +parser.add_argument("--sample_all", help="samples per category", default=0, type=int) +args = parser.parse_args() +args.folder = args.folder + "_baseline_PoT" "_t" + str(args.temperature) + "_seed" + str(args.seed) +if args.sample_all != 0: + args.folder += "_random_sample" +# key = os.getenv(args.key) +# print(key) + + +def PoT_solve(model, problem, max_tokens=None): + commented_problem = problem["problem"].replace("\n", "\n# ") # in case the problem is multiline + commented_problem = remove_asy_sections(commented_problem) + full_prompt = f""" +import math +import numpy as np +import sympy as sp # added + +# Question: {commented_problem} +# Answer this question by implementing a solver() function. +def solver(): + # Let's write a Python program step by step, and then return the answer + # Firstly, we need define the following variable: +""" + with open(os.path.join(args.folder, "prompt.txt"), "w") as f: + f.write(full_prompt) + if args.dry_run: + print(full_prompt) + print("=======================") + return + + config = { + "model": model, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": full_prompt}, + ], + "n": 1, + } + if max_tokens is not None: + config["max_tokens"] = max_tokens + + raw_responses = oai.ChatCompletion.create(config_list=config_list, **config) + responses = oai.ChatCompletion.extract_text(raw_responses) + + # TODO: adapt for voting + program = synthesize_program(responses[0], full_prompt) + return_code, ans = execute_code(program, timeout=5, use_docker=False) + if isinstance(ans, bytes): + try: + ans = ans.decode("ascii") + except Exception: + try: + ans = ans.decode("utf-8") + except Exception: + ans = "The return cannot be decoded." + + ans = "Error" if return_code != 0 or ans is None else ans + response_with_ans = "\\boxed{" + str(ans) + "}" + + ( + oai.ChatCompletion.price1K[model][0] + if type(oai.ChatCompletion.price1K[model]) == tuple + else oai.ChatCompletion.price1K[model] + ) + return { + "usage": raw_responses["usage"], + "response_with_ans": response_with_ans, + "program": program, + } + + +if __name__ == "__main__": + config_list = None + try: + openai.api_key = open("key.txt").read().strip() + print(openai.api_key) + except Exception: + from azure.identity import DefaultAzureCredential + + SCOPE = "https://ml.azure.com" + credential = DefaultAzureCredential() + token = credential.get_token(SCOPE).token + headers = { + "azureml-model-deployment": "gpt4", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + **json.load(open("headers.json")), + } + config_list = [ + { + "api_key": open("key.txt").read().strip(), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + }, + { + "api_key": open("key_flaml.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_flaml.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + { + "api_key": open("key_aoai.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_aoai.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + # { + # "api_key": open("key_gcr.txt").read().strip(), + # "api_type": "azure", + # "api_base": open("base_gcr.txt").read().strip(), + # "api_version": "2023-03-15-preview", + # }, + # { + # "api_key": "nokey", + # "headers": headers, + # "api_base": open("base_azure.txt").read().strip(), + # }, + ] + oai.ChatCompletion.request_timeout = 60 * 10 # 10 minutes + oai.ChatCompletion.set_cache(seed=args.seed, 
cache_path_root=args.cache_folder) + + os.makedirs(args.folder, exist_ok=True) + logger = mylogger(os.path.join(args.folder, "log.txt")) + + engine = "gpt-4" + aggre_correct = 0 + problem_sets = load_level5_math_test_each_category( + samples_per_category=args.samples_per_category, category_to_load=args.categories + ) + if args.sample_all != 0: + problem_sets = random_sample_MATH(args.sample_all) + logger.log("problem id: is_correct $ ans $ correct_ans $ accum_acc", verbose=True) + + for problem_set in problem_sets: # one problem_set is one category + if len(problem_set) == 0: + continue + for i in range(len(problem_set)): + problem_set[i]["problem_id"] = str(i) # assign problem id + + logger.log("Solving " + problem_set[0]["type"], verbose=True) + saving_folder = os.path.join(args.folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + done_problems = set([int(f.split(".")[0]) for f in os.listdir(saving_folder) if "json" in f]) + + correct_counts = 0 + for count, problem in enumerate(problem_set): + problem_path = os.path.join(saving_folder, problem["problem_id"] + ".json") + + # 1. if problem already solved, continue + if int(problem["problem_id"]) in done_problems: + problem = json.load(open(problem_path, "r")) + aggre_correct += problem["is_correct"] + correct_counts += problem["is_correct"] + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']} $ {round(correct_counts / (count + 1), 4)} (loaded from previous run)", + verbose=True, + ) + continue + + results = PoT_solve(engine, problem) + metrics = eval_math_responses([results["response_with_ans"]], problem["solution"]) + aggre_correct += metrics["success_vote"] + correct_counts += metrics["success_vote"] + + problem.update( + { + "usage": results["usage"], + "is_correct": bool(metrics["success_vote"]), + "correct_ans": get_answer(problem["solution"]), + "voted_answer": get_answer(metrics["voted_answer"]), + "program": results["program"], + } + ) + write_json(problem, problem_path) + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']}", + verbose=True, + ) + if args.dry_run: + break + logger.log( + f"{problem_set[0]['type']} acc: {correct_counts}/{len(problem_set)}= {round(correct_counts / len(problem_set), 4)}", + ) + logger.log("-----------------------------------") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + total_num_problem = sum([len(problem_set) for problem_set in problem_sets]) + logger.log( + f"Total accuracy: {aggre_correct}/{total_num_problem}={round(aggre_correct / total_num_problem, 4)}", + ) + logger.log("****************************\n\n\n\n") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) diff --git a/flaml/autogen/math/baseline_fewshot.py b/flaml/autogen/math/baseline_fewshot.py new file mode 100644 index 0000000000..7550bd3eb9 --- /dev/null +++ b/flaml/autogen/math/baseline_fewshot.py @@ -0,0 +1,266 @@ +# adapted from https://github.com/wenhuchen/Program-of-Thoughts/blob/main/run_gsm8k_zs.py +import openai +from time import sleep +from tool import synthesize_program +from collections import Counter +from datetime import datetime +from tqdm import tqdm +import os +import json +import argparse +from flaml import oai +import datasets +import random + +# Caution: distinguish between the two types imports +from flaml.autogen.math_utils import eval_math_responses, get_answer +from utils import ( + load_level5_math_test_each_category, + 
math_type_mapping, + write_json, + remove_asy_sections, + mylogger, +) + + +parser = argparse.ArgumentParser() +# parser.add_argument("--key", default='OPENAI_KEY', type=str) +parser.add_argument("--dry_run", default=False, action="store_true") +parser.add_argument("--folder", "-f", dest="folder", help="saving folder", default="./fewshot", type=str) +parser.add_argument("--cache_folder", "-c", dest="cache_folder", default=".cache/fewshot", help="cache folder") +parser.add_argument("--samples_per_category", "-s", help="samples per category", default=20, type=int) +parser.add_argument("--categories", dest="categories", help="categories", default=[0, 1], nargs="+") +parser.add_argument("--temperature", "-t", dest="temperature", help="temperature", default=1, type=float) +parser.add_argument("--seed", dest="seed", help="seed", default=41, type=int) +parser.add_argument("--k", dest="k", help="k", default=3, type=int) +args = parser.parse_args() +args.folder = args.folder + "_baseline_fewshot_t" + str(args.temperature) + "_seed" + str(args.seed) + +# key = os.getenv(args.key) +# print(key) + + +def random_sample_level5_train_each_category(k=3, category_to_load=None): + """ + Load level 5 math problems from the train set of competition dataset. + Returns: + A list of list of problems. Each list of problems is of the same category. + """ + category_to_load = [i for i in range(7)] if not category_to_load or "all" in category_to_load else category_to_load + category_to_load = [int(x) for x in category_to_load] + seed = 41 + data = datasets.load_dataset("competition_math") + train_data = data["train"].shuffle(seed=seed) + sep_cate = [] + print("******Loading train data******") + for i, category in enumerate(math_type_mapping.keys()): + if i not in category_to_load: + print(i, category, "(skipped)", flush=True) + continue + tmp = [] + for x in range(len(train_data)): + if ( + train_data[x]["level"] == "Level 5" + and train_data[x]["type"] == category + and "asy" not in train_data[x]["problem"] + and "ASY" not in train_data[x]["problem"] + and "asy" not in train_data[x]["solution"] + and "ASY" not in train_data[x]["solution"] + ): + tmp.append(train_data[x]) + + sep_cate.append(tmp[:k]) + print(i, category, f"{len(sep_cate[-1])} problems loaded", flush=True) + print("******Loading train data done******") + + if len(sep_cate) == 0: + raise ValueError("No category is loaded.") + return sep_cate + + +def few_shot_template(examplars): + few_shot_prompt = "" + for examplar in examplars: + few_shot_prompt += "\n".join( + [ + "Problem: " + examplar["problem"], + "Solution: " + examplar["solution"], + "\n", + ] + ) + return few_shot_prompt + + +def fewshot_solve(model, problem, prompt, max_tokens=None): + # few shot examplars + # examplars = random_select(problem_set, problem, k = k) + # few_shot_prompt = few_shot_template(examplars) + + prompt += remove_asy_sections(problem["problem"]) + if args.dry_run: + print(prompt) + print("=======================") + return + + config = { + "model": model, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, + ], + "n": 1, + # 'temperature': args.temperature, + } + if max_tokens is not None: + config["max_tokens"] = max_tokens + + if config_list is not None: + raw_responses = oai.ChatCompletion.create( + config_list=config_list, + **config, + ) + else: + raw_responses = oai.ChatCompletion.create(None, **config) + responses = oai.ChatCompletion.extract_text(raw_responses) + + return { + "usage": 
raw_responses["usage"], + "response_with_ans": responses[0], + } + + +if __name__ == "__main__": + config_list = None + + # openai.api_key = open("key_e.txt").read().strip() + # print(openai.api_key) + try: + openai.api_key = open("key.txt").read().strip() + print(openai.api_key) + except Exception: + from azure.identity import DefaultAzureCredential + + SCOPE = "https://ml.azure.com" + credential = DefaultAzureCredential() + token = credential.get_token(SCOPE).token + headers = { + "azureml-model-deployment": "gpt4", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + **json.load(open("headers.json")), + } + + config_list = [ + { + "api_key": open("key.txt").read().strip(), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + }, + { + "api_key": open("key_flaml.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_flaml.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + { + "api_key": open("key_aoai.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_aoai.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + # { + # "api_key": open("key_gcr.txt").read().strip(), + # "api_type": "azure", + # "api_base": open("base_gcr.txt").read().strip(), + # "api_version": "2023-03-15-preview", + # }, + # { + # "api_key": "nokey", + # "headers": headers, + # "api_base": open("base_azure.txt").read().strip(), + # }, + ] + oai.ChatCompletion.request_timeout = 60 * 10 # 10 minutes + oai.ChatCompletion.set_cache(seed=args.seed, cache_path_root=args.cache_folder) + random.seed(args.seed) + os.makedirs(args.folder, exist_ok=True) + logger = mylogger(os.path.join(args.folder, "log.txt")) + + engine = "gpt-4" + aggre_correct = 0 + problem_sets = load_level5_math_test_each_category( + samples_per_category=args.samples_per_category, category_to_load=args.categories + ) + + examplar_data = random_sample_level5_train_each_category(k=args.k, category_to_load=args.categories) + logger.log("problem id: is_correct $ ans $ correct_ans $ accum_acc", verbose=True) + + for cate_id, problem_set in enumerate(problem_sets): # one problem_set is one category + for i in range(len(problem_set)): + problem_set[i]["problem_id"] = str(i) # assign problem id + + logger.log("Solving " + problem_set[0]["type"], verbose=True) + saving_folder = os.path.join(args.folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + done_problems = set([int(f.split(".")[0]) for f in os.listdir(saving_folder) if "json" in f]) + + assert examplar_data[cate_id][0]["type"] == problem_set[0]["type"], " examplar and test category mismatch" + category_prompt = ( + "Solve a math problem carefully. Put the final answer in \\boxed{}.\n\n" + + few_shot_template(examplar_data[cate_id]) + + """\n\nProblem: """ + ) + + with open(os.path.join(args.folder, f"prompt_{math_type_mapping[problem_set[0]['type']]}.txt"), "w") as f: + f.write(category_prompt) + + correct_counts = 0 + for count, problem in enumerate(problem_set): + problem_path = os.path.join(saving_folder, problem["problem_id"] + ".json") + + # 1. 
if problem already solved, continue + if int(problem["problem_id"]) in done_problems: + problem = json.load(open(problem_path, "r")) + aggre_correct += problem["is_correct"] + correct_counts += problem["is_correct"] + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']} $ {round(correct_counts / (count + 1), 4)} (loaded from previous run)", + verbose=True, + ) + continue + results = fewshot_solve(engine, problem, category_prompt, max_tokens=None) + if results is None: + break + metrics = eval_math_responses([results["response_with_ans"]], problem["solution"]) + aggre_correct += metrics["success_vote"] + correct_counts += metrics["success_vote"] + + problem.update( + { + "is_correct": bool(metrics["success_vote"]), + "correct_ans": get_answer(problem["solution"]), + "voted_answer": get_answer(metrics["voted_answer"]), + "response": results["response_with_ans"], + "usage": results["usage"], + } + ) + write_json(problem, problem_path) + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']}", + verbose=True, + ) + if args.dry_run: + break + logger.log( + f"{problem_set[0]['type']} acc: {correct_counts}/{len(problem_set)}= {round(correct_counts / len(problem_set), 4)}", + ) + logger.log("-----------------------------------") + if args.dry_run: + print("------------------------------------") + # os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + logger.log( + f"Total accuracy: {aggre_correct}/{(len(problem_sets) * len(problem_sets[0]))}={round(aggre_correct / (len(problem_sets) * len(problem_sets[0])), 4)}", + ) + logger.log("****************************\n\n\n\n") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) diff --git a/flaml/autogen/math/baseline_zeroshot.py b/flaml/autogen/math/baseline_zeroshot.py new file mode 100644 index 0000000000..dfae8a1699 --- /dev/null +++ b/flaml/autogen/math/baseline_zeroshot.py @@ -0,0 +1,219 @@ +# adapted from https://github.com/wenhuchen/Program-of-Thoughts/blob/main/run_gsm8k_zs.py +import openai +from time import sleep +from tool import synthesize_program +from collections import Counter +from datetime import datetime +from tqdm import tqdm +import os +import json +import argparse +from flaml import oai +import datasets + +# Caution: distinguish between the two types imports +from flaml.autogen.math_utils import eval_math_responses, get_answer +from utils import ( + load_level5_math_test_each_category, + math_type_mapping, + write_json, + remove_asy_sections, + mylogger, + load_fixed, +) + + +parser = argparse.ArgumentParser() +# parser.add_argument("--key", default='OPENAI_KEY', type=str) +parser.add_argument("--dry_run", default=False, action="store_true") +parser.add_argument("--folder", "-f", dest="folder", help="saving folder", default="./zeroshot", type=str) +parser.add_argument("--cache_folder", "-c", dest="cache_folder", default=".cache/zeroshot", help="cache folder") +parser.add_argument("--samples_per_category", "-s", help="samples per category", default=20, type=int) +parser.add_argument("--categories", dest="categories", help="categories", default=[0, 1], nargs="+") +parser.add_argument("--temperature", "-t", dest="temperature", help="temperature", default=1, type=float) +parser.add_argument("--seed", dest="seed", help="seed", default=41, type=int) +parser.add_argument("--select", action="store_true") + +args = parser.parse_args() +args.folder = args.folder + "_baseline_zeroshot_t" + str(args.temperature) + 
"_seed" + str(args.seed) + +# key = os.getenv(args.key) +# print(key) + + +def zeroshot_solve(model, problem, max_tokens=None): + full_prompt = """Solve a math problem carefully. Put the final answer in \\boxed{}.\n\nProblem: """ + full_prompt += remove_asy_sections(problem["problem"]) + + with open(os.path.join(args.folder, "prompt.txt"), "w") as f: + f.write(full_prompt) + if args.dry_run: + print(full_prompt) + print("=======================") + return + + config = { + "model": model, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": full_prompt}, + ], + "n": 1, + # 'temperature': args.temperature, + } + if max_tokens is not None: + config["max_tokens"] = max_tokens + + if config_list is not None: + raw_responses = oai.ChatCompletion.create( + config_list=config_list, + **config, + ) + else: + raw_responses = oai.ChatCompletion.create(None, **config) + # raw_responses = oai.ChatCompletion.create(config_list=config_list, **config) + responses = oai.ChatCompletion.extract_text(raw_responses) + + try: + oai.ChatCompletion.cost(raw_responses) + except TypeError: + oai.ChatCompletion.cost("gpt-4", raw_responses) + return { + "usage": raw_responses["usage"], + "response_with_ans": responses[0], + } + + +if __name__ == "__main__": + config_list = None + + try: + openai.api_key = open("key.txt").read().strip() + print(openai.api_key) + except Exception: + from azure.identity import DefaultAzureCredential + + SCOPE = "https://ml.azure.com" + credential = DefaultAzureCredential() + token = credential.get_token(SCOPE).token + headers = { + "azureml-model-deployment": "gpt4", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + **json.load(open("headers.json")), + } + config_list = [ + { + "api_key": open("key.txt").read().strip(), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + }, + { + "api_key": open("key_flaml.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_flaml.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + { + "api_key": open("key_aoai.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_aoai.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + # { + # "api_key": open("key_gcr.txt").read().strip(), + # "api_type": "azure", + # "api_base": open("base_gcr.txt").read().strip(), + # "api_version": "2023-03-15-preview", + # }, + # { + # "api_key": "nokey", + # "headers": headers, + # "api_base": open("base_azure.txt").read().strip(), + # }, + ] + problem_sets = load_level5_math_test_each_category( + samples_per_category=args.samples_per_category, category_to_load=args.categories + ) + if args.select: + problem_sets = load_fixed() + # print("hhh") + + selected_samples = { + "Algebra": [108], # [8] wrong, # 8 correct + } + + oai.ChatCompletion.request_timeout = 60 * 10 # 10 minutes + oai.ChatCompletion.set_cache(seed=args.seed, cache_path_root=args.cache_folder) + + os.makedirs(args.folder, exist_ok=True) + logger = mylogger(os.path.join(args.folder, "log.txt")) + + engine = "gpt-4" + aggre_correct = 0 + + logger.log("problem id: is_correct $ ans $ correct_ans $ accum_acc", verbose=True) + + for problem_set in problem_sets: # one problem_set is one category + for i in range(len(problem_set)): + problem_set[i]["problem_id"] = str(i) # assign problem id + if args.select: + if problem_set[0]["type"] in selected_samples and len(selected_samples[problem_set[0]["type"]]) > 0: + problem_set = [problem_set[i] for i in 
selected_samples[problem_set[0]["type"]]] + print(problem_set[0]["type"], selected_samples[problem_set[0]["type"]]) + else: + continue + logger.log("Solving " + problem_set[0]["type"], verbose=True) + saving_folder = os.path.join(args.folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + done_problems = set([int(f.split(".")[0]) for f in os.listdir(saving_folder) if "json" in f]) + + correct_counts = 0 + for count, problem in enumerate(problem_set): + problem_path = os.path.join(saving_folder, problem["problem_id"] + ".json") + + # 1. if problem already solved, continue + if int(problem["problem_id"]) in done_problems: + problem = json.load(open(problem_path, "r")) + aggre_correct += problem["is_correct"] + correct_counts += problem["is_correct"] + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']} $ {round(correct_counts / (count + 1), 4)} (loaded from previous run)", + verbose=True, + ) + continue + + results = zeroshot_solve(engine, problem) + if results is None: + break + metrics = eval_math_responses([results["response_with_ans"]], problem["solution"]) + aggre_correct += metrics["success_vote"] + correct_counts += metrics["success_vote"] + + problem.update( + { + "usage": results["usage"], + "is_correct": bool(metrics["success_vote"]), + "correct_ans": get_answer(problem["solution"]), + "voted_answer": get_answer(metrics["voted_answer"]), + "response": results["response_with_ans"], + } + ) + write_json(problem, problem_path) + logger.log( + f"{count}: {problem['is_correct']} $ {problem['voted_answer']} $ {problem['correct_ans']}", + verbose=True, + ) + logger.log( + f"{problem_set[0]['type']} acc: {correct_counts}/{len(problem_set)}= {round(correct_counts / len(problem_set), 4)}", + ) + logger.log("-----------------------------------") + if args.dry_run: + break + # os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + logger.log( + f"Total accuracy: {aggre_correct}/{(len(problem_sets) * len(problem_sets[0]))}={round(aggre_correct / (len(problem_sets) * len(problem_sets[0])), 4)}", + ) + logger.log("****************************\n\n\n\n") + os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) diff --git a/flaml/autogen/math/main.py b/flaml/autogen/math/main.py new file mode 100644 index 0000000000..a5314e0230 --- /dev/null +++ b/flaml/autogen/math/main.py @@ -0,0 +1,61 @@ +import os +from flaml.autogen.math.pseudo_main import pseudo_main +from flaml import oai +import json +import openai + + +def main(): + pseudo_main(config_list) + + +if __name__ == "__main__": + config_list = None + try: + openai.api_key = open("key.txt").read().strip() + print(openai.api_key) + except Exception: + from azure.identity import DefaultAzureCredential + + SCOPE = "https://ml.azure.com" + credential = DefaultAzureCredential() + token = credential.get_token(SCOPE).token + headers = { + "azureml-model-deployment": "gpt4", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + **json.load(open("headers.json")), + } + config_list = [ + { + "api_key": open("key.txt").read().strip(), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + }, + { + "api_key": open("key_flaml.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_flaml.txt").read().strip(), + "api_version": "2023-03-15-preview", + }, + { + "api_key": open("key_aoai.txt").read().strip(), + "api_type": "azure", + "api_base": open("base_aoai.txt").read().strip(), + "api_version": 
"2023-03-15-preview", + }, + # { + # "api_key": open("key_gcr.txt").read().strip(), + # "api_type": "azure", + # "api_base": open("base_gcr.txt").read().strip(), + # "api_version": "2023-03-15-preview", + # }, + # { + # "headers": headers, + # "api_base": open("base_azure.txt").read().strip(), + # }, + ] + + # os.environ["WOLFRAM_ALPHA_APPID"] = open("wolfram.txt").read().strip() + oai.retry_timeout = 3600 + main() diff --git a/flaml/autogen/math/math_chat.py b/flaml/autogen/math/math_chat.py new file mode 100644 index 0000000000..cd98a7307c --- /dev/null +++ b/flaml/autogen/math/math_chat.py @@ -0,0 +1,313 @@ +from flaml.autogen.math.user_proxy_agent import UserProxyAgent +from flaml.autogen.math_utils import eval_math_responses, get_answer +from flaml import oai +import os +import json +import re +import copy +from openai.error import InvalidRequestError, RateLimitError, Timeout +from utils import write_json, remove_asy_sections, math_type_mapping, mylogger +from prompts import PROMPTS + + +class MathChat: + def __init__( + self, + model, + prompt_type="select", + prompt_location="user", + sys_type="s0", + max_round=10, + max_invalid_q_per_step=3, + n=1, + temperature=1, + logger=None, + use_cache=True, + refine=False, + config_list=None, + ): + self.max_round = max_round + if prompt_type not in PROMPTS: + raise ValueError(f"Tool {prompt_type} not supported, choose from {PROMPTS.keys()}") + + self.prompt_type = prompt_type + self.prompt_loaction = prompt_location + self.prompt = PROMPTS[prompt_type] + self.refine = refine + + # if the prompt_location is set to system, then the prompt is put in the system message + self.sys_type = sys_type + sys_choices = { + "s0": "You are a helpful assistant.", + } + messages = ( + [{"role": "system", "content": self.prompt}] + if prompt_location == "system" + else [ + # {"role": "system", "content": "You are a helpful assistant."} # vanilla system message + { + "role": "system", + "content": sys_choices[sys_type], + } + ] + ) + self.deafult_config = { + "model": model, + "messages": messages, + "n": n, # n should be 1 for now + "temperature": temperature, + } + + self.max_invalid_q_per_step = max_invalid_q_per_step + self.use_cache = use_cache + self.logger = logger + self.config_list = config_list + + def make_conversation(self, problem, n=1, file_to_be_saved=None): + # initialize the query handler + proxyagent = UserProxyAgent() + + # initialize the conversation + config = copy.deepcopy(self.deafult_config) + problem_prompt = { + "role": "user", + "content": self.prompt + "\nProblem: " + remove_asy_sections(problem["problem"]), + } # put prompt in user message + + # if the prompt_location is set to system, then the prompt is already put in the system message in __init__, + # then we only need to put the problem in the user message + if self.prompt_loaction == "system": + problem_prompt = {"role": "user", "content": remove_asy_sections(problem["problem"])} + config["messages"].append(problem_prompt) + + # save a readable conversation in txt file + def save_message_to_file(message): + if file_to_be_saved is not None: + with open(file_to_be_saved, "a") as f: + f.write(message) + f.flush() + + seperate_line = "\n" + "-" * 40 + "\n" + save_message_to_file(f'Problem: {self.str_splitter(problem["problem"])}\n {seperate_line}') + + # for additional refine process + is_refine_process = False + response_with_new_ans = "" # save the corrected answer + + # init parameters + is_valid_reply = False # only valid when detect \box + invalid_q = 0 # for query + 
        total_cost = 0
+        response_with_ans = ""  # save the response with \box to get the answer
+        rr = 0  # round
+        total_completion_tokens = 0
+        while rr < self.max_round:
+            # 1. get the response from the assistant, handle exceptions
+            try:
+                if self.config_list is not None:
+                    raw_responses = oai.ChatCompletion.create(
+                        config_list=self.config_list, **config, use_cache=self.use_cache
+                    )
+                else:
+                    raw_responses = oai.ChatCompletion.create(None, **config, use_cache=self.use_cache)
+            except InvalidRequestError as e:
+                print(problem["type"], problem["problem_id"], str(e), flush=True)
+                save_message_to_file(str(e))
+                break
+            except (RateLimitError, Timeout):
+                print("Ratelimit or timeout, retrying...", flush=True)
+                continue
+            try:
+                total_completion_tokens += raw_responses["usage"]["completion_tokens"]
+            except Exception:
+                pass
+            if raw_responses["usage"]["total_tokens"] >= 8000:
+                error_str = "Used more than 8000 tokens, breaking."
+                print(error_str)
+                save_message_to_file(error_str)
+                break
+
+            assert raw_responses != -1, "Error in getting response"
+            responses = oai.ChatCompletion.extract_text(raw_responses)
+            assert len(responses) == 1, "More than one response"  # right now we only use one response
+
+            # 2. process response
+            save_message_to_file(f"assistant: {self.str_splitter(responses[0])}{seperate_line}")
+            # token_used = raw_responses['usage']['total_tokens']
+            try:
+                total_cost += oai.ChatCompletion.cost(raw_responses)
+            except TypeError:
+                total_cost += oai.ChatCompletion.cost(self.deafult_config["model"], raw_responses)
+            config["messages"].append({"role": "assistant", "content": responses[0]})
+            tmp_msg = ""
+
+            if get_answer(responses[0]) is not None and get_answer(responses[0]) != "":
+                tmp_msg, is_query_exist = proxyagent.check_queries(responses[0])
+                if not is_query_exist:
+                    # if the assistant gives a valid reply and no more queries, stop the conversation
+                    is_valid_reply = True
+                    if not self.refine:  # if not refine, stop the conversation
+                        response_with_ans = responses[0]
+                        response_with_new_ans = responses[0]
+                        break
+                    elif not is_refine_process:  # if refine, start the refine process
+                        response_with_ans = responses[0]
+                        is_refine_process = True
+                        refine_message = "Please check your answer to make sure it meets the conditions in the problem and that you didn't make any mistakes. If you find any mistake, please correct it and put the corrected answer in the box. If you find no mistake, put the previous answer in the box."
+                        config["messages"].append({"role": "user", "content": refine_message})
+                        save_message_to_file(
+                            "user: {a}{s}".format(a=config["messages"][-1]["content"], s=seperate_line)
+                        )
+                        continue
+                    else:  # if already in the refine process, then stop the conversation
+                        response_with_new_ans = responses[0]
+                        break
+
+            # 3. handle the response and get the query
+            query_response, is_query_sucess = proxyagent.handle_query(responses[0])
+            if len(query_response) > 2000:
+                # prevent long response by string length, 2000 chars -> around 500-1000 tokens
+                save_message_to_file(f"****: Replacing {query_response} ****\n")
+                query_response = "Your requested query response is too long. You might have made a mistake. Please revise your reasoning and query."
+                is_query_sucess = False
+
+            # fall back to a nudge only when the proxy returned nothing to run
+            if not query_response:
+                query_response = 'Continue. (If you think the problem is finished, please reply "[EOF]")'
+            if is_query_sucess:
+                query_response += tmp_msg  # add the query response from the previous step
+            config["messages"].append({"role": "user", "content": query_response})
+
+            invalid_q = 0 if is_query_sucess else invalid_q + 1
+            if invalid_q >= self.max_invalid_q_per_step:
+                assert config["messages"][-1]["role"] == "user", "The last message should be from user"
+                skip_query_str = "Please revisit the problem statement and your reasoning. If you think this step is correct, solve it yourself and continue the next step. Otherwise, correct this step."
+                config["messages"][-1]["content"] = skip_query_str
+                save_message_to_file(f"****: Replacing {query_response}****\n")
+                invalid_q = 0
+
+            save_message_to_file("user: {a}{s}".format(a=config["messages"][-1]["content"], s=seperate_line))
+            if "Continue" in query_response:
+                rr -= 0.5
+            rr += 1
+        save_message_to_file("Solution: " + problem["solution"])
+
+        return {
+            "total_completion_tokens": total_completion_tokens,
+            "valid_q_count": proxyagent.valid_q_count,  # number of valid queries
+            "total_q_count": proxyagent.total_q_count,
+            "is_valid_reply": is_valid_reply,  # whether the assistant can give a valid reply
+            "response_with_ans": response_with_ans,  # string instead of list
+            "response_with_new_ans": response_with_new_ans,  # string instead of list
+            "messages": config["messages"],
+            "round": min(rr + 1, self.max_round),
+            "cost": total_cost,
+        }
+
+    def str_splitter(self, string, length=500):
+        """
+        Add '\n' every 'length' characters to make the output more readable.
+        If at 'length' there is a word, add '\n' before the word.
+
+        Args:
+            string (str): The input string to be processed.
+            length (int): The maximum number of characters in a line before adding a newline.
+
+        Returns:
+            str: The processed string with newlines added.
+        """
+
+        words = string.split(" ")
+        current_line = []
+        current_length = 0
+        result = []
+
+        for word in words:
+            if current_length + len(word) + len(current_line) > length:
+                result.append(" ".join(current_line))
+                current_line = []
+                current_length = 0
+
+            current_line.append(word)
+            current_length += len(word)
+
+        if current_line:
+            result.append(" ".join(current_line))
+
+        return "\n".join(result)
+
+    def solve_one_category(self, problem_set, saving_folder):
+        """
+        Solve all problems in a category.
+        Assumption 1: all problems are of the same type.
+        Assumption 2: if resuming from a previous run, the sequence of problems is the same as in the previous run (same shuffling seed).
+
+        Args:
+            problem_set (list): a list of problems
+            saving_folder (str): the result folder to save the solved problems; the category folder will be created inside
+
+        Returns:
+            None
+        """
+        if not self.logger:
+            self.logger = mylogger(os.path.join(saving_folder, "log.txt"))
+
+        # assume all problems are of the same type: TODO: ensure this assumption
+        saving_folder = os.path.join(saving_folder, math_type_mapping[problem_set[0]["type"]])
+        # mkdir if not exist
+        os.makedirs(saving_folder, exist_ok=True)
+
+        # from the saving folder load solved problems
+        done_problems = set([int(f.split(".")[0]) for f in os.listdir(saving_folder) if "json" in f])
+
+        correct_counts = 0
+        self.logger.log("id : is_correct $ ans $ correct_ans | corrected_ans $ round")
+        for count, problem in enumerate(problem_set):
+            problem_path = os.path.join(saving_folder, problem["problem_id"] + ".json")
+
+            # 1. if problem already solved, continue
+            if int(problem["problem_id"]) in done_problems:
+                problem = json.load(open(problem_path, "r"))
+                correct_counts += problem["is_correct"]
+                # use .get: runs saved without the refine option have no "new_ans" key
+                new_ans = problem.get("new_ans", "")
+                if new_ans == problem["voted_answer"]:
+                    new_ans = "same"
+                self.logger.log(
+                    f'{problem["problem_id"]} : {bool(problem["is_correct"])} $ {problem["voted_answer"]} $ {problem["correct_ans"]} | {new_ans} $ {problem["round"]} $ (from previous run)'
+                )
+                continue
+
+            # 2. solve the problem
+            result = self.make_conversation(
+                problem, file_to_be_saved=os.path.join(saving_folder, problem["problem_id"] + ".txt")
+            )
+            metrics = eval_math_responses([result["response_with_ans"]], problem["solution"])
+
+            # 3. save the result
+            correct_ans = get_answer(problem["solution"])
+            problem.update(
+                {
+                    "is_valid_reply": result["is_valid_reply"],
+                    "is_correct": bool(metrics["success_vote"]),
+                    "correct_ans": correct_ans,
+                    "voted_answer": get_answer(metrics["voted_answer"]),
+                    "new_ans": get_answer(result["response_with_new_ans"]),
+                    "round": result["round"],
+                    "valid_q_count": result["valid_q_count"],  # total number of valid queries
+                    "total_q_count": result["total_q_count"],  # total number of queries
+                    "cost": result["cost"],  # total cost of the conversation
+                    "messages": result["messages"],  # the conversation
+                    "total_completion_tokens": result["total_completion_tokens"],
+                }
+            )
+            write_json(problem, problem_path)
+            if problem["new_ans"] == problem["voted_answer"]:
+                problem["new_ans"] = "same"
+
+            # 4. continue to next problem
+            correct_counts += problem["is_correct"]
+            self.logger.log(
+                f'{problem["problem_id"]} : {bool(problem["is_correct"])} $ {problem["voted_answer"]} $ {problem["correct_ans"]} | {problem["new_ans"]} $ {problem["round"]} $'
+            )
+
+        tp = problem_set[0]["type"]
+        self.logger.log(f"{tp} Accuracy: {correct_counts}/{len(problem_set)} = {correct_counts/len(problem_set)}")
+        self.logger.log("------------------------------------------------------------\n", verbose=True)
diff --git a/flaml/autogen/math/math_voting.py b/flaml/autogen/math/math_voting.py
new file mode 100644
index 0000000000..ea45fa8d92
--- /dev/null
+++ b/flaml/autogen/math/math_voting.py
@@ -0,0 +1,149 @@
+from flaml.autogen.math.user_proxy_agent import UserProxyAgent
+from flaml.autogen.math_utils import eval_math_responses, get_answer
+from flaml.autogen.math.math_chat import write_json, remove_asy_sections, math_type_mapping
+from flaml import oai
+import os
+import json
+import re
+import copy
+from flaml.autogen.math.math_chat import MathChat
+from functools import partial
+
+
+def vanilla_solving(model, problem, n, max_tokens=None):
+    """Solving a problem directly."""
+    config = {
+        "model": model,
+        "n": n,
+        "prompt": "{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{{}}.",
Put the final answer in \\boxed{{}}.", + } + if max_tokens is not None: + config["max_tokens"] = max_tokens + context = { + "problem": problem["problem"], + } + raw_responses = oai.ChatCompletion.create(context, **config, use_cache=True) + + prompt_price = ( + oai.ChatCompletion.price1K[model][0] + if type(oai.ChatCompletion.price1K[model]) == tuple + else oai.ChatCompletion.price1K[model] + ) + return { + "responses": oai.ChatCompletion.extract_text(raw_responses), + "cost": oai.ChatCompletion.cost(model, raw_responses), + "prompt_cost": prompt_price * raw_responses["usage"]["prompt_tokens"] / 1000, + } + + +def vanilla_voting_one_category(model, problem_set, saving_folder, n=10, n_per_time=3): + """Solve one category of problems directly.""" + selfconsistency = SelfConsistency(n=n, n_per_time=n_per_time) + saving_folder = os.path.join(saving_folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + for problem in problem_set: + responses = selfconsistency.sequential_reasoning_path_sampling( + problem=problem, + saving_folder=saving_folder, + solving=partial(vanilla_solving, model=model, max_tokens=None), + ) + results = selfconsistency.vanilla_voting(responses["responses"], problem["solution"]) + print(results["success_vote"], results["votes"]) + + +def tool_voting_one_category(model, problem_set, saving_folder, n=2, n_per_time=1): + selfconsistency = SelfConsistency(n=n, n_per_time=n_per_time) + toolsolver = MathChat(model="gpt-4", tool="both", max_round=10) + + saving_folder = os.path.join(saving_folder, math_type_mapping[problem_set[0]["type"]]) + os.makedirs(saving_folder, exist_ok=True) + for problem in problem_set: + responses = selfconsistency.sequential_reasoning_path_sampling( + problem=problem, + saving_folder=saving_folder, + solving=toolsolver.make_conversation, + ) + results = selfconsistency.vanilla_voting(responses["responses"], problem["solution"]) + print(results["success_vote"], results["votes"]) + + +class SelfConsistency: + def __init__(self, n=10, n_per_time=5, cache_folder=".cache"): + self.n = n + self.n_per_time = n_per_time + self.start_seed = 41 + self.cache_folder = cache_folder + + def vanilla_voting(self, accum_responses, solution): + if type(accum_responses[0]) == dict: + accum_responses = [r["response_with_ans"] for r in accum_responses] + return eval_math_responses(accum_responses, solution) + + def early_stop_voting(self, accum_responses): + if type(accum_responses[0]) == dict: + accum_responses = [r["response_with_ans"] for r in accum_responses] + pass + + def sequential_reasoning_path_sampling(self, problem, saving_folder, solving): + """ + + Args: + problem (dict): problem dict + saving_folder (str): saving folder + solver (function): solver function, either MathSolver.make_conversation or vanilla prompt + + return from vanilla prompt: { + 'responses': responses, + 'cost': oai.ChatCompletion.cost(model, raw_responses), + 'prompt_cost': oai.ChatCompletion.price1K(model, 0) * raw_responses["usage"]["prompt_tokens"] / 1000 + } + + return from math solver: { + 'valid_q_count' : query_handler.valid_q_count, # number of valid queries + 'total_q_count' : query_handler.total_q_count, + 'is_valid_reply': is_valid_reply, # whether the assistant can give a valid reply + 'response_with_ans': response_with_ans, + 'ans': ans, + 'messages': config['messages'], + 'round' : len(config['messages'])//2 + 1, + 'cost' : total_cost, + } + + """ + accum_responses = [] # can be a list of dicts (for mathsolver) or list of strings + 
+        accum_cost = 0
+        file = os.path.join(saving_folder, "responses_" + problem["problem_id"] + ".json")
+        if os.path.exists(file):
+            accum_responses = json.load(open(file, "r"))["responses"]
+            accum_cost = json.load(open(file, "r"))["cost"]
+
+        query_count = len(accum_responses)
+        tmp_n = self.n_per_time
+        while query_count < self.n:
+            oai.ChatCompletion.set_cache(seed=self.start_seed + query_count, cache_path_root=self.cache_folder)
+            # cap the batch size by the remaining budget so no more than n paths are sampled in total
+            tmp_n = min(tmp_n, self.n - query_count)
+
+            responses = solving(problem=problem, n=tmp_n)
+
+            if "responses" in responses.keys():
+                accum_responses.extend(responses["responses"])
+                if query_count != 0:
+                    accum_cost -= responses["prompt_cost"]  # if not the first round, deduct the prompt cost
+            else:  # the response comes from the math solver, a single response
+                accum_responses.extend([responses])
+
+            accum_cost += responses["cost"]
+            write_json(
+                {
+                    "cost": accum_cost,
+                    "true_ans": get_answer(problem["solution"]),
+                    "answers": [get_answer(r) for r in accum_responses],
+                    "responses": accum_responses,
+                },
+                file,
+            )  # save the responses each time
+
+            query_count += tmp_n
+
+        # TODO: cost calculation: should the prompt for each round be counted?
+        return {"responses": accum_responses, "cost": accum_cost}
diff --git a/flaml/autogen/math/mathchat.png b/flaml/autogen/math/mathchat.png
new file mode 100644
index 0000000000..19b41ead78
Binary files /dev/null and b/flaml/autogen/math/mathchat.png differ
diff --git a/flaml/autogen/math/prompts.py b/flaml/autogen/math/prompts.py
new file mode 100644
index 0000000000..58b8a6e067
--- /dev/null
+++ b/flaml/autogen/math/prompts.py
@@ -0,0 +1,71 @@
+PROMPTS = {
+    # default
+    "default": """Let's use Python to solve a math problem.
+
+Query requirements:
+You should always use the 'print' function for the output and use fractions/radical forms instead of decimals.
+You can use packages like sympy to help you.
+You must follow the formats below to write your code:
+```python
+# your code
+```
+
+First state the key idea to solve the problem. You may choose from three ways to solve the problem:
+Case 1: If the problem can be solved with Python code directly, please write a program to solve it. You can enumerate all possible arrangements if needed.
+Case 2: If the problem is mostly reasoning, you can solve it by yourself directly.
+Case 3: If the problem cannot be handled in the above two ways, please follow this process:
+1. Solve the problem step by step (do not over-divide the steps).
+2. Take out any queries that can be asked through Python (for example, any calculations or equations that can be calculated).
+3. Wait for me to give the results.
+4. Continue if you think the result is correct. If the result is invalid or unexpected, please correct your query or reasoning.
+
+After all the queries are run and you get the answer, put the answer in \\boxed{}.
+
+""",
+    # select python or wolfram
+    "two_tools": """Let's use two tools (Python and Wolfram Alpha) to solve a math problem.
+
+Query requirements:
+You must follow the formats below to write your code:
+For Wolfram Alpha:
+```wolfram
+# your wolfram query
+```
+For Python:
+```python
+# your code
+```
+When using Python, you should always use the 'print' function for the output and use fractions/radical forms instead of decimals.
+You can use packages like sympy to help you.
+
+
+Please follow this process:
+1. Solve the problem step by step (do not over-divide the steps).
+2.
Take out any queries that can be asked through Python or Wolfram Alpha and select the most suitable tool to be used (for example, any calculations or equations that can be calculated). +3. Wait for me to give the results. +4. Continue if you think the result is correct. If the result is invalid or unexpected, please correct your query or reasoning. + +After all the queries are run and you get the answer, put the final answer in \\boxed{}. + +""", + # use python step by step + "python": """Let's use Python to solve a math problem. + +Query requirements: +You should always use the 'print' function for the output and use fractions/radical forms instead of decimals. +You can use packages like sympy to help you. +You must follow the formats below to write your code: +```python +# your code +``` + +Please follow this process: +1. Solve the problem step by step (do not over-divide the steps). +2. Take out any queries that can be asked through Python (for example, any calculations or equations that can be calculated). +3. Wait for me to give the results. +4. Continue if you think the result is correct. If the result is invalid or unexpected, please correct your query or reasoning. + +After all the queries are run and you get the answer, put the answer in \\boxed{}. + +""", +} diff --git a/flaml/autogen/math/pseudo_main.py b/flaml/autogen/math/pseudo_main.py new file mode 100644 index 0000000000..289d06d54e --- /dev/null +++ b/flaml/autogen/math/pseudo_main.py @@ -0,0 +1,118 @@ +import os +from flaml import oai +from math_voting import SelfConsistency +from flaml.autogen.math.math_chat import MathChat +import argparse +from utils import mylogger, load_level5_math_test_each_category, load_fixed, random_sample_MATH + + +def parse_args(): + parser = argparse.ArgumentParser(description="MathChat") + parser.add_argument("--prompt_type", "-ptype", dest="prompt_type", help="prompt type", default="default", type=str) + parser.add_argument("--prompt_location", dest="prompt_location", help="prompt location", default="user", type=str) + parser.add_argument("--max_round", dest="max_round", help="max round", default=15, type=int) + parser.add_argument("--folder", "-f", dest="folder", help="saving folder", default="./mathchat", type=str) + parser.add_argument("--cache_folder", "-c", dest="cache_folder", default=".cache", help="cache folder") + parser.add_argument("--samples_per_category", help="samples per category", default=20, type=int) + parser.add_argument("--temperature", "-t", dest="temperature", help="temperature", default=1, type=float) + parser.add_argument("--test_run", help="test run", action="store_true") + parser.add_argument("--categories", dest="categories", help="categories", default=[0, 1], nargs="+") + parser.add_argument("--seed", dest="seed", help="seed", default=41, type=int) + parser.add_argument("--select", action="store_true") + parser.add_argument("--refine", action="store_true") + parser.add_argument("--sample_all", help="samples per category", default=0, type=int) + parser.add_argument("-systype", dest="systype", help="system type", default="s0", type=str) + + # not used + parser.add_argument("--n", dest="n", help="number of samples", default=1, type=int) + parser.add_argument("--voting", action="store_true") + args = parser.parse_args() + args.folder = ( + args.folder + args.systype + "_" + args.prompt_location + "_" + args.prompt_type + "_t" + str(args.temperature) + ) + if args.seed != 41: + args.folder = args.folder + "_seed" + str(args.seed) + if args.refine: + args.folder = 
args.folder.replace("_t" + str(args.temperature), "_refine_t" + str(args.temperature)) + if args.sample_all != 0: + args.folder += "_random_sample" + os.makedirs(args.folder, exist_ok=True) + return args + + +def pseudo_main(config_list): + # 1. args, settings and logger + args = parse_args() + args.model = "gpt-4" + oai.ChatCompletion.request_timeout = 60 * 10 # 10 minutes + oai.ChatCompletion.set_cache(seed=args.seed, cache_path_root=args.cache_folder) + logger = mylogger(os.path.join(args.folder, "log.txt")) + + # 2. load math dataset + problem_sets = load_level5_math_test_each_category( + samples_per_category=args.samples_per_category, category_to_load=args.categories + ) + if args.test_run: + problem_sets = load_level5_math_test_each_category(samples_per_category=1, category_to_load=args.categories) + logger.log("Take out 1 problem from each category for test run.") + + if args.select: + problem_sets = load_fixed() + + if args.sample_all != 0: + problem_sets = random_sample_MATH(args.sample_all) + + print(f"Running {args.folder}") + + selected_samples = { + # "Algebra": [9,13,14], # [8] wrong, # 8 correct + # "Algebra": [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], # [8] wrong, # 8 correct + # "Algebra": [1,2,4,13], + # "Algebra": [18], # [1, 8] wrong, 9-10 out of 10 correct + # "Algebra": [2, 5, 13], + # "Counting & Probability": [0,1,8,9], # 0,10, | 5 correct [2,3,16,18,19], 6 [4,5,13,14,15,17] wrong + # "Geometry": [], + # "Algebra": [i for i in range(20)], + # "Counting & Probability": [i for i in range(20)], + "Intermediate Algebra": [0, 3, 6, 8, 9, 10, 11, 13, 15, 16, 17], + # "Number Theory": [i for i in range(20)], + # "Prealgebra": [i for i in range(20)], + "Precalculus": [1, 14, 15, 18], + # "Number Theory": [0, 2, 4,6,7,8,10,11,12,13,14,15,17,18], # [3] always wrong [1, 5, 9, 16, 19] always right + # "Prealgebra": [3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 17], # [0,7,16] always wrong, [1,2,5,6,10,18,19] always right + } + + # 3. 
solve + if not args.voting: + solver = MathChat( + config_list=config_list, + model=args.model, + prompt_type=args.prompt_type, + sys_type=args.systype, + max_round=args.max_round, + temperature=args.temperature, + prompt_location=args.prompt_location, + logger=logger, + refine=args.refine, + ) + with open(os.path.join(args.folder, "prompt.txt"), "w") as f: + f.write(solver.prompt) + + for problem_set in problem_sets: + for i in range(len(problem_set)): + problem_set[i]["problem_id"] = str(i) # assign problem id + if args.select: + if problem_set[0]["type"] in selected_samples and len(selected_samples[problem_set[0]["type"]]) > 0: + problem_set = [problem_set[i] for i in selected_samples[problem_set[0]["type"]]] + print(problem_set[0]["type"], selected_samples[problem_set[0]["type"]]) + else: + continue + solver.solve_one_category(problem_set, saving_folder=args.folder) + # os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + logger.log("*******************************************************************************\n\n\n", verbose=False) + # os.system("tar -czf " + args.folder + ".tar.gz " + args.folder) + + else: + logger.log("Voting is not supported yet.") + pass diff --git a/flaml/autogen/math/tool.py b/flaml/autogen/math/tool.py new file mode 100644 index 0000000000..87343a7aa7 --- /dev/null +++ b/flaml/autogen/math/tool.py @@ -0,0 +1,196 @@ +from typing import Union, Any +from math import isclose +import func_timeout +from sympy.solvers import solve +from sympy import Symbol, Eq +import math +from sympy import simplify +import numpy as np +import cvxpy as cp +import statistics + + +def get_precision(gt_ans: float) -> int: + precision = 5 + if "." in str(gt_ans): + precision = len(str(gt_ans).split(".")[-1]) + return precision + + +def finqa_equal( + prediction: Union[bool, float, str], + reference: Union[float, str], + include_percentage: bool = False, + is_close: float = False, +) -> bool: + if prediction is None: + return False + elif type(prediction) == bool: + # bool questions + if prediction: + return reference == "yes" + else: + return reference == "no" + elif type(reference) == str or type(prediction) == str: + # string questions + return prediction == reference + else: + # number questions + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + for item in gt_result: + try: + if is_close: + if isclose(item, prediction, rel_tol=0.001): + return True + precision = min(get_precision(prediction), get_precision(item)) + if round(prediction, precision) == round(item, precision): + return True + except Exception: + continue + return False + + +def simplify_ans(ans, convert_to_str: bool = True): + if "relational" in str(type(ans)): + return str(ans) + elif "numpy" in str(type(ans)): + if ans.shape == (): + # scalar value + ans = round(float(ans), 2) + else: + # array value + ans = round(float(ans[0]), 2) + if convert_to_str: + return str(ans) + else: + return ans + elif not ans: + return None + else: + if type(ans) in [list, tuple]: + if "sympy" in str(type(ans[0])): + try: + ans = [round(float(x), 2) for x in ans] + except Exception: + ans = [str(x) for x in ans] + if len(ans) == 1: + ans = ans[0] + else: + if "sympy" in str(type(ans)): + try: + ans = round(float(ans), 2) + except Exception: + ans = str(ans) + if convert_to_str: + return str(ans) + else: + return ans + + +def floatify_ans(ans): + if ans is None: + return None + elif type(ans) == dict: + ans = list(ans.values())[0] + elif type(ans) == 
bool: + ans = ans + elif type(ans) in [list, tuple]: + if not ans: + return None + else: + try: + ans = float(ans[0]) + except Exception: + ans = str(ans[0]) + else: + try: + ans = float(ans) + except Exception: + ans = str(ans) + return ans + + +def parse_api_result(result): + to_return = [] + for idx, g in enumerate(result["choices"]): + text = g["text"] + logprob = sum(g["logprobs"]["token_logprobs"]) + to_return.append((text, logprob)) + to_return = sorted(to_return, key=lambda tup: tup[1], reverse=True) + to_return = [r[0] for r in to_return] + return to_return + + +def solve_it(equation, variable): + solution = solve(equation, variable, dict=True) + if not solution: + if isinstance(variable, list): + solution = {v: None for v in variable} + else: + solution = {variable: None} + return solution + else: + solution = solution[0] + return solution + + +def safe_execute(code_string: str, keys=None): + def execute(x): + try: + exec(x) + locals_ = locals() + if keys is None: + return locals_.get("ans", None) + else: + return [locals_.get(k, None) for k in keys] + except Exception: + return None + + try: + ans = func_timeout.func_timeout(5, execute, args=(code_string,)) + except func_timeout.FunctionTimedOut: + ans = None + + return ans + + +def synthesize_program(result: str, prefix: str) -> str: + # program = prefix + program = """ +import math +import numpy as np +import sympy as sp # added + +def solver(): +""" + + for i, line in enumerate(result.split("\n")): + if line == "": + continue + if i == 0: + program += line + "\n" + else: + if line.startswith(" "): + program += line + "\n" + else: + break + program += "print(solver())" + # program += 'ans = solver()' + return program + + +# def synthesize_program(result: str, prefix: str) -> str: +# program = prefix +# for i, line in enumerate(result.split('\n')): +# if line == '': +# continue +# if '\t' or ' ' not in line: +# program += ' ' + line + '\n' +# else: +# program += line + '\n' + +# program += 'ans = solver()' +# return program diff --git a/flaml/autogen/math/user_proxy_agent.py b/flaml/autogen/math/user_proxy_agent.py new file mode 100644 index 0000000000..35f18c06fe --- /dev/null +++ b/flaml/autogen/math/user_proxy_agent.py @@ -0,0 +1,393 @@ +import json +import sys +from io import StringIO +import regex +import os +import re +from pydantic import BaseModel, Field, Extra, root_validator +from typing import Any, Dict, Optional +from flaml.autogen.code_utils import execute_code +from time import sleep + + +class UserProxyAgent: + def __init__(self): + self.previous_code = "import sympy\nfrom sympy import symbols, Eq, solve\nfrom fractions import Fraction\n" + + self.valid_q_count = 0 + self.total_q_count = 0 + + self.last_query = None + self.last_return = None + self.consecutive_continue = 0 + + def check_queries(self, response: str): + """check if there is a query in the response""" + queries = self.extractJSON(response) # extract json queries + if len(queries) == 0: + queries = self.extractCode(response) # extract code queries + if len(queries) == 0: + if ( + ("tool" in response and "query" in response) + or ("python" in response and "wolfram" in response) + or "```" in response + ): + return ( + "\nYour query is invalid and cannot be parsed. (If you already get the answer, put it in \\boxed{}.)", + True, + ) + else: + return "", False + + return ( + "\nAbove is the result to the queries. 
If you get to the final answer, put it in \\boxed{}", + True, + ) + + def handle_query(self, response: str): + """Handle a list of queries and return the output. + Args: + response: string with a list of queries + returns: + output: string with the output of the queries + is_success: boolean indicating whether the queries were successful + """ + queries = self.extractJSON(response) # extract json queries + if len(queries) == 0: + queries = self.extractCode(response) # extract code queries + if len(queries) == 0: + if ( + ("tool" in response and "query" in response) + or ("python" in response and "wolfram" in response) + or "```" in response + ): + return "Your query is invalid and cannot be parsed. Please revise your query format.", False + else: + # self.consecutive_continue += 1 + # if self.consecutive_continue >= 3: + # self.consecutive_continue = 0 + # return "Continue. Please keep solving the problem until you need to query. (If you get to the answer already, put it in \\boxed{}.)", True + return ( + "Continue. Please keep solving the problem until you need to query. (If you get to the answer, put it in \\boxed{}.)", + True, + ) + + self.consecutive_continue = 0 + self.total_q_count += len(queries) + self.valid_q_count += len(queries) + + buffer_out = "" + all_success = True # all queries are successful + for i, query in enumerate(queries): + if "tool" in query: + # old format of query in json format, ignore + if query["tool"] == "python": + output, is_success = self.execute_python_code(query["query"]) + elif query["tool"] == "wolfram": + output, is_success = self.execute_wolfram_query(query["query"]) + else: + output = "Error: Unknown tool" + is_success = False + else: + output = "" + is_success = False + if "python" in query and query["python"] != "": + pyout, pysucess = self.execute_python_code(query["python"]) + output += "python: " + pyout + "\n" + is_success = is_success or pysucess + if "wolfram" in query and query["wolfram"] != "": + wolframout, wolframsuccess = self.execute_wolfram_query(query["wolfram"]) + output += "wolfram: " + wolframout + "\n" + is_success = is_success or wolframsuccess + # add new query handling here + + buffer_out += output + "\n" + if not is_success: + # TODO: handle situation with several queries and one fails + all_success = False + self.valid_q_count -= 1 # invalid query + buffer_out = buffer_out.strip() + if self.last_query == tuple(queries) or self.last_return == buffer_out: + return ( + buffer_out + "\nYour query or result is same from the last, please try a new approach.", + False, + ) + self.last_query = tuple(queries) + self.last_return = buffer_out + return buffer_out, all_success + + def extractCode(self, input_string: str): + """Extract code blocks from message.""" + pattern = r"```(.*?)```" + match = re.findall(pattern, input_string, flags=re.DOTALL) + + queries = [] + for m in match: + if "python" in m: + queries.append({"tool": "python", "query": m.replace("python", "").strip()}) + elif "wolfram" in m: + queries.append({"tool": "wolfram", "query": m.replace("wolfram", "").strip()}) + # add new query handling here + return queries + + def execute_wolfram_query(self, query: str): + """ + Run one wolfram query and return the output. + return: + output: string with the output of the query + is_success: boolean indicating whether the query was successful + """ + # wolfram query handler + wolfram = WolframAlphaAPIWrapper() + output, is_success = wolfram.run(query) + if output == "": + output = "Error: The wolfram query is invalid." 
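+        # Surface an empty Wolfram result as an explicit error string so the assistant
+        # revises its query instead of silently receiving "".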
+ return output, is_success + + # code query handler + def execute_python_code(self, query: str): + """Run one code query and return the output. + params: + query: string with the code query + """ + query = query.replace("; ", "\n").replace(";", "\n") + code = self.previous_code + self.add_print_to_last_line(query) + + # python_repl = PythonREPL() + # output, is_success = python_repl.run(code) + return_code, output = execute_code(code, use_docker=False, timeout=5) + is_success = return_code == 0 + if isinstance(output, bytes): + try: + output = output.decode("ascii") + except Exception: + try: + output = output.decode("utf-8") + except Exception: + is_success = False + output = "The return cannot be decoded." + + if not is_success: + # Remove the file information from the error string + pattern = r'File "/[^"]+\.py", line \d+, in .+\n' + if type(output) == str: + output = re.sub(pattern, "", output) + + if not is_success: + output = "Error: " + output + elif output == "": + if "print" not in query: + output = "No output found. Make sure you print the results." + is_success = False + else: + output = "No output found." + is_success = True + + if len(output) > 2000: + output = "You required too much output. Please print only the necessary output." + is_success = False + + if is_success: + # remove print and check if it still works + tmp = self.previous_code + "\n" + self.remove_print(query) + "\n" + rcode, _ = execute_code(tmp, use_docker=False) + else: + tmp = self.previous_code + "\n" + for line in query.split("\n"): + if "import" in line: + tmp += line + "\n" + rcode, _ = execute_code(tmp, use_docker=False) + if rcode == 0: + self.previous_code = tmp + return output, is_success + + def add_print_to_last_line(self, s): + # first check if there is already a print statement + if "print(" in s: + return s + + # Input a string, extract the last line, enclose it in print() and return the new string + lines = s.splitlines() + last_line = lines[-1] + if " = " in last_line: + last_line = "print(" + last_line.split(" = ")[0] + ")" + lines.append(last_line) + else: + lines[-1] = "print(" + last_line + ")" + + # Join the lines back together + return "\n".join(lines) + + def remove_print(self, s): + # remove all print statements from a string + lines = s.splitlines() + lines = [line for line in lines if "print(" not in line] + return "\n".join(lines) + + def _remove_newlines_outside_quotes(self, s): + """Remove newlines outside of quotes. + + Return from openai: + s = "{\n"tool": "python",\n"query": "print('hello')\nprint('world')"\n}" + + if calling json.loads(s), it will throw an error because of the newline in the query. + So this function removes the newline in the query outside of quotes. + + _remove_newlines_outside_quotes(s) -> "{"tool": "python","query": "print('hello')\nprint('world')"}" + + + params: + s: string to remove newlines from + returns: + string with newlines removed + + Example: + + """ + result = [] + inside_quotes = False + for c in s: + if c == '"': + inside_quotes = not inside_quotes + if not inside_quotes and c == "\n": + continue + if inside_quotes and c == "\n": + c = "\\n" + if inside_quotes and c == "\t": + c = "\\t" + result.append(c) + return "".join(result) + + def extractJSON(self, input_string: str): + """ + Extract JSON queries from a string. 
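+        A valid query is a JSON object that either follows the older {"tool": ..., "query": ...}
+        schema or carries a top-level "python" or "wolfram" key.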
+ params: + input_string: string to extract JSON queries from + returns: + list of JSON queries + """ + input_string = input_string.replace(",\n}", "}") + # bracketed_strings = re.findall(r'\{[\s\S]*?\}', input_string) + bracketed_strings = regex.findall(r"\{(?:[^{}]|(?R))*\}", input_string) + # print(bracketed_strings) + # Extract valid JSON queries + json_queries = [] + for bracketed_string in bracketed_strings: + bracketed_string = self._remove_newlines_outside_quotes(bracketed_string) + try: + data = json.loads(bracketed_string) + if ("tool" in data and "query" in data) or "python" in data or "wolfram" in data: + json_queries.append(data) + except json.JSONDecodeError: + pass + + return json_queries + + +# Imported from langchain +def get_from_dict_or_env(data: Dict[str, Any], key: str, env_key: str, default: Optional[str] = None) -> str: + """Get a value from a dictionary or an environment variable.""" + if key in data and data[key]: + return data[key] + elif env_key in os.environ and os.environ[env_key]: + return os.environ[env_key] + elif default is not None: + return default + else: + raise ValueError( + f"Did not find {key}, please add an environment variable" + f" `{env_key}` which contains it, or pass" + f" `{key}` as a named parameter." + ) + + +# Imported from langchain +class WolframAlphaAPIWrapper(BaseModel): + """Wrapper for Wolfram Alpha. + + Docs for using: + + 1. Go to wolfram alpha and sign up for a developer account + 2. Create an app and get your APP ID + 3. Save your APP ID into WOLFRAM_ALPHA_APPID env variable + 4. pip install wolframalpha + + """ + + wolfram_client: Any #: :meta private: + wolfram_alpha_appid: Optional[str] = None + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key and python package exists in environment.""" + wolfram_alpha_appid = get_from_dict_or_env(values, "wolfram_alpha_appid", "WOLFRAM_ALPHA_APPID") + values["wolfram_alpha_appid"] = wolfram_alpha_appid + + try: + import wolframalpha + + except ImportError: + raise ImportError("wolframalpha is not installed. " "Please install it with `pip install wolframalpha`") + client = wolframalpha.Client(wolfram_alpha_appid) + values["wolfram_client"] = client + + return values + + def run(self, query: str) -> str: + """Run query through WolframAlpha and parse result.""" + from urllib.error import HTTPError + + is_success = False # added + res = None + for _ in range(20): + try: + res = self.wolfram_client.query(query) + break + except HTTPError: + sleep(1) + except Exception: + return ( + "Wolfram Alpha wasn't able to answer it. Please try a new query for wolfram or use python.", + is_success, + ) + if res is None: + return ( + "Wolfram Alpha wasn't able to answer it (may due to web error), you can try again or use python.", + is_success, + ) + + try: + if not res["@success"]: + return ( + "Your Wolfram query is invalid. Please try a new query for wolfram or use python.", + is_success, + ) + assumption = next(res.pods).text + answer = "" + for r in res["pod"]: + if r["@title"] == "Solution": + answer = r["subpod"]["plaintext"] + if r["@title"] == "Results" or r["@title"] == "Solutions": + for i, sub in enumerate(r["subpod"]): + answer += f"ans {i}: " + sub["plaintext"] + "\n" + break + if answer == "": + answer = next(res.results).text + + except Exception: + return ( + "Wolfram Alpha wasn't able to answer it. 
Please try a new query for wolfram or use python.", + is_success, + ) + + if answer is None or answer == "": + # We don't want to return the assumption alone if answer is empty + return "No good Wolfram Alpha Result was found", is_success + else: + is_success = True + return f"Assumption: {assumption} \nAnswer: {answer}", is_success diff --git a/flaml/autogen/math/utils.py b/flaml/autogen/math/utils.py new file mode 100644 index 0000000000..d0ec70cd52 --- /dev/null +++ b/flaml/autogen/math/utils.py @@ -0,0 +1,148 @@ +import datasets +import re +import os +import json +import argparse + +math_type_mapping = { + "Algebra": "algebra", + "Counting & Probability": "counting_and_probability", + "Geometry": "geometry", + "Intermediate Algebra": "intermediate_algebra", + "Number Theory": "number_theory", + "Prealgebra": "prealgebra", + "Precalculus": "precalculus", +} + + +class mylogger: + def __init__(self, file) -> None: + self.file = file + + def log(self, message, verbose=True): + """Print the message. + Args: + message (str): The message to print. + """ + with open(self.file, "a") as f: + f.write(message + "\n") + if verbose: + print(message, flush=True) + + +def load_fixed(category_to_load=None): + category_to_load = [i for i in range(7)] if not category_to_load or "all" in category_to_load else category_to_load + category_to_load = [int(x) for x in category_to_load] + folder = "22_user_v3select_t1" + # folder = "37system_user_v3.1python_t1_random_sample" + sep_cat = [] + + for i, category in enumerate(math_type_mapping.keys()): + if i not in category_to_load: + continue + + c = math_type_mapping[category] + sep_cat.append([]) + for i in range(20): + try: + with open(os.path.join(folder, c, f"{i}.json"), "r") as fp: + problem = json.load(fp) + except Exception: + continue + del problem["is_valid_reply"] + del problem["is_correct"] + del problem["correct_ans"] + del problem["voted_answer"] + del problem["round"] + del problem["valid_q_count"] + del problem["total_q_count"] + del problem["cost"] + del problem["messages"] + + sep_cat[-1].append(problem) + return sep_cat + + +def load_level5_math_test_each_category(samples_per_category=20, category_to_load=None): + """ + Load level 5 math problems from the testset of competition dataset. + Returns: + A list of list of problems. Each list of problems is of the same category. + """ + category_to_load = [i for i in range(7)] if not category_to_load or "all" in category_to_load else category_to_load + category_to_load = [int(x) for x in category_to_load] + seed = 41 + data = datasets.load_dataset("competition_math") + test_data = data["test"].shuffle(seed=seed) + sep_cate = [] + for i, category in enumerate(math_type_mapping.keys()): + if i not in category_to_load: + print(i, category, "(skipped)", flush=True) + continue + tmp = [ + test_data[x] + for x in range(len(test_data)) + if test_data[x]["level"] == "Level 5" and test_data[x]["type"] == category + ] + # if len(tmp) < samples_per_category: + # print(f"Warning: {category} has {len(tmp)} problems.", flush=True) + + sep_cate.append(tmp[:samples_per_category]) + print(i, category, f"{len(sep_cate[-1])} problems loaded", flush=True) + + if len(sep_cate) == 0: + raise ValueError("No category is loaded.") + return sep_cate + + +def random_sample_MATH(num_samples=100): + """ + Load level 5 math problems from the competition dataset. + Returns: + A list of list of problems. Each list of problems is of the same category. 
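+    Note: the test split is shuffled with a fixed seed (41), so repeated runs sample the same problems.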
+ """ + seed = 41 + data = datasets.load_dataset("competition_math") + test_data = data["test"].shuffle(seed=seed) + + test_data = [test_data[x] for x in range(min(num_samples, len(test_data)))] + + sep_cate = [] + for i, category in enumerate(math_type_mapping.keys()): + sep_cate.append([test_data[x] for x in range(len(test_data)) if test_data[x]["type"] == category]) + print(i, category, f"{len(sep_cate[-1])} problems sampled ", flush=True) + + if len(sep_cate) == 0: + raise ValueError("No category is loaded.") + return sep_cate + + +def remove_asy_sections(input_string): + """Remove asy sections from the input string. + + Args: + input_string (str): The input string. + Returns: + str: The string without asy sections. + """ + pattern = r"\[asy\](.*?)\[\\asy\]" + output_string = re.sub(pattern, "", input_string, flags=re.DOTALL) + pattern = r"\[asy\](.*?)\[/asy\]" + output_string = re.sub(pattern, "", output_string, flags=re.DOTALL) + pattern = r"\[ASY\](.*?)\[\\ASY\]" + output_string = re.sub(pattern, "", output_string, flags=re.DOTALL) + pattern = r"\[ASY\](.*?)\[/ASY\]" + output_string = re.sub(pattern, "", output_string, flags=re.DOTALL) + return output_string + + +def write_json(dict_to_save, file): + """Write a dictionary to a json file. + Args: + + dict_to_save (dict): The dictionary to save. + file (str): The file to save to. + """ + jstring = json.dumps(dict_to_save, indent=2) + with open(file, "w") as j: + j.write(jstring)