Skip to content

Commit

Permalink
Allow passing a raw number of jobs or dicts to benchmark scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
notoraptor committed Feb 27, 2024
1 parent 8d4f8c9 commit 0f422ef
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 116 deletions.
26 changes: 18 additions & 8 deletions scripts/gen_job_request_benchmark_script.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import sys

# max: sum(2**i for i in range(n)) jobs
# max: sum(2**i for i in range(n)) dicts
N = 17
Ks = (1, 10, 100)
# Ns = [i * 10_000 for i in range(16)]
Ns = [i * 10_000 for i in range(11)]
Ks = (1, 500)
N = Ns[-1]

NB_REQUESTS = 10

Expand All @@ -17,19 +17,19 @@ def main():
print("set -eu")

for nb_props_per_dict in Ks:
for nb_dicts in range(N + 1):
for nb_dicts in Ns:
gen_commands(N, nb_dicts, nb_props_per_dict, wd)

for nb_jobs in range(N):
for nb_jobs in Ns[:-1]:
gen_commands(nb_jobs, 0, 1, wd)

for nb_props_per_dict in Ks:
for nb_jobs in range(N):
for nb_jobs in Ns[:-1]:
gen_commands(nb_jobs, N, nb_props_per_dict, wd)


def gen_commands(nb_jobs, nb_dicts, nb_props_per_dict, working_directory):
task_name = f"jobs-{nb_jobs:02}_dicts-{nb_dicts:02}_props-{nb_props_per_dict:02}"
task_name = f"jobs-{nb_jobs:06}_dicts-{nb_dicts:06}_props-{nb_props_per_dict:03}"

cmd_fake_data = (
f"python3 scripts/store_huge_fake_data_in_db.py "
Expand All @@ -43,8 +43,18 @@ def gen_commands(nb_jobs, nb_dicts, nb_props_per_dict, working_directory):
f"--nb-requests {NB_REQUESTS} "
f"--output {task_name}"
)

print(cmd_fake_data)
print('python3 -m flask run --host="0.0.0.0" &')
print("export SERVER_PID=$!")
print("sleep 1")
print(
'''python3 -c "import urllib.request; print(urllib.request.urlopen('http://127.0.0.1:5000/').getcode())"'''
)
print(cmd_benchmark)
print("kill $SERVER_PID")
print("export SERVER_PID=")
print()


if __name__ == "__main__":
Expand Down
48 changes: 33 additions & 15 deletions scripts/plot_job_request_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

try:
import matplotlib.pyplot as plt

# plt.figure(figure=(10.8, 7.2), dpi=100)
except Exception:
print(
Expand Down Expand Up @@ -49,21 +50,23 @@ def main():
nbs_dicts.append(nb_dicts)
nbs_props.append(nb_props_per_dict)

assert max(nbs_jobs) == max(nbs_dicts)
N = max(nbs_jobs)
assert sorted(set(nbs_jobs)) == sorted(set(nbs_dicts))
Ns = sorted(set(nbs_jobs))
Ks = sorted(set(nbs_props))

_plot_request_time_per_nb_dicts(stats, N, Ks, folder)
_plots_request_time_per_nb_jobs(stats, N, Ks, folder)
_plot_request_time_per_nb_dicts(stats, Ns, Ks, folder)
_plots_request_time_per_nb_jobs(stats, Ns, Ks, folder)


def _plot_request_time_per_nb_dicts(stats: dict, Ns: list, Ks: list, folder: str):
N = max(Ns)

def _plot_request_time_per_nb_dicts(stats: dict, N: int, Ks: list, folder: str):
x_nb_dicts = [_compute_nb_jobs(n) for n in range(N + 1)]
x_nb_dicts = list(Ns)
y_time = {nb_props: [] for nb_props in Ks}

for nb_props in Ks:
print()
for nb_dicts in range(N + 1):
for nb_dicts in Ns:
key = (N, nb_dicts, nb_props)
average_duration = _debug_average_seconds(key, stats[key])
y_time[nb_props].append(average_duration)
Expand All @@ -73,9 +76,11 @@ def _plot_request_time_per_nb_dicts(stats: dict, N: int, Ks: list, folder: str):
ax.plot(
x_nb_dicts,
y_time[nb_props],
marker='o',
marker="o",
label=f"{_compute_nb_jobs(N)} jobs in DB, {nb_props} prop(s) per dict",
)
_show_points(x_nb_dicts, y_time[nb_props])

ax.set_title("Request duration per number of job-user dicts")
ax.set_xlabel("Number of job-user dicts in DB")
ax.set_ylabel("Request duration in seconds")
Expand All @@ -89,32 +94,39 @@ def _plot_request_time_per_nb_dicts(stats: dict, N: int, Ks: list, folder: str):
plt.close(fig)


def _plots_request_time_per_nb_jobs(stats: dict, N: int, Ks: list, folder: str):
x_nb_jobs = [_compute_nb_jobs(n) for n in range(N + 1)]
def _plots_request_time_per_nb_jobs(stats: dict, Ns: list, Ks: list, folder: str):
x_nb_jobs = list(Ns)
y_time_0_dicts_1_props = []
y_time_N_dicts = {nb_props: [] for nb_props in Ks}
N = max(Ns)

print()
for nb_jobs in range(N + 1):
for nb_jobs in Ns:
key = (nb_jobs, 0, 1)
average_duration = _debug_average_seconds(key, stats[key])
y_time_0_dicts_1_props.append(average_duration)
print()
for nb_props in Ks:
for nb_jobs in range(N + 1):
for nb_jobs in Ns:
key = (nb_jobs, N, nb_props)
average_duration = _debug_average_seconds(key, stats[key])
y_time_N_dicts[nb_props].append(average_duration)

fig, ax = plt.subplots()
ax.plot(x_nb_jobs, y_time_0_dicts_1_props, marker='o', label=f"0 job-user dicts in DB")
ax.plot(
x_nb_jobs, y_time_0_dicts_1_props, marker="o", label=f"0 job-user dicts in DB"
)
_show_points(x_nb_jobs, y_time_0_dicts_1_props)

for nb_props in Ks:
ax.plot(
x_nb_jobs,
y_time_N_dicts[nb_props],
marker='o',
marker="o",
label=f"{_compute_nb_jobs(N)} job-user dicts in DB, {nb_props} props per dict",
)
_show_points(x_nb_jobs, y_time_N_dicts[nb_props])

ax.set_title("Request duration per number of jobs")
ax.set_xlabel("Number of jobs in DB")
ax.set_ylabel("Request duration in seconds")
Expand All @@ -126,7 +138,13 @@ def _plots_request_time_per_nb_jobs(stats: dict, N: int, Ks: list, folder: str):


def _compute_nb_jobs(n: int):
return sum(2**i for i in range(n))
return n


def _show_points(xs, ys):
    """Annotate each (x, y) point on the current pyplot axes.

    Writes a small ``(x, y)`` text label at every point, with y rounded to
    2 decimals (y values are request durations in seconds).
    """
    # Removed a leftover commented-out `return` that was used to toggle
    # annotations off during debugging.
    for x, y in zip(xs, ys):
        plt.text(x, y, f"({x}, {round(y, 2)})")


def _debug_average_seconds(key, durations):
Expand Down
127 changes: 34 additions & 93 deletions scripts/store_huge_fake_data_in_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@
}


DEFAULT_NB_JOBS = len(USERS)
DEFAULT_NB_JOBS = 1_000_000
DEFAULT_NB_DICTS = DEFAULT_NB_JOBS
DEFAULT_NB_PROPS_PER_DICT = 4

Expand All @@ -349,45 +349,37 @@ def _generate_huge_fake_data(
job_user_dicts = []

# populate jobs
if nb_jobs:
assert 1 <= nb_jobs <= len(USERS)
nb_jobs_per_user = [2**i for i in range(nb_jobs)]
assert len(nb_jobs_per_user) == nb_jobs
job_id = 0
for user, nb_user_jobs in zip(USERS[:nb_jobs], nb_jobs_per_user):
for i in range(nb_user_jobs):
job_id += 1
job_slurm = BASE_JOB_SLURM.copy()
job_cw = BASE_JOB_CW.copy()
# edit slurm.job_id
job_slurm["job_id"] = str(job_id)
# edit slurm.name
job_slurm["name"] = f"job_name_{job_id}"
# edit slurm.username
job_slurm["username"] = user["cc_account_username"]
# edit cw.mila_email_username
job_cw["mila_email_username"] = user["mila_email_username"]
jobs.append({"slurm": job_slurm, "cw": job_cw, "user": {}})
print("Nb. jobs:", job_id)
assert job_id == sum(nb_jobs_per_user)
for i in range(nb_jobs):
user = USERS[i % len(USERS)]
job_id = i + 1
job_slurm = BASE_JOB_SLURM.copy()
job_cw = BASE_JOB_CW.copy()
# edit slurm.job_id
job_slurm["job_id"] = str(job_id)
# edit slurm.name
job_slurm["name"] = f"job_name_{job_id}"
# edit slurm.username
job_slurm["username"] = user["cc_account_username"]
# edit cw.mila_email_username
job_cw["mila_email_username"] = user["mila_email_username"]
jobs.append({"slurm": job_slurm, "cw": job_cw, "user": {}})

# populate job-user-dicts
if nb_dicts:
real_nb_dicts = sum(2**i for i in range(nb_dicts))
for i in range(real_nb_dicts):
user_job_dict = {
"user_id": "[email protected]",
"job_id": i + 1,
"cluster_name": "beluga",
"labels": {
f"prop_{j + 1}_for_job_{i + 1}": f"I am user dict prop {j + 1} for job ID {i + 1}"
for j in range(nb_props_per_dict)
},
}
job_user_dicts.append(user_job_dict)
print("Nb. dicts:", real_nb_dicts)
print("NB. props per dict:", nb_props_per_dict)

for i in range(nb_dicts):
user_job_dict = {
"user_id": "[email protected]",
"job_id": i + 1,
"cluster_name": "beluga",
"labels": {
f"prop_{j + 1}_for_job_{i + 1}": f"I am user dict prop {j + 1} for job ID {i + 1}"
for j in range(nb_props_per_dict)
},
}
job_user_dicts.append(user_job_dict)

print(
f"Jobs: {len(jobs)}, dicts: {len(job_user_dicts)}, props per dict: {nb_props_per_dict}"
)
return {"users": USERS, "jobs": jobs, "labels": job_user_dicts}


Expand Down Expand Up @@ -420,54 +412,12 @@ def populate_fake_data(db_insertion_point, **kwargs):
# Anyway clean before inserting
db_insertion_point[k].delete_many({})
if k in E and E[k]:
print("Inserting", k)
# Then insert
print(f"Inserting {k}, {len(E[k])} value(s)")
db_insertion_point[k].insert_many(E[k])
# And check count
# Check count
assert db_insertion_point[k].count_documents({}) == len(E[k])
print("Inserted", k)

def cleanup_function():
"""
Each of those kinds of data is identified in a unique way,
and we can use that identifier to clean up.
For example, when clearing out jobs, we can look at the "job_id"
of the entries that we inserted.
The point is that we can run a test against the production mongodb on Atlas
and not affect the real data. If we cleared the tables completely,
then we'd be affecting the real data in a bad way.
"""
for e in E["users"]:
db_insertion_point["users"].delete_many(
{"mila_email_username": e["mila_email_username"]}
)

for e in E["gpu"]:
db_insertion_point["gpu"].delete_many({"name": e["name"]})

for e in E["labels"]:
copy_e = e
copy_e.pop("labels")
db_insertion_point["labels"].delete_many(copy_e)

for (k, sub, id_field) in [
("jobs", "slurm", "job_id"),
("nodes", "slurm", "name"),
]:
if k in E:
for e in E[k]:
# This is complicated, but it's just about a way to say something like
# that we want to remove {"slurm.job_id", e["slurm"]["job_id"]},
# and the weird notation comes from the fact that mongodb filters use dots,
# but not the original python.
db_insertion_point[k].delete_many(
{f"{sub}.{id_field}": e[sub][id_field]}
)

return cleanup_function


def store_data_in_db(**kwargs):
# Open the database and insert the contents.
Expand All @@ -484,22 +434,13 @@ def main(argv):
"--nb-jobs",
type=int,
default=DEFAULT_NB_JOBS,
help="Number of users for which to add jobs. "
"Control the number of jobs in database by generating "
"2**i jobs for each user i from user <0> to user <nb-jobs>. "
"If 0, no jobs are added. "
f"Default is {DEFAULT_NB_JOBS}, for all users available, ie. "
f"{sum(2**i for i in range(DEFAULT_NB_JOBS))} total jobs.",
help="Number of jobs to add. May be 0 (no job added).",
)
parser.add_argument(
"--nb-dicts",
type=int,
default=DEFAULT_NB_DICTS,
help="Control the number of job-user dicts in database by generating "
"sum(2**i for i in range(nb-dicts)) dictionaries. "
"If 0, no dicts are added. "
f"Default is {DEFAULT_NB_DICTS} to match the maximum number of potential jobs, ie. "
f"{sum(2**i for i in range(DEFAULT_NB_DICTS))} total dicts.",
help="Number of job-user dicts to add. May be 0 (no job added).",
)
parser.add_argument(
"--nb-props-per-dict",
Expand Down

0 comments on commit 0f422ef

Please sign in to comment.