Skip to content

Commit

Permalink
resolve many more instances
Browse files Browse the repository at this point in the history
  • Loading branch information
azliu0 committed Jan 10, 2025
1 parent 91f82df commit 75c7faf
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 59 deletions.
134 changes: 75 additions & 59 deletions swebench/harness/run_evaluation_modal.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# This file contains logic for running evaluations on Modal: <https://modal.com/>.

from __future__ import annotations

import json
Expand All @@ -20,6 +22,10 @@
from swebench.harness.constants import KEY_INSTANCE_ID
from swebench.harness.utils import EvaluationError

SANDBOX_ENTRYPOINT = "run_evaluation_modal_entrypoint"
LOCAL_SANDBOX_ENTRYPOINT_PATH = (Path(__file__).parent / f"{SANDBOX_ENTRYPOINT}.py").resolve()
REMOTE_SANDBOX_ENTRYPOINT_PATH = f"/root/{SANDBOX_ENTRYPOINT}.py"

import asyncio
import tenacity

Expand Down Expand Up @@ -60,15 +66,28 @@ def __init__(self, test_spec: TestSpec, timeout: int | None = None, verbose: boo
# Hack for pylint
self.write_file("/sys/fs/cgroup/cpu/cpu.shares", "2048")

@tenacity.retry(stop=tenacity.stop_after_attempt(5))
@tenacity.retry(
    stop=tenacity.stop_after_attempt(7),
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
)
def _get_sandbox(self, timeout: int | None = None):
    """Create the Modal sandbox for this instance's image, with the eval
    entrypoint script mounted inside it.

    Retried with exponential backoff because transient network flakiness can
    make the image build fail.

    Args:
        timeout: Sandbox lifetime in seconds; defaults to 30 minutes.
    """
    # Sometimes network flakiness causes the image build to fail,
    # so we retry a few times.
    if timeout is None:
        # Default 30 minutes
        timeout = 60 * 30

    return modal.Sandbox.create(
        image=self.image,
        timeout=timeout,
        cpu=4,
        mounts=[
            # NOTE: this runs inside the Modal function container, where the
            # entrypoint script has already been mounted at
            # REMOTE_SANDBOX_ENTRYPOINT_PATH (see the @app.function mounts).
            # So the "local" source path here is the remote path — using
            # REMOTE for both arguments is intentional, not a copy-paste slip.
            modal.Mount.from_local_file(
                REMOTE_SANDBOX_ENTRYPOINT_PATH,
                REMOTE_SANDBOX_ENTRYPOINT_PATH,
            )
        ],
    )

async def _read_stream(self, stream: modal.io_streams.StreamReader, output_list: list[str]):
try:
Expand All @@ -91,22 +110,17 @@ async def _read_output(self, p: modal.container_process.ContainerProcess, stdout
except asyncio.CancelledError:
pass

def write_file(self, file_path: str, content: str) -> modal.container_process.ContainerProcess:
bash_command = f"""cat <<'EOF' > {file_path}
{content}
EOF"""
p = self.sandbox.exec("bash", "-c", bash_command)
p.wait()
return p
def write_file(self, file_path: str, content: str):
    """Write `content` to `file_path` inside the sandbox via Modal's file API.

    Replaces the previous heredoc-over-bash approach, which could mangle
    content containing an EOF marker.
    """
    # NOTE(review): the handle returned by sandbox.open() is never closed —
    # confirm whether Modal's file API flushes on write or needs an explicit
    # close()/`with` block.
    self.sandbox.open(file_path, "w").write(content)

def exec(self, *args, **kwargs) -> tuple[str, int]:
def exec(self, command: str) -> tuple[str, int]:
"""
Execute a command in the sandbox.
Returns:
tuple[str, int]: Sandbox output and return code.
"""
p = self.sandbox.exec(*args, **kwargs)
p = self.sandbox.exec("python", "-m", SANDBOX_ENTRYPOINT, command)
stdout = []
stderr = []
try:
Expand Down Expand Up @@ -142,6 +156,11 @@ def __exit__(self, exc_type, exc_val, exc_tb):
@staticmethod
def get_instance_image(test_spec: TestSpec) -> modal.Image:
env_script = test_spec.setup_env_script
# add trusted host flag for Modal's PyPI mirror
env_script = env_script.replace(
"conda activate testbed && python -m pip install -r $HOME/requirements.txt",
"conda activate testbed && python -m pip install --trusted-host pypi-mirror.modal.local -r $HOME/requirements.txt"
)
repo_script = test_spec.install_repo_script

remote_env_script_path = "/root/setup_env.sh"
Expand Down Expand Up @@ -292,6 +311,12 @@ def get_log_dir(pred: dict, run_id: str, instance_id: str) -> Path:

@app.function(
image=swebench_image,
mounts=[
modal.Mount.from_local_file(
LOCAL_SANDBOX_ENTRYPOINT_PATH,
REMOTE_SANDBOX_ENTRYPOINT_PATH,
)
],
timeout=120*60, # Much larger than default timeout to account for image build time
)
def run_instance_modal(
Expand Down Expand Up @@ -334,17 +359,13 @@ def run_instance_modal(
runner.write_file(patch_file, patch_diff)

apply_patch_output, returncode = runner.exec(
"bash",
"-c",
"cd /testbed && git apply -v /tmp/patch.diff",
)

if returncode != 0:
logger.info(f"Failed to apply patch to container, trying again...")

apply_patch_output, returncode = runner.exec(
"bash",
"-c",
"cd /testbed && patch --batch --fuzz=5 -p1 -i /tmp/patch.diff",
)

Expand All @@ -363,29 +384,27 @@ def run_instance_modal(

# Get git diff before running eval script
git_diff_output_before, returncode = runner.exec(
"bash",
"-c",
"cd /testbed && git diff",
)
logger.info(f"Git diff before:\n{git_diff_output_before}")

eval_file = "/root/eval.sh"
eval_script = test_spec.eval_script
# Hack for django
# django hack
eval_script = eval_script.replace("locale-gen", "locale-gen en_US.UTF-8")
runner.write_file(eval_file, eval_script)

start_time = time.time()

run_command = "cd /testbed"
# pylint hack
if "pylint" in test_spec.instance_id:
run_command += " && PYTHONPATH="
# increase recursion limit for testing
run_command += " && python3 -c 'import sys; sys.setrecursionlimit(10000)'"
# run eval script
run_command += " && /bin/bash /root/eval.sh"
test_output, returncode = runner.exec(
"bash",
"-c",
run_command,
)
test_output, returncode = runner.exec(run_command)

total_runtime = time.time() - start_time

Expand All @@ -397,11 +416,7 @@ def run_instance_modal(
print(f"Test output for {instance_id} written to {test_output_path}")

# Get git diff after running eval script
git_diff_output_after, returncode = runner.exec(
"bash",
"-c",
"cd /testbed && git diff",
)
git_diff_output_after, returncode = runner.exec("cd /testbed && git diff")

# Check if git diff changed after running eval script
logger.info(f"Git diff after:\n{git_diff_output_after}")
Expand Down Expand Up @@ -492,37 +507,38 @@ def run_instances_modal(
continue
run_test_specs.append(test_spec)

# Run instances that haven't been run yet
results = run_instance_modal.starmap(
[
(
test_spec,
predictions[test_spec.instance_id],
run_id,
timeout,
)
for test_spec in run_test_specs
],
)
if run_test_specs:
# Run instances that haven't been run yet
results = run_instance_modal.starmap(
[
(
test_spec,
predictions[test_spec.instance_id],
run_id,
timeout,
)
for test_spec in run_test_specs
],
)

for result in results:
result = cast(TestOutput, result)

# Save logs locally
log_dir = result.log_dir
log_dir.mkdir(parents=True, exist_ok=True)
with open(log_dir / "run_instance.log", "w") as f:
f.write(result.run_instance_log)
with open(log_dir / "test_output.txt", "w") as f:
f.write(result.test_output)
with open(log_dir / "patch.diff", "w") as f:
f.write(result.patch_diff)
with open(log_dir / "report.json", "w") as f:
try:
report_json = json.loads(result.report_json_str)
json.dump(report_json, f, indent=4)
except Exception:
# This happens if the test fails with any exception
print(f"{result.instance_id}: no report.json")
for result in results:
result = cast(TestOutput, result)

# Save logs locally
log_dir = result.log_dir
log_dir.mkdir(parents=True, exist_ok=True)
with open(log_dir / "run_instance.log", "w") as f:
f.write(result.run_instance_log)
with open(log_dir / "test_output.txt", "w") as f:
f.write(result.test_output)
with open(log_dir / "patch.diff", "w") as f:
f.write(result.patch_diff)
with open(log_dir / "report.json", "w") as f:
try:
report_json = json.loads(result.report_json_str)
json.dump(report_json, f, indent=4)
except Exception:
# This happens if the test fails with any exception
print(f"{result.instance_id}: no report.json")

make_run_report(predictions, full_dataset, run_id)
113 changes: 113 additions & 0 deletions swebench/harness/run_evaluation_modal_entrypoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Sandbox entrypoint script for running evals on Modal.
#
# In a perfect world, we would execute commands using the Sandbox directly, but Modal imposes
# a container stdio rate limit of 64 KiB/s. Some test harnesses exceed this limit which leads
# to "dropped container output" logs that interfere with parsing the test output. Instead,
# we mount and run this script in the Sandbox to control the rate at which stdio is streamed to
# the container.
import asyncio
import sys
import argparse

# 64 KiB // 2 to be safe
STDIO_RATE_LIMIT_BYTES_PER_SEC = 64 * 1024 // 2

async def exec(command: str) -> int:
    """Run `command` in a shell, relaying its stdout/stderr to this process's
    stdout/stderr at a capped rate.

    Each stream is limited to STDIO_RATE_LIMIT_BYTES_PER_SEC to stay under
    Modal's 64 KiB/s container stdio limit (see module header), which would
    otherwise drop output and corrupt test-log parsing.

    Returns:
        int: the subprocess's exit code.
    """
    # NOTE: intentionally shadows the `exec` builtin; acceptable in this
    # standalone entrypoint script.
    p = await asyncio.create_subprocess_shell(
        command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        # Raise the asyncio stream buffer limit (default 64 KiB) so very long
        # lines are less likely to trigger LimitOverrunError below.
        limit=1024 * 1024,
    )

    stdout_lines = []
    stderr_lines = []

    async def read_stream(stream, lines, fd):
        # Token-bucket rate limiter: `tokens` is the byte budget currently
        # available; it refills continuously at STDIO_RATE_LIMIT_BYTES_PER_SEC.
        tokens = STDIO_RATE_LIMIT_BYTES_PER_SEC
        last_refill = asyncio.get_event_loop().time()

        while True:
            try:
                line = await stream.readline()
                if not line:
                    break  # EOF
            except (asyncio.LimitOverrunError, ValueError):
                # buffer exceeded asyncio stream limit — fall back to reading
                # a fixed-size chunk instead of a full line.
                fallback_chunk_size = 8192
                line = await stream.read(fallback_chunk_size)
                if not line:
                    break

            remaining_data = line
            buffer = bytearray()

            while remaining_data:
                # Refill the bucket in proportion to elapsed time, capped at
                # one second's worth of budget.
                current_time = asyncio.get_event_loop().time()
                time_passed = current_time - last_refill

                tokens = min(
                    STDIO_RATE_LIMIT_BYTES_PER_SEC,
                    tokens + (time_passed * STDIO_RATE_LIMIT_BYTES_PER_SEC)
                )
                last_refill = current_time

                chunk_size = min(len(remaining_data), STDIO_RATE_LIMIT_BYTES_PER_SEC, int(tokens))

                if chunk_size == 0:
                    # Budget exhausted: sleep briefly (at least 10ms) until
                    # enough tokens accrue, then retry.
                    sleep_time = max(
                        0.01,
                        (0.01 * STDIO_RATE_LIMIT_BYTES_PER_SEC - tokens) / STDIO_RATE_LIMIT_BYTES_PER_SEC
                    )
                    await asyncio.sleep(sleep_time)
                    continue

                buffer.extend(remaining_data[:chunk_size])

                # Find last valid UTF-8 character boundary.
                # This is to avoid partial characters being written to
                # container stdout/stderr, which results in a very small
                # chance of errors of the form: "Error reading stream: 'utf-8' codec can't decode bytes in position ..."
                valid_bytes = len(buffer.decode('utf-8', errors='ignore').encode('utf-8'))

                if valid_bytes > 0:
                    chunk = buffer[:valid_bytes]
                    if fd == "stdout":
                        sys.stdout.buffer.write(chunk)
                        sys.stdout.buffer.flush()
                    else:
                        sys.stderr.buffer.write(chunk)
                        sys.stderr.buffer.flush()

                    # Keep any trailing partial UTF-8 sequence for the next
                    # chunk; only count the bytes actually written.
                    buffer = buffer[valid_bytes:]
                    tokens -= valid_bytes

                remaining_data = remaining_data[chunk_size:]

            # Flush any leftover bytes (e.g. a partial UTF-8 sequence at the
            # end of the line) so nothing is silently dropped.
            if buffer:
                if fd == "stdout":
                    sys.stdout.buffer.write(buffer)
                    sys.stdout.buffer.flush()
                else:
                    sys.stderr.buffer.write(buffer)
                    sys.stderr.buffer.flush()

            lines.append(line)

    await asyncio.gather(
        read_stream(p.stdout, stdout_lines, "stdout"),
        read_stream(p.stderr, stderr_lines, "stderr")
    )

    return await p.wait()

async def main(command: str) -> None:
    """Run `command` via the rate-limited `exec` and terminate this process
    with the command's exit code.

    Raises:
        SystemExit: always, carrying the subprocess's return code.
    """
    returncode = await exec(command)
    # Use sys.exit rather than the `exit` builtin: `exit` is injected by the
    # `site` module and may be missing (e.g. under `python -S` or embedded
    # interpreters), whereas sys.exit is always available.
    sys.exit(returncode)

if __name__ == "__main__":
    # CLI entrypoint: parse the single positional shell command and run it
    # through the rate-limited executor.
    arg_parser = argparse.ArgumentParser(
        description='Execute a shell command and stream output'
    )
    arg_parser.add_argument(
        'command', type=str, help='The shell command to execute'
    )
    parsed_args = arg_parser.parse_args()
    asyncio.run(main(parsed_args.command))

0 comments on commit 75c7faf

Please sign in to comment.