Skip to content

Commit

Permalink
Merge branch 'master' into patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
DilumAluthge authored Jan 2, 2025
2 parents f3c5214 + 6f36748 commit ee7237d
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 67 deletions.
7 changes: 7 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
2 changes: 1 addition & 1 deletion .github/workflows/UnitTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v2.2.0
uses: actions/checkout@v4.2.2

- uses: julia-actions/setup-julia@latest
with:
Expand Down
18 changes: 0 additions & 18 deletions .travis.yml

This file was deleted.

4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "ClusterManagers"
uuid = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"
version = "0.4.4"
version = "0.4.7"

[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Expand All @@ -9,7 +9,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"

[compat]
julia = "1"
julia = "1.2"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ ElasticManager(addr, port) = ElasticManager(;addr=addr, port=port)
ElasticManager(addr, port, cookie) = ElasticManager(;addr=addr, port=port, cookie=cookie)
```

On Linux and Mac, you can set `addr=:auto` to automatically use the host's private IP address on the local network, which will allow other workers on this network to connect. You can also use `port=0` to let the OS choose a random free port for you (some systems may not support this). Once created, printing the `ElasticManager` object prints the command which you can run on workers to connect them to the master, e.g.:
You can set `addr=:auto` to automatically use the host's private IP address on the local network, which will allow other workers on this network to connect. You can also use `port=0` to let the OS choose a random free port for you (some systems may not support this). Once created, printing the `ElasticManager` object prints the command which you can run on workers to connect them to the master, e.g.:

```julia
julia> em = ElasticManager(addr=:auto, port=0)
Expand Down
26 changes: 6 additions & 20 deletions src/elastic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,13 @@ struct ElasticManager <: ClusterManager
Distributed.init_multi()
cookie !== nothing && cluster_cookie(cookie)

# Automatically check for the IP address of the local machine
if addr == :auto
addr = get_private_ip()
try
addr = Sockets.getipaddr(IPv4)
catch
error("Failed to automatically get host's IP address. Please specify `addr=` explicitly.")
end
end

l_sock = listen(addr, port)
Expand Down Expand Up @@ -134,25 +139,6 @@ function elastic_worker(cookie, addr="127.0.0.1", port=9009; stdout_to_master=tr
start_worker(c, cookie)
end


"""
    get_private_ip()

Return the host's IP address as an `IPv4`, obtained by running a
platform-specific shell command:

- Linux: `hostname --ip-address`
- macOS: `ipconfig getifaddr en0`

The command output is stripped and split on whitespace; only the first token
is parsed, so a command that prints several addresses yields the first one.

# Throws
- `ErrorException` if the platform is neither Linux nor macOS.
- `ErrorException` if the command fails or its output cannot be parsed as an
  IPv4 address; the message includes the command and the underlying error.
"""
function get_private_ip()
    # Guard clause: reject unsupported platforms before building any command.
    # This check sits outside the `try`, so its message is never re-wrapped.
    Sys.islinux() || Sys.isapple() ||
        error("`addr=:auto` is only supported on Linux and Mac")

    cmd = Sys.islinux() ? `hostname --ip-address` : `ipconfig getifaddr en0`

    try
        output = read(cmd, String)
        tokens = split(strip(output))
        return IPv4(first(tokens))
    catch err
        # Covers a failing command, empty output (`first` on an empty
        # collection) and unparsable text alike.
        error("""Failed to automatically get host's IP address (output below). Please specify `addr=` explicitly.
        \$ $(repr(cmd))
        $err
        """)
    end
end

function get_connect_cmd(em::ElasticManager; absolute_exename=true, same_project=true)

ip = string(em.sockname[1])
Expand Down
89 changes: 65 additions & 24 deletions src/slurm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,56 +51,97 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array,
mkdir(job_file_loc)
end

np = manager.np
# Check for given output file name
jobname = "julia-$(getpid())"
job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))"
make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out")
job_output_template = make_job_output_path("%4t")
srun_cmd = `srun -J $jobname -n $np -o "$(job_output_template)" -D $exehome $(srunargs) $exename $exeflags $(worker_arg())`
has_output_name = ("-o" in srunargs) | ("--output" in srunargs)
if has_output_name
loc = findfirst(x-> x == "-o" || x == "--output", srunargs)
job_output_name = srunargs[loc+1]
job_output_template = joinpath(job_file_loc, job_output_name)
srunargs[loc+1] = job_output_template
else
job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))"
make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out")
job_output_template = make_job_output_path("%4t")
push!(srunargs, "-o", job_output_template)
end

np = manager.np
srun_cmd = `srun -J $jobname -n $np -D $exehome $(srunargs) $exename $exeflags $(worker_arg())`

@info "Starting SLURM job $jobname: $srun_cmd"
srun_proc = open(srun_cmd)

slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})"
could_not_connect_regex = r"could not connect"
exiting_regex = r"exiting."
retry_delays = manager.retry_delays

t_start = time()
t_waited = round(Int, time() - t_start)
for i = 0:np - 1
println("connecting to worker $(i + 1) out of $np")
slurm_spec_match = nothing
fn = make_job_output_path(lpad(i, 4, "0"))
t0 = time()
for retry_delay in retry_delays
slurm_spec_match::Union{RegexMatch,Nothing} = nothing
worker_errors = String[]
if has_output_name
fn = job_output_template
else
fn = make_job_output_path(lpad(i, 4, "0"))
end
for retry_delay in push!(collect(retry_delays), 0)
t_waited = round(Int, time() - t_start)

# Wait for output log to be created and populated, then parse
if isfile(fn) && filesize(fn) > 0
slurm_spec_match = open(fn) do f
# Due to error and warning messages, the specification
# may not appear on the file's first line
for line in eachline(f)
re_match = match(slurm_spec_regex, line)
if re_match !== nothing
return re_match # only returns from do-block

if isfile(fn)
if filesize(fn) > 0
open(fn) do f
# Due to error and warning messages, the specification
# may not appear on the file's first line
for line in eachline(f)
re_match = match(slurm_spec_regex, line)
if !isnothing(re_match)
slurm_spec_match = re_match
end
for expr in [could_not_connect_regex, exiting_regex]
if !isnothing(match(expr, line))
slurm_spec_match = nothing
push!(worker_errors, line)
end
end
end
end
end
if slurm_spec_match !== nothing
break # break if specification found
if !isempty(worker_errors) || !isnothing(slurm_spec_match)
break # break if error or specification found
else
@info "Worker $i (after $t_waited s): Output file found, but no connection details yet"
end
else
@info "Worker $i (after $t_waited s): No output file \"$fn\" yet"
end
# Sleep for some time to limit ressource usage while waiting for the job to start

# Sleep for some time to limit resource usage while waiting for the job to start
sleep(retry_delay)
end

if slurm_spec_match === nothing
throw(SlurmException("Timeout while trying to connect to worker"))
if !isempty(worker_errors)
throw(SlurmException("Worker $i failed after $t_waited s: $(join(worker_errors, " "))"))
elseif isnothing(slurm_spec_match)
throw(SlurmException("Timeout after $t_waited s while waiting for worker $i to get ready."))
end

config = WorkerConfig()
config.port = parse(Int, slurm_spec_match[2])
config.host = strip(slurm_spec_match[3])
@info "Worker $i ready after $t_waited s on host $(config.host), port $(config.port)"
# Keep a reference to the proc, so it's properly closed once
# the last worker exits.
config.userdata = srun_proc
push!(instances_arr, config)
notify(c)
end
catch e
println("Error launching Slurm job:")
@error "Error launching Slurm job"
rethrow(e)
end
end
Expand Down
7 changes: 6 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,19 @@ end

if "slurm" in ARGS
@testset "Slurm" begin
p = addprocs_slurm(1)
out_file = "my_slurm_job.out"
p = addprocs_slurm(1; o=out_file)
@test nprocs() == 2
@test workers() == p
@test fetch(@spawnat :any myid()) == p[1]
@test remotecall_fetch(+,p[1],1,1) == 2
rmprocs(p)
@test nprocs() == 1
@test workers() == [1]

# Check output file creation
@test isfile(out_file)
rm(out_file)
end
end

Expand Down

0 comments on commit ee7237d

Please sign in to comment.