Skip to content

Add total memory to job info 878 #879

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion lib/ood_core/job/adapters/slurm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -871,8 +871,33 @@ def get_state(st)
STATE_MAP.fetch(st, :undetermined)
end

# Parse the memory string returned by Slurm and return bytes
#
# The first unit letter (K, M, G, T or P) found in the string selects a
# power-of-1024 multiplier and the first number found is the quantity. Any
# other characters, such as the trailing 'c' in "1024Mc", are ignored.
# Fractional quantities such as "1.5G" are scaled exactly instead of being
# truncated to their integer part.
#
# @example
#   parse_memory("1024M") #=> 1073741824
#   parse_memory("1.5G")  #=> 1610612736
# @param mem_str [String, nil] memory string, e.g. "4G" or "1024Mc"
# @return [Integer, nil] memory in bytes, or nil when the string is nil,
#   blank, or lacks either a number or a recognized unit
def parse_memory(mem_str)
  return nil if mem_str.nil? || mem_str.strip.empty?

  unit = mem_str[/[KMGTP]/]
  value = mem_str[/\d+(?:\.\d+)?/]

  # Both pieces are required; a bare unit ("M") or a bare number ("1024")
  # cannot be converted to bytes.
  return nil unless unit && value

  factor = {
    "K" => 1024,
    "M" => 1024**2,
    "G" => 1024**3,
    "T" => 1024**4,
    "P" => 1024**5
  }.fetch(unit)

  # Rational arithmetic keeps large and fractional values exact before
  # rounding to a whole number of bytes.
  (value.to_r * factor).round
end

# Parse hash describing Slurm job status
def parse_job_info(v)
# per cpu or per node
memory_per = nil

allocated_nodes = parse_nodes(v[:node_list])
if allocated_nodes.empty?
if v[:scheduled_nodes] && v[:scheduled_nodes] != "(null)"
Expand All @@ -882,6 +907,20 @@ def parse_job_info(v)
end
end

if v[:min_memory] && !v[:min_memory].empty?
# Slurm uses per CPU memory if --mem-per-cpu with 'Mc' output
# or uses per node if --mem with 'M' output
if v[:min_memory].end_with?('c')
# memory per CPU
memory_per = :cpu
Comment on lines +911 to +915
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did you get this from? I'm unable to replicate this or see it in any job at OSC.

else
# memory per node
memory_per = :node
end

v[:memory_per] = memory_per
end

Info.new(
id: v[:job_id],
status: get_state(v[:state_compact]),
Expand All @@ -898,10 +937,33 @@ def parse_job_info(v)
submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
native: v,
gpus: self.class.gpus_from_gres(v[:gres])
gpus: self.class.gpus_from_gres(v[:gres]),
total_memory: compute_total_memory(v, allocated_nodes)
)
end

# Compute the total memory for a job from its minimum memory setting
#
# The value in v[:min_memory] is converted to bytes with parse_memory and
# then multiplied by the CPU count or by the number of allocated nodes,
# depending on v[:memory_per].
#
# @param v [Hash] job status hash; reads :min_memory, :memory_per and :cpus
# @param allocated_nodes [Array] nodes allocated to the job; only its size
#   is used
# @return [Integer, nil] total memory in bytes, or nil when it cannot be
#   determined (no numeric min_memory, no memory_per, unparsable memory,
#   missing cpu count, or an unknown memory_per value)
def compute_total_memory(v, allocated_nodes)
  return nil unless v[:min_memory].to_s.match?(/\d+/) && v[:memory_per]

  min_memory = parse_memory(v[:min_memory])
  return nil if min_memory.nil?

  # memory_per is set by parse_job_info to :cpu or :node
  case v[:memory_per].to_sym
  when :cpu
    # NOTE(review): :cpus is presumably a String when parsed from scheduler
    # output -- confirm. Integer * String raises TypeError, so coerce it,
    # and give up when the count is missing altogether.
    cpus = v[:cpus]
    cpus && min_memory * cpus.to_i
  when :node
    min_memory * allocated_nodes.count
  end
end

# Replace '(null)' with nil
def handle_null_account(account)
(account != '(null)') ? account : nil
Expand Down
8 changes: 7 additions & 1 deletion lib/ood_core/job/info.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ class Info
# @return [Integer, nil] allocated total number of gpus
attr_reader :gpus

# Total memory used by job in bytes
# @note computed from the adapter, if supported
# @return [Integer, nil] total bytes used for job
attr_reader :total_memory

# List of job array child task statuses
# @note only relevant for job arrays
# @return [Array<Task>] tasks
Expand Down Expand Up @@ -96,7 +101,7 @@ def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
procs: nil, queue_name: nil, wallclock_time: nil,
wallclock_limit: nil, cpu_time: nil, submission_time: nil,
dispatch_time: nil, native: nil, gpus: 0, tasks: [],
**_)
total_memory: nil, **_)
@id = id.to_s
@status = Status.new(state: status.to_sym)
@allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(**n.to_h) }
Expand All @@ -116,6 +121,7 @@ def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
@status = job_array_aggregate_status unless @tasks.empty?

@native = native
@total_memory = total_memory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should likely keep the other behavior where we check for nil and cast with to_i when it's not nil, like gpus below it.

@gpus = gpus && gpus.to_i
end

Expand Down
70 changes: 70 additions & 0 deletions spec/job/adapters/slurm_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1487,4 +1487,74 @@ def job_info(opts = {})
end
end
end

describe "#info" do
  # All total-memory examples share one job fixture; each nested context
  # only supplies the min_memory value under test.
  context "when computing total memory" do
    let(:job_id) { "123" }
    let(:job_hash) {
      {
        job_id: job_id,
        state_compact: "R",
        job_name: "test",
        user: "user1",
        cpus: 4,
        partition: "normal",
        time_used: "00:10:00",
        time_limit: "01:00:00",
        submit_time: "2025-04-18T10:00:00",
        start_time: "2025-04-18T10:05:00",
        node_list: "node[01-02]",
        min_memory: min_memory
      }
    }

    let(:slurm) { double(get_jobs: [job_hash]) }
    subject(:job) { described_class.new(slurm: slurm).info(job_id) }

    context "and memory is per node" do
      let(:min_memory) { "1024M" }

      # 1024M for each of the 2 nodes in node[01-02]
      it "computes total memory in bytes" do
        expect(job.total_memory).to eq(2 * 1024 * 1024 * 1024)
      end
    end

    context "and memory is per cpu" do
      let(:min_memory) { "1024Mc" }

      # 1024M for each of the 4 cpus
      it "computes total memory in bytes" do
        expect(job.total_memory).to eq(4 * 1024 * 1024 * 1024)
      end
    end

    context "and min_memory is nil" do
      let(:min_memory) { nil }

      it "returns a nil value" do
        expect(job.total_memory).to be_nil
      end
    end
  end
end
end