forked from jack-morrison/OLCF-Support
-
Notifications
You must be signed in to change notification settings - Fork 0
/
commonNodes-slurm
executable file
·67 lines (50 loc) · 2.22 KB
/
commonNodes-slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
# Find overlapping nodes from an arbitrary number of Slurm jobs
# This is likely full of inefficient and/or non-Pythonic ways of doing things.
# I don't claim to be a python expert though, and this isn't performance-critical.
import sys
import subprocess
import functools
def expand(ranges, prefix='rhea'):
    """Expand compressed numeric node ranges into full node names.

    Args:
        ranges: Iterable of strings, each either a single node number
            ("7") or an inclusive "low-high" range ("1-3").
        prefix: Text prepended to every node number. Defaults to "rhea".

    Returns:
        A list of node names in input order; ``["1-3", "7"]`` becomes
        ``["rhea1", "rhea2", "rhea3", "rhea7"]``. Blank entries are skipped
        and a descending range contributes no names.

    Raises:
        ValueError: If a bound is not an integer.
    """
    names = []
    for item in ranges:
        item = item.strip()
        if not item:
            # Splitting an empty node list on "," yields [""]; nothing to expand.
            continue
        low, dash, high = item.partition('-')
        low = low.strip()
        first = int(low)
        # Only a real "low-high" entry has an upper bound; a lone number is
        # a one-element range.
        last = int(high) if dash else first
        # A zero-padded lower bound ("001-003") means the host names are
        # padded to that width, so keep the padding instead of dropping it.
        width = len(low) if low.startswith('0') else 0
        names.extend(
            prefix + str(number).zfill(width)
            for number in range(first, last + 1)
        )
    return names
def find_max(jobs_dict):
    """Return the length of the longest value in *jobs_dict*.

    Each value is measured with ``len``. An empty dict yields -1.
    """
    sizes = (len(nodes) for nodes in jobs_dict.values())
    return max(sizes, default=-1)
def compareJobs(jobIDs):
if all(job.isdigit() for job in jobIDs):
print("\njobIDs: " + str(jobIDs))
jobs_and_nodes = {}
for job in jobIDs:
runline = ["/usr/bin/sacct", "-n", "-o", "NodeList%1000", "-j", job]
# print("Running ", runline)
csm_query_output = subprocess.run(runline, stdout=subprocess.PIPE)
csm_query_output = str(csm_query_output.stdout.decode()).splitlines()
main=csm_query_output[0]
main = main.strip().replace('rhea[','').replace(']','')
node_list = expand(main.split(','))
# print("Node list:", node_list)
# print(len(node_list), "nodes used")
jobs_and_nodes[job] = node_list
common_compute_nodes = functools.reduce(set.intersection, (set(val) for val in jobs_and_nodes.values()))
maxnodes = find_max(jobs_and_nodes)
print("Nodes used max: ", maxnodes)
if (common_compute_nodes):
print("\nCompute nodes common between jobs:", jobIDs)
print(", ".join(common_compute_nodes) + "\n")
print(len(common_compute_nodes), "nodes in common")
else:
print("\nThese jobs did not share any common compute nodes.\n")
else:
print("Error: Invalid input. 1 or more primary job IDs required\n usage: common_nodes primaryjobID1 [primaryjobID2 primaryjobID3 ...]")
if __name__ == "__main__":
    # Every command-line argument after the script name is passed on as a job ID.
    job_ids = sys.argv[1:]
    compareJobs(job_ids)