Skip to content

Commit

Permalink
controller: Rewrite timeout mechanism
Browse files Browse the repository at this point in the history
Kill the qemu process as it stalls the ARCHIE execution. After the qemu
process is killed, the Python worker can be terminated. The last step in
the timeout mechanism is a write to hdf5collector, so that the triggered
timeout is recorded within the hdf5.
  • Loading branch information
aewag committed Nov 2, 2023
1 parent 92effdd commit 53732ed
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 6 deletions.
31 changes: 25 additions & 6 deletions controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import logging
from multiprocessing import Manager, Process, Value
from pathlib import Path
import psutil
import signal
from statistics import mean
import subprocess
Expand Down Expand Up @@ -791,22 +792,40 @@ def controller(
if len(times) > 0:
time_max = max(times)

for i in range(len(p_list)):
p = p_list[i]
for i, p in enumerate(p_list):
# Find finished processes
p["process"].join(timeout=0)

# Kill process if timeout exceeded and gdb is not used
# Halt experiment if timeout duration exceeded
# If gdb is used the timeout is not applicable
if (
p["process"].is_alive()
and (time.time() - p["start_time"]) > config_qemu["timeout"]
and not config_qemu.get("gdb", False)
):
clogger.error(
f"Process {p['process'].name} ran into timeout and was killed!"
)
clogger.warning(f"Experiment {p['experiment_index']} ran into timeout")
# Search for qemu process and kill if found
qemu_process_name = f"qemu{p['experiment_index']}"
for process in psutil.process_iter():
if process.name() != qemu_process_name:
continue
clogger.debug(f"{process.name()} killed")
process.kill()
break
else:
clogger.debug(f"{qemu_process_name} not found to kill")
# Terminate worker process
p["process"].terminate()
p["process"].join()
# Tell hdf5collector about timeout
queue_output.put(
{
"index": p["experiment_index"],
"faultlist": faultlist[p["experiment_index"]]["faultlist"],
"endpoint": -1,
"end_reason": "timeout",
}
)

if p["process"].is_alive() is False:
# Recalculate moving average
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tables==3.7.0
json5==0.9.10
protobuf==4.21.12
tqdm==4.65.0
psutil==5.9.6

0 comments on commit 53732ed

Please sign in to comment.