From a2f11d9eb07b2a236b262cbbc610c5ed1016adc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?=
Date: Fri, 12 Jan 2024 14:47:00 +0000
Subject: [PATCH] build: set a timeout for the tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have some non-deterministic timeouts on github actions.
Introduce some shorter timeouts that are under our control, and print more
information about what is stuck.

GH actions take <2m to run the tests, so add 5m as a timeout for the children,
and twice that for the parent.

Signed-off-by: Edwin Török
---
 Makefile | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4d155619a57..8f2a2f72288 100644
--- a/Makefile
+++ b/Makefile
@@ -28,8 +28,42 @@ lint:
 	pylint --disable=line-too-long,too-few-public-methods,unused-argument,no-self-use,invalid-name,broad-except,protected-access,redefined-builtin,too-many-lines,wildcard-import,too-many-branches,too-many-arguments,unused-wildcard-import,raising-format-tuple,too-many-statements,duplicate-code _build/default/xapi-storage/python/xapi/storage/api/v5/*.py
 	pycodestyle --ignore=E501 _build/default/xapi-storage/python/xapi/storage/api/v5/*.py
 
+
+# ulimit -S -t N
+# Set a soft CPU time quota which will kill processes with SIGXCPU
+# This will in preference kill children of dune that consume CPU time (e.g. a stuck test),
+# and not dune itself (which should consume little CPU time when it just waits for subprocesses)
+# However it won't kill idle processes (e.g. a test sleeping and waiting for an event that never arrives)
+
+# sleep N && ps
+# Prints a process tree once the timeout is reached to identify any sleeping processes that are stuck
+# (if we send 'dune' a SIGINT or SIGTERM it'd kill all subprocesses but won't say which ones were running and since when!)
+# -e: prints all processes
+# -ww: disables line length restrictions
+# -ly: prints additional columns, e.g. WCHAN which shows currently active syscall
+# -F: prints process start time and CPU time, useful in identifying which test got started recently,
+# and which one was running for a while
+# --forest prints a process tree
+# trap: stops the pending ps once the recipe's shell exits or is interrupted; signals are named INT/TERM because POSIX sh only specifies names without the SIG prefix
+
+# timeout --foreground N dune
+# Sends a SIGTERM to dune after N seconds. This should print any pending buffered output, but won't tell us which processes were still running
+# The timeout used here should be > timeout in ulimit && ps to allow time for subprocesses to terminate first
+
+# ulimit -n N
+# By default the ulimit on some systems is very large (e.g. Fedora39 distrobox 1048576)
+# which causes some tests to take very long to run (e.g. forkexec tests which loop through and close all fds up to limit)
+
+TEST_TIMEOUT=600
+TEST_TIMEOUT2=1200
 test:
-	dune runtest --profile=$(PROFILE) --error-reporting=twice -j $(JOBS)
+	ulimit -S -t $(TEST_TIMEOUT); \
+	 ulimit -n 1024; \
+	 (sleep $(TEST_TIMEOUT) && ps -ewwlyF --forest)& \
+	 PSTREE_SLEEP_PID=$$!; \
+	 trap "kill $${PSTREE_SLEEP_PID}" INT TERM EXIT; \
+	 timeout --foreground $(TEST_TIMEOUT2) \
+		 dune runtest --profile=$(PROFILE) --error-reporting=twice -j $(JOBS)
 ifneq ($(PY_TEST), NO)
 	dune build @runtest-python --profile=$(PROFILE)
 endif