From aeebe26de672b55ba36d35087953074173d06d61 Mon Sep 17 00:00:00 2001 From: "Jager,Petar" Date: Tue, 24 May 2022 10:49:14 +0200 Subject: [PATCH 1/4] Add more e2e test cases --- Makefile | 8 +- templates/adaptive_card_template.json | 1 + test_e2e/README | 4 - test_e2e/README.md | 41 +++ .../cases/test_00/{README.txt => README.md} | 6 +- test_e2e/cases/test_00/test.yaml | 6 - .../cases/test_01/{README.txt => README.md} | 6 +- test_e2e/cases/test_01/test.yaml | 8 +- test_e2e/cases/test_02/README.md | 6 + .../test_02/conf/adaptive_card_template.json | 237 +++++++++++++++++ test_e2e/cases/test_02/conf/gobler.conf | 19 ++ test_e2e/cases/test_02/conf/goslmailer.conf | 14 + test_e2e/cases/test_02/sacct/sacct | 5 + test_e2e/cases/test_02/sacct/sacct.txt | 1 + test_e2e/cases/test_02/slurm_env/slurmenv.sh | 28 ++ test_e2e/cases/test_02/test.yaml | 71 +++++ test_e2e/cases/test_03/README.md | 7 + .../test_03/conf/adaptive_card_template.json | 237 +++++++++++++++++ test_e2e/cases/test_03/conf/gobler.conf | 19 ++ test_e2e/cases/test_03/conf/goslmailer.conf | 22 ++ ...r@imba.oeaw.ac.at-1653378962712164702.json | 168 ++++++++++++ test_e2e/cases/test_03/sacct/sacct | 4 + test_e2e/cases/test_03/sacct/sacct.txt | 3 + test_e2e/cases/test_03/sacct/sstat | 4 + test_e2e/cases/test_03/sacct/sstat.txt | 2 + test_e2e/cases/test_03/slurm_env/slurmenv.sh | 30 +++ test_e2e/cases/test_03/test.yaml | 65 +++++ test_e2e/cases/test_04/README.md | 7 + .../test_04/conf/adaptive_card_template.json | 237 +++++++++++++++++ test_e2e/cases/test_04/conf/gobler.conf | 19 ++ test_e2e/cases/test_04/conf/goslmailer.conf | 22 ++ ...r@imba.oeaw.ac.at-1653372112324147944.json | 247 ++++++++++++++++++ test_e2e/cases/test_04/sacct/sacct | 4 + test_e2e/cases/test_04/sacct/sacct.txt | 3 + test_e2e/cases/test_04/slurm_env/slurmenv.sh | 30 +++ test_e2e/cases/test_04/test.yaml | 63 +++++ test_e2e/run.yaml | 8 +- 37 files changed, 1636 insertions(+), 26 deletions(-) delete mode 100644 test_e2e/README create mode 100644 
test_e2e/README.md rename test_e2e/cases/test_00/{README.txt => README.md} (69%) rename test_e2e/cases/test_01/{README.txt => README.md} (76%) create mode 100644 test_e2e/cases/test_02/README.md create mode 100644 test_e2e/cases/test_02/conf/adaptive_card_template.json create mode 100755 test_e2e/cases/test_02/conf/gobler.conf create mode 100755 test_e2e/cases/test_02/conf/goslmailer.conf create mode 100755 test_e2e/cases/test_02/sacct/sacct create mode 100644 test_e2e/cases/test_02/sacct/sacct.txt create mode 100755 test_e2e/cases/test_02/slurm_env/slurmenv.sh create mode 100644 test_e2e/cases/test_02/test.yaml create mode 100644 test_e2e/cases/test_03/README.md create mode 100644 test_e2e/cases/test_03/conf/adaptive_card_template.json create mode 100755 test_e2e/cases/test_03/conf/gobler.conf create mode 100755 test_e2e/cases/test_03/conf/goslmailer.conf create mode 100644 test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json create mode 100755 test_e2e/cases/test_03/sacct/sacct create mode 100644 test_e2e/cases/test_03/sacct/sacct.txt create mode 100755 test_e2e/cases/test_03/sacct/sstat create mode 100644 test_e2e/cases/test_03/sacct/sstat.txt create mode 100755 test_e2e/cases/test_03/slurm_env/slurmenv.sh create mode 100644 test_e2e/cases/test_03/test.yaml create mode 100644 test_e2e/cases/test_04/README.md create mode 100644 test_e2e/cases/test_04/conf/adaptive_card_template.json create mode 100755 test_e2e/cases/test_04/conf/gobler.conf create mode 100755 test_e2e/cases/test_04/conf/goslmailer.conf create mode 100644 test_e2e/cases/test_04/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653372112324147944.json create mode 100755 test_e2e/cases/test_04/sacct/sacct create mode 100644 test_e2e/cases/test_04/sacct/sacct.txt create mode 100755 test_e2e/cases/test_04/slurm_env/slurmenv.sh create mode 100644 test_e2e/cases/test_04/test.yaml diff --git a/Makefile b/Makefile index 74933b9..e78dd47 100644 --- 
a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ config=cmd/goslmailer/goslmailer.conf.annotated_example cmd/gobler/gobler.conf # can be replaced with go test ./... construct testdirs=$(sort $(dir $(shell find ./ -name *_test.go))) -all: list test build test_endly install +all: list test build get_endly test_endly install list: @echo "================================================================================" @@ -67,10 +67,14 @@ test: @echo "********************************************************************************" go test -v -count=1 ./... -get_endly: +endly_linux_$(endly_version).tar.gz: curl -L -O https://github.com/viant/endly/releases/download/v$(endly_version)/endly_linux_$(endly_version).tar.gz + +test_e2e/endly: tar -C test_e2e/ -xzf endly_linux_$(endly_version).tar.gz +get_endly: endly_linux_$(endly_version).tar.gz test_e2e/endly + test_endly: cd test_e2e ./endly diff --git a/templates/adaptive_card_template.json b/templates/adaptive_card_template.json index e4628e5..d8d3334 100644 --- a/templates/adaptive_card_template.json +++ b/templates/adaptive_card_template.json @@ -235,3 +235,4 @@ } ] } + diff --git a/test_e2e/README b/test_e2e/README deleted file mode 100644 index 0893ab9..0000000 --- a/test_e2e/README +++ /dev/null @@ -1,4 +0,0 @@ -* sacct comes from config -* template config to point to /tmp/sacct script -* copy sacct/* folder there -* goslmailer: get conf path from env var GOSLMAILER_CONFIG, else default /etc/slurm/goslmailer.conf diff --git a/test_e2e/README.md b/test_e2e/README.md new file mode 100644 index 0000000..0c1c3e4 --- /dev/null +++ b/test_e2e/README.md @@ -0,0 +1,41 @@ +# Test cases + +1. [test_00](./cases/test_00/README.md) +2. [test_01](./cases/test_01/README.md) +3. [test_02](./cases/test_02/README.md) + +--- + +## test_00 +--- + +Run all binaries without the config file. + +--- +## test_01 +--- + +1. run goslmailer, save to gob +2. 
run gobler, render gob to file + +--- +## test_02 +--- + +goslmailer runs with broken sacct line (-j jobid missing) + +--- +## test_03 +--- + +goslmailer render msteams json to file (actual data) +Job start + +--- +## test_04 +--- + +goslmailer render msteams json to file (actual data) +Job end - fail + +--- diff --git a/test_e2e/cases/test_00/README.txt b/test_e2e/cases/test_00/README.md similarity index 69% rename from test_e2e/cases/test_00/README.txt rename to test_e2e/cases/test_00/README.md index dce7364..28d6483 100644 --- a/test_e2e/cases/test_00/README.txt +++ b/test_e2e/cases/test_00/README.md @@ -1,6 +1,6 @@ -test_00 -------- +## test_00 +--- Run all binaries without the config file. - +--- diff --git a/test_e2e/cases/test_00/test.yaml b/test_e2e/cases/test_00/test.yaml index e6d94c1..b4be165 100644 --- a/test_e2e/cases/test_00/test.yaml +++ b/test_e2e/cases/test_00/test.yaml @@ -1,5 +1,4 @@ init: - test_readme: '${twd}/README.txt' defaults: message: "Running test $i from $twd" @@ -13,11 +12,6 @@ pipeline: action: workflow:print style: 1 - print_test_case: - description: "Test README" - action: workflow:print - message: $Cat('${test_readme}') - run_goslmailer: action: exec:run checkError: false diff --git a/test_e2e/cases/test_01/README.txt b/test_e2e/cases/test_01/README.md similarity index 76% rename from test_e2e/cases/test_01/README.txt rename to test_e2e/cases/test_01/README.md index 90811a5..d981d59 100644 --- a/test_e2e/cases/test_01/README.txt +++ b/test_e2e/cases/test_01/README.md @@ -1,7 +1,7 @@ -test_01 -------- +## test_01 +--- 1. run gosler, save to gob 2. 
run gobler, render gob to file - +--- diff --git a/test_e2e/cases/test_01/test.yaml b/test_e2e/cases/test_01/test.yaml index 13fff90..3bddedb 100644 --- a/test_e2e/cases/test_01/test.yaml +++ b/test_e2e/cases/test_01/test.yaml @@ -1,5 +1,4 @@ init: - test_readme: '${twd}/README.txt' defaults: message: "Running test $i from $twd" @@ -13,11 +12,6 @@ pipeline: action: workflow:print style: 1 - print_test_case: - description: "Test README" - action: workflow:print - message: $Cat('${test_readme}') - deploy_conf_files: action: storage:copy source: @@ -53,7 +47,7 @@ pipeline: watch: true immuneToHangups: true command: gobler -c /tmp/gobler.conf - timeoutMs: 10000 + timeoutMs: 5000 run_sleep: action: exec:run diff --git a/test_e2e/cases/test_02/README.md b/test_e2e/cases/test_02/README.md new file mode 100644 index 0000000..9c5cb69 --- /dev/null +++ b/test_e2e/cases/test_02/README.md @@ -0,0 +1,6 @@ +## test_02 +--- + +goslmailer runs with broken sacct line (-j jobid missing) + +--- diff --git a/test_e2e/cases/test_02/conf/adaptive_card_template.json b/test_e2e/cases/test_02/conf/adaptive_card_template.json new file mode 100644 index 0000000..e4628e5 --- /dev/null +++ b/test_e2e/cases/test_02/conf/adaptive_card_template.json @@ -0,0 +1,237 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"{{ .Job.MailSubject }} {{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}", + "wrap":true, + 
"size":"Large", + {{ if or (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "TIMEOUT") ((eq .Job.SlurmEnvironment.SLURM_JOB_STATE "OUT_OF_MEMORY")) }}"color":"Attention"{{ else }}"color":"Good"{{ end }} + }, + { + "type":"TextBlock", + "spacing":"none", + "text":"Created {{ .Created }}", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + {{ if ne .Job.PrunedMessageCount 0 }} + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"WARNING: Rate limiting triggered. {{ .Job.PrunedMessageCount }} additonal notificiations have been suppressed", + "style":"heading", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NAME }}" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_ID }}" + }, + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"Partition", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_PARTITION }}" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NODELIST }}" + }, + { + "type":"Fact", + "title":"Cores", + "value":"{{ .Job.JobStats.Ncpus }}" + }, + { + "type":"Fact", + "title":"Job state", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"Exit Code", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Submit", + "value":"{{ .Job.JobStats.Submittime }}" + }, + { + "type":"Fact", + "title":"Start", + "value":"{{ .Job.JobStats.Starttime }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + 
"title":"End", + "value":"{{ .Job.JobStats.Endtime }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"{{ .Job.JobStats.WalltimeStr }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Used Walltime", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }} + { + "type":"Fact", + "title":"Used CPU time", + "value":"{{ .Job.JobStats.TotalCPUStr }}" + }, + { + "type":"Fact", + "title":"% User (Computation)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcUserComputePercentage }}' + }, + { + "type":"Fact", + "title":"% System (I/O)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcSystemComputePercentage }}' + }, + {{ end }} + {{ end }} + { + "type":"Fact", + "title":"Memory Requested", + "value":"{{ .Job.JobStats.ReqMem | humanBytes }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Max Memory Used", + "value":"{{ .Job.JobStats.MaxRSS | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Write", + "value":"{{ .Job.JobStats.MaxDiskWrite | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Read", + "value":"{{ .Job.JobStats.MaxDiskRead | humanBytes }}" + } + {{ end }} + ] + }, + {{ range .Job.Hints }} + { + "type":"TextBlock", + "text":"{{ . 
}}", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}", + "mentioned":{ + "id":"{{ .UserID }}", + "name":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_02/conf/gobler.conf b/test_e2e/cases/test_02/conf/gobler.conf new file mode 100755 index 0000000..876fe78 --- /dev/null +++ b/test_e2e/cases/test_02/conf/gobler.conf @@ -0,0 +1,19 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "connectors": { + "msteams": { + "name": "dev channel", + "renderToFile": "yes", + "spoolDir": "/tmp", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "url": "http://localhost:9999/", + "useLookup": "no", + "monitorT": "10000ms", + "pickerT": "1000ms", + "psBufLen": "3", + "numSenders": "3", + "maxMsgPU": "6" + } + } +} diff --git a/test_e2e/cases/test_02/conf/goslmailer.conf b/test_e2e/cases/test_02/conf/goslmailer.conf new file mode 100755 index 0000000..fc53e4c --- /dev/null +++ b/test_e2e/cases/test_02/conf/goslmailer.conf @@ -0,0 +1,14 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "binpaths": { + "sacct": "/tmp/sacct" + }, + "connectors": { + "msteams": { + "renderToFile": "spool", + "spoolDir": "/tmp", + "useLookup": "no" + } + } +} diff --git a/test_e2e/cases/test_02/sacct/sacct b/test_e2e/cases/test_02/sacct/sacct new file mode 
100755 index 0000000..a6384eb --- /dev/null +++ b/test_e2e/cases/test_02/sacct/sacct @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +cwd=`dirname $0` +cat ${cwd}/sacct.txt +exit 1 diff --git a/test_e2e/cases/test_02/sacct/sacct.txt b/test_e2e/cases/test_02/sacct/sacct.txt new file mode 100644 index 0000000..a914ccf --- /dev/null +++ b/test_e2e/cases/test_02/sacct/sacct.txt @@ -0,0 +1 @@ +sacct: fatal: Bad job/step specified: -n diff --git a/test_e2e/cases/test_02/slurm_env/slurmenv.sh b/test_e2e/cases/test_02/slurm_env/slurmenv.sh new file mode 100755 index 0000000..9135535 --- /dev/null +++ b/test_e2e/cases/test_02/slurm_env/slurmenv.sh @@ -0,0 +1,28 @@ +#!/usr/bin/bash + +unset SLURM_JOB_NAME +unset SLURM_JOB_GROUP +unset SLURM_JOB_STATE +unset SLURM_ARRAY_JOB_ID +unset SLURM_JOB_WORK_DIR +unset SLURM_JOB_MAIL_TYPE +unset SLURM_JOBID +unset SLURM_ARRAY_TASK_ID +unset SLURM_JOB_RUN_TIME +unset SLURM_ARRAY_TASK_COUNT +unset SLURM_JOB_EXIT_CODE2 +unset SLURM_JOB_DERIVED_EC +unset SLURM_JOB_ID +unset SLURM_JOB_USER +unset SLURM_ARRAY_TASK_MAX +unset SLURM_JOB_EXIT_CODE +unset SLURM_JOB_UID +unset SLURM_JOB_NODELIST +unset SLURM_ARRAY_TASK_MIN +unset SLURM_JOB_STDIN +unset SLURM_ARRAY_TASK_STEP +unset SLURM_JOB_EXIT_CODE_MAX +unset SLURM_JOB_GID +unset SLURM_CLUSTER_NAME +unset SLURM_JOB_PARTITION +unset SLURM_JOB_ACCOUNT diff --git a/test_e2e/cases/test_02/test.yaml b/test_e2e/cases/test_02/test.yaml new file mode 100644 index 0000000..31748c8 --- /dev/null +++ b/test_e2e/cases/test_02/test.yaml @@ -0,0 +1,71 @@ +init: + test_readme: '${twd}/README.md' + +defaults: + message: "Running test $i from $twd" + systempaths: + - $bwd + +pipeline: + + print_welcome: + description: "Current test" + action: workflow:print + style: 1 + + deploy_conf_files: + action: storage:copy + source: + URL: $twd/conf + dest: + URL: /tmp + + deploy_sacct_files: + action: storage:copy + source: + URL: $twd/sacct + dest: + URL: /tmp + + run_goslmailer: + action: exec:run + checkError: true + env: + 
GOSLMAILER_CONF: /tmp/goslmailer.conf + commands: + - source $twd/slurm_env/slurmenv.sh + - goslmailer -s "slurm job x" pja + + test_assert_goslmailer: + action: validator:assert + expect: + - '/Deposit gob OK!/' + actual: + - $run_goslmailer.Output + + run_gobler: + action: process:start + watch: true + immuneToHangups: true + command: gobler -c /tmp/gobler.conf + timeoutMs: 5000 + + run_sleep: + action: exec:run + checkError: true + commands: + - sleep 5 + + stop_gobler: + action: process:stop + pid: $run_gobler.Pid + + # https://github.com/viant/assertly#validation + test_assert_gobler: + action: validator:assert + expect: + - '~/Send successful to file: rendered-\d*-pja-/' + - '~/SENDER msteams#\d: Gob deleted/' + actual: + - $run_gobler.Stdout + - $run_gobler.Stdout diff --git a/test_e2e/cases/test_03/README.md b/test_e2e/cases/test_03/README.md new file mode 100644 index 0000000..ffcb7c2 --- /dev/null +++ b/test_e2e/cases/test_03/README.md @@ -0,0 +1,7 @@ +## test_03 +--- + +goslmailer render msteams json to file (actual data) +Job start + +--- diff --git a/test_e2e/cases/test_03/conf/adaptive_card_template.json b/test_e2e/cases/test_03/conf/adaptive_card_template.json new file mode 100644 index 0000000..e4628e5 --- /dev/null +++ b/test_e2e/cases/test_03/conf/adaptive_card_template.json @@ -0,0 +1,237 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"{{ .Job.MailSubject }} {{ 
.Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}", + "wrap":true, + "size":"Large", + {{ if or (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "TIMEOUT") ((eq .Job.SlurmEnvironment.SLURM_JOB_STATE "OUT_OF_MEMORY")) }}"color":"Attention"{{ else }}"color":"Good"{{ end }} + }, + { + "type":"TextBlock", + "spacing":"none", + "text":"Created {{ .Created }}", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + {{ if ne .Job.PrunedMessageCount 0 }} + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"WARNING: Rate limiting triggered. {{ .Job.PrunedMessageCount }} additonal notificiations have been suppressed", + "style":"heading", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NAME }}" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_ID }}" + }, + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"Partition", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_PARTITION }}" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NODELIST }}" + }, + { + "type":"Fact", + "title":"Cores", + "value":"{{ .Job.JobStats.Ncpus }}" + }, + { + "type":"Fact", + "title":"Job state", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"Exit Code", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Submit", + "value":"{{ .Job.JobStats.Submittime }}" + }, + { + "type":"Fact", + "title":"Start", + "value":"{{ .Job.JobStats.Starttime }}" + }, + {{ if ne 
.Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"End", + "value":"{{ .Job.JobStats.Endtime }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"{{ .Job.JobStats.WalltimeStr }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Used Walltime", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }} + { + "type":"Fact", + "title":"Used CPU time", + "value":"{{ .Job.JobStats.TotalCPUStr }}" + }, + { + "type":"Fact", + "title":"% User (Computation)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcUserComputePercentage }}' + }, + { + "type":"Fact", + "title":"% System (I/O)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcSystemComputePercentage }}' + }, + {{ end }} + {{ end }} + { + "type":"Fact", + "title":"Memory Requested", + "value":"{{ .Job.JobStats.ReqMem | humanBytes }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Max Memory Used", + "value":"{{ .Job.JobStats.MaxRSS | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Write", + "value":"{{ .Job.JobStats.MaxDiskWrite | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Read", + "value":"{{ .Job.JobStats.MaxDiskRead | humanBytes }}" + } + {{ end }} + ] + }, + {{ range .Job.Hints }} + { + "type":"TextBlock", + "text":"{{ . 
}}", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}", + "mentioned":{ + "id":"{{ .UserID }}", + "name":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_03/conf/gobler.conf b/test_e2e/cases/test_03/conf/gobler.conf new file mode 100755 index 0000000..876fe78 --- /dev/null +++ b/test_e2e/cases/test_03/conf/gobler.conf @@ -0,0 +1,19 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "connectors": { + "msteams": { + "name": "dev channel", + "renderToFile": "yes", + "spoolDir": "/tmp", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "url": "http://localhost:9999/", + "useLookup": "no", + "monitorT": "10000ms", + "pickerT": "1000ms", + "psBufLen": "3", + "numSenders": "3", + "maxMsgPU": "6" + } + } +} diff --git a/test_e2e/cases/test_03/conf/goslmailer.conf b/test_e2e/cases/test_03/conf/goslmailer.conf new file mode 100755 index 0000000..7315c4e --- /dev/null +++ b/test_e2e/cases/test_03/conf/goslmailer.conf @@ -0,0 +1,22 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "binpaths": { + "sacct": "/tmp/sacct", + "sstat": "/tmp/sstat" + }, + "connectors": { + "msteams": { + "renderToFile": "yes", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "spoolDir": "/tmp", + "useLookup": "no" + } + }, + "qosmap": { + 
"3600": "RAPID", + "28800": "SHORT", + "172800": "MEDIUM", + "1209600": "LONG" + } +} diff --git a/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json b/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json new file mode 100644 index 0000000..8f35305 --- /dev/null +++ b/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json @@ -0,0 +1,168 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"Job 1052477 Began", + "wrap":true, + "size":"Large", + "color":"Good" + }, + { + "type":"TextBlock", + "spacing":"none", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"endlyJobFail" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"1052477" + }, + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"Partition", + "value":"c" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"stg-c2-0" + }, + { + "type":"Fact", + "title":"Cores", + "value":"1" + }, + { + "type":"Fact", + "title":"Job state", + "value":"RUNNING" + }, + + { + "type":"Fact", + "title":"Submit", + 
"value":"2022-05-24T07:43:07" + }, + { + "type":"Fact", + "title":"Start", + "value":"2022-05-24T07:43:07" + }, + + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"08:00:00" + }, + + { + "type":"Fact", + "title":"Memory Requested", + "value":"4.3 GB" + }, + + ] + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"Began" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"petar.jager", + "mentioned":{ + "id":"petar.jager@imba.oeaw.ac.at", + "name":"petar.jager" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_03/sacct/sacct b/test_e2e/cases/test_03/sacct/sacct new file mode 100755 index 0000000..6300143 --- /dev/null +++ b/test_e2e/cases/test_03/sacct/sacct @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +cwd=`dirname $0` +cat ${cwd}/sacct.txt diff --git a/test_e2e/cases/test_03/sacct/sacct.txt b/test_e2e/cases/test_03/sacct/sacct.txt new file mode 100644 index 0000000..1139010 --- /dev/null +++ b/test_e2e/cases/test_03/sacct/sacct.txt @@ -0,0 +1,3 @@ +endlyJobFail|petar.jager|c|stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown|08:00:00|00:00:14|00:00:14|00:00:00|00:00:00|00:00:00|4G|||||||| +batch|||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| +extern|||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| diff --git a/test_e2e/cases/test_03/sacct/sstat b/test_e2e/cases/test_03/sacct/sstat new file mode 100755 index 0000000..440d73f --- /dev/null +++ 
b/test_e2e/cases/test_03/sacct/sstat @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +cwd=`dirname $0` +cat ${cwd}/sstat.txt diff --git a/test_e2e/cases/test_03/sacct/sstat.txt b/test_e2e/cases/test_03/sacct/sstat.txt new file mode 100644 index 0000000..4146297 --- /dev/null +++ b/test_e2e/cases/test_03/sacct/sstat.txt @@ -0,0 +1,2 @@ +1052477.extern|0|0|2012|stg-c2-0|stg-c2-0|stg-c2-0| +1052477.batch|344K|36|8267|stg-c2-0|stg-c2-0|stg-c2-0| diff --git a/test_e2e/cases/test_03/slurm_env/slurmenv.sh b/test_e2e/cases/test_03/slurm_env/slurmenv.sh new file mode 100755 index 0000000..155853d --- /dev/null +++ b/test_e2e/cases/test_03/slurm_env/slurmenv.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +export SLURM_ARRAY_JOB_ID="" +export SLURM_ARRAY_TASK_COUNT="" +export SLURM_ARRAY_TASK_ID="" +export SLURM_ARRAY_TASK_MAX="" +export SLURM_ARRAY_TASK_MIN="" +export SLURM_ARRAY_TASK_STEP="" +export SLURM_CLUSTER_NAME="clip" +export SLURM_JOB_ACCOUNT="hpc" +export SLURM_JOB_DERIVED_EC="" +export SLURM_JOB_EXIT_CODE="" +export SLURM_JOB_EXIT_CODE2="" +export SLURM_JOB_EXIT_CODE_MAX="" +export SLURM_JOB_EXIT_CODE_MIN="" +export SLURM_JOB_GID="1999" +export SLURM_JOB_GROUP="is.grp" +export SLURM_JOBID="1052477" +export SLURM_JOB_ID="1052477" +export SLURM_JOB_MAIL_TYPE="Began" +export SLURM_JOB_NAME="endlyJobFail" +export SLURM_JOB_NODELIST="stg-c2-0" +export SLURM_JOB_PARTITION="c" +export SLURM_JOB_QUEUED_TIME="00:00:00" +export SLURM_JOB_RUN_TIME="" +export SLURM_JOB_STATE="RUNNING" +export SLURM_JOB_STDIN="/dev/null" +export SLURM_JOB_UID="58546" +export SLURM_JOB_USER="petar.jager" +export SLURM_JOB_WORK_DIR="/users/petar.jager" diff --git a/test_e2e/cases/test_03/test.yaml b/test_e2e/cases/test_03/test.yaml new file mode 100644 index 0000000..8b2dee0 --- /dev/null +++ b/test_e2e/cases/test_03/test.yaml @@ -0,0 +1,65 @@ +init: + test_readme: '${twd}/README.md' + +defaults: + message: "Running test $i from $twd" + systempaths: + - $bwd + +pipeline: + + print_welcome: + description: 
"Current test" + action: workflow:print + style: 1 + + deploy_conf_files: + action: storage:copy + source: + URL: $twd/conf + dest: + URL: /tmp + + deploy_sacct_files: + action: storage:copy + source: + URL: $twd/sacct + dest: + URL: /tmp + + run_goslmailer: + action: exec:extract + checkError: true + env: + GOSLMAILER_CONF: /tmp/goslmailer.conf + commands: + - command: source $twd/slurm_env/slurmenv.sh + - command: goslmailer -s "Slurm Job_id=1052477 Name=endlyJobFail Failed, Run time 00:00:30, FAILED, ExitCode 1" petar.jager@imba.oeaw.ac.at + extract: + - key: rfile + regExpr: 'Send successful to file: (rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json)' + required: true + + debug_extract: + action: workflow:print + message: "GOT: $rfile" + + test_diff: + action: exec:run + checkError: true + commands: + - sed -i -e '/"text":"Created /d' $WorkingDirectory()/$rfile + - diff $WorkingDirectory()/$rfile $twd/results/*.json && echo RESULTS MATCH + + test_assert_goslmailer: + action: validator:assert + expect: + - '~/Send successful to file: rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json/' + actual: + - $run_goslmailer.Output + + + # todo: + # add test: + # jq . 
rendered.json >/dev/null || echo FAILED + diff --git a/test_e2e/cases/test_04/README.md b/test_e2e/cases/test_04/README.md new file mode 100644 index 0000000..5c3f981 --- /dev/null +++ b/test_e2e/cases/test_04/README.md @@ -0,0 +1,7 @@ +## test_04 +--- + +goslmailer render msteams json to file (actual data) +Job end - fail + +--- diff --git a/test_e2e/cases/test_04/conf/adaptive_card_template.json b/test_e2e/cases/test_04/conf/adaptive_card_template.json new file mode 100644 index 0000000..e4628e5 --- /dev/null +++ b/test_e2e/cases/test_04/conf/adaptive_card_template.json @@ -0,0 +1,237 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"{{ .Job.MailSubject }} {{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}", + "wrap":true, + "size":"Large", + {{ if or (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "TIMEOUT") ((eq .Job.SlurmEnvironment.SLURM_JOB_STATE "OUT_OF_MEMORY")) }}"color":"Attention"{{ else }}"color":"Good"{{ end }} + }, + { + "type":"TextBlock", + "spacing":"none", + "text":"Created {{ .Created }}", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + {{ if ne .Job.PrunedMessageCount 0 }} + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"WARNING: Rate limiting triggered. 
{{ .Job.PrunedMessageCount }} additonal notificiations have been suppressed", + "style":"heading", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NAME }}" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_ID }}" + }, + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"Partition", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_PARTITION }}" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NODELIST }}" + }, + { + "type":"Fact", + "title":"Cores", + "value":"{{ .Job.JobStats.Ncpus }}" + }, + { + "type":"Fact", + "title":"Job state", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"Exit Code", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Submit", + "value":"{{ .Job.JobStats.Submittime }}" + }, + { + "type":"Fact", + "title":"Start", + "value":"{{ .Job.JobStats.Starttime }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"End", + "value":"{{ .Job.JobStats.Endtime }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"{{ .Job.JobStats.WalltimeStr }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Used Walltime", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }} + { + "type":"Fact", + "title":"Used CPU time", + "value":"{{ .Job.JobStats.TotalCPUStr }}" + }, + { + "type":"Fact", + "title":"% User (Computation)", 
+ "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcUserComputePercentage }}' + }, + { + "type":"Fact", + "title":"% System (I/O)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcSystemComputePercentage }}' + }, + {{ end }} + {{ end }} + { + "type":"Fact", + "title":"Memory Requested", + "value":"{{ .Job.JobStats.ReqMem | humanBytes }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Max Memory Used", + "value":"{{ .Job.JobStats.MaxRSS | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Write", + "value":"{{ .Job.JobStats.MaxDiskWrite | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Read", + "value":"{{ .Job.JobStats.MaxDiskRead | humanBytes }}" + } + {{ end }} + ] + }, + {{ range .Job.Hints }} + { + "type":"TextBlock", + "text":"{{ . }}", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}", + "mentioned":{ + "id":"{{ .UserID }}", + "name":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_04/conf/gobler.conf b/test_e2e/cases/test_04/conf/gobler.conf new file mode 100755 index 0000000..876fe78 --- /dev/null +++ b/test_e2e/cases/test_04/conf/gobler.conf @@ -0,0 +1,19 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "connectors": { + "msteams": { + "name": "dev 
channel", + "renderToFile": "yes", + "spoolDir": "/tmp", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "url": "http://localhost:9999/", + "useLookup": "no", + "monitorT": "10000ms", + "pickerT": "1000ms", + "psBufLen": "3", + "numSenders": "3", + "maxMsgPU": "6" + } + } +} diff --git a/test_e2e/cases/test_04/conf/goslmailer.conf b/test_e2e/cases/test_04/conf/goslmailer.conf new file mode 100755 index 0000000..7315c4e --- /dev/null +++ b/test_e2e/cases/test_04/conf/goslmailer.conf @@ -0,0 +1,22 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "binpaths": { + "sacct": "/tmp/sacct", + "sstat": "/tmp/sstat" + }, + "connectors": { + "msteams": { + "renderToFile": "yes", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "spoolDir": "/tmp", + "useLookup": "no" + } + }, + "qosmap": { + "3600": "RAPID", + "28800": "SHORT", + "172800": "MEDIUM", + "1209600": "LONG" + } +} diff --git a/test_e2e/cases/test_04/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653372112324147944.json b/test_e2e/cases/test_04/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653372112324147944.json new file mode 100644 index 0000000..bbabff8 --- /dev/null +++ b/test_e2e/cases/test_04/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653372112324147944.json @@ -0,0 +1,247 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"Job 1052477 Failed", + "wrap":true, + 
"size":"Large", + "color":"Attention" + }, + { + "type":"TextBlock", + "spacing":"none", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"endlyJobFail" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"1052477" + }, + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"Partition", + "value":"c" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"stg-c2-0" + }, + { + "type":"Fact", + "title":"Cores", + "value":"1" + }, + { + "type":"Fact", + "title":"Job state", + "value":"FAILED" + }, + + { + "type":"Fact", + "title":"Exit Code", + "value":"1" + }, + + { + "type":"Fact", + "title":"Submit", + "value":"2022-05-24T07:43:07" + }, + { + "type":"Fact", + "title":"Start", + "value":"2022-05-24T07:43:07" + }, + + { + "type":"Fact", + "title":"End", + "value":"2022-05-24T07:43:37" + }, + + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"08:00:00" + }, + + { + "type":"Fact", + "title":"Used Walltime", + "value":"00:00:30" + }, + + { + "type":"Fact", + "title":"Used CPU time", + "value":"00:00.007" + }, + { + "type":"Fact", + "title":"% User (Computation)", + "value":'28.57%' + }, + { + "type":"Fact", + "title":"% System (I/O)", + "value":'71.43%' + }, + + + { + "type":"Fact", + "title":"Memory Requested", + "value":"4.3 GB" + }, + + { + "type":"Fact", + "title":"Max Memory Used", + "value":"352 kB" + }, + { + "type":"Fact", + "title":"Max Disk Write", + "value":"0 B" + }, + { + "type":"Fact", + "title":"Max Disk Read", + "value":"10 kB" + } + + ] + }, + + { + "type":"TextBlock", + "text":"TIP: Please consider lowering the ammount of requested memory in the future, your job has consumed less then half of the requested memory.", + "wrap":true, + "color":"Attention" + }, + + { + 
"type":"TextBlock", + "text":"TIP: Please consider lowering the amount of requested CPU cores in the future, your job has consumed less than half of requested CPU cores", + "wrap":true, + "color":"Attention" + }, + + { + "type":"TextBlock", + "text":"TIP: Your job was submitted to SHORT QOS and finished within half of the requested walltime. Consider submitting it to the RAPID QOS instead", + "wrap":true, + "color":"Attention" + }, + + { + "type":"TextBlock", + "text":"TIP: No --time specified: Using default SHORT QOS limit. Specify --time to increase the chances that the scheduler will use this job for backfilling purposes", + "wrap":true, + "color":"Attention" + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"Failed" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"petar.jager", + "mentioned":{ + "id":"petar.jager@imba.oeaw.ac.at", + "name":"petar.jager" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_04/sacct/sacct b/test_e2e/cases/test_04/sacct/sacct new file mode 100755 index 0000000..6300143 --- /dev/null +++ b/test_e2e/cases/test_04/sacct/sacct @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +cwd=`dirname $0` +cat ${cwd}/sacct.txt diff --git a/test_e2e/cases/test_04/sacct/sacct.txt b/test_e2e/cases/test_04/sacct/sacct.txt new file mode 100644 index 0000000..8605721 --- /dev/null +++ b/test_e2e/cases/test_04/sacct/sacct.txt @@ -0,0 +1,3 @@ 
+endlyJobFail|petar.jager|c|stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37|08:00:00|00:00:30|00:00:30|00:00.007|00:00.002|00:00.005|4G|||||||| +batch|||stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.006|00:00.002|00:00.003||344K|0.00M|0.01M|stg-c2-0|stg-c2-0|stg-c2-0|| +extern|||stg-c2-0|1|COMPLETED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.001|00:00:00|00:00.001||0|0|0.00M|stg-c2-0|stg-c2-0|stg-c2-0|| diff --git a/test_e2e/cases/test_04/slurm_env/slurmenv.sh b/test_e2e/cases/test_04/slurm_env/slurmenv.sh new file mode 100755 index 0000000..edbbd76 --- /dev/null +++ b/test_e2e/cases/test_04/slurm_env/slurmenv.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +export SLURM_ARRAY_JOB_ID="" +export SLURM_ARRAY_TASK_COUNT="" +export SLURM_ARRAY_TASK_ID="" +export SLURM_ARRAY_TASK_MAX="" +export SLURM_ARRAY_TASK_MIN="" +export SLURM_ARRAY_TASK_STEP="" +export SLURM_CLUSTER_NAME="clip" +export SLURM_JOB_ACCOUNT="hpc" +export SLURM_JOB_DERIVED_EC="0" +export SLURM_JOB_EXIT_CODE="256" +export SLURM_JOB_EXIT_CODE2="1:0" +export SLURM_JOB_EXIT_CODE_MAX="1" +export SLURM_JOB_EXIT_CODE_MIN="" +export SLURM_JOB_GID="1999" +export SLURM_JOB_GROUP="is.grp" +export SLURM_JOBID="1052477" +export SLURM_JOB_ID="1052477" +export SLURM_JOB_MAIL_TYPE="Failed" +export SLURM_JOB_NAME="endlyJobFail" +export SLURM_JOB_NODELIST="stg-c2-0" +export SLURM_JOB_PARTITION="c" +export SLURM_JOB_QUEUED_TIME="" +export SLURM_JOB_RUN_TIME="00:00:30" +export SLURM_JOB_STATE="FAILED" +export SLURM_JOB_STDIN="/dev/null" +export SLURM_JOB_UID="58546" +export SLURM_JOB_USER="petar.jager" +export SLURM_JOB_WORK_DIR="/users/petar.jager" diff --git a/test_e2e/cases/test_04/test.yaml b/test_e2e/cases/test_04/test.yaml new file mode 100644 index 0000000..e89f9a2 --- /dev/null +++ b/test_e2e/cases/test_04/test.yaml @@ -0,0 +1,63 @@ +init: + test_readme: '${twd}/README.md' + +defaults: + message: 
"Running test $i from $twd" + systempaths: + - $bwd + +pipeline: + + print_welcome: + description: "Current test" + action: workflow:print + style: 1 + + deploy_conf_files: + action: storage:copy + source: + URL: $twd/conf + dest: + URL: /tmp + + deploy_sacct_files: + action: storage:copy + source: + URL: $twd/sacct + dest: + URL: /tmp + + run_goslmailer: + action: exec:extract + checkError: true + env: + GOSLMAILER_CONF: /tmp/goslmailer.conf + commands: + - command: source $twd/slurm_env/slurmenv.sh + - command: goslmailer -s "Slurm Job_id=1052477 Name=endlyJobFail Failed, Run time 00:00:30, FAILED, ExitCode 1" petar.jager@imba.oeaw.ac.at + extract: + - key: rfile + regExpr: 'Send successful to file: (rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json)' + required: true + + debug_extract: + action: workflow:print + message: "GOT: $rfile" + + test_diff: + action: exec:run + checkError: true + commands: + - sed -i -e '/"text":"Created /d' $WorkingDirectory()/$rfile + - diff $WorkingDirectory()/$rfile $twd/results/*.json && echo RESULTS MATCH + + test_assert_goslmailer: + action: validator:assert + expect: + - '~/Send successful to file: rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json/' + actual: + - $run_goslmailer.Output + + # todo: + # add test: + # jq . 
rendered.json > /dev/null || echo FAILED \ No newline at end of file diff --git a/test_e2e/run.yaml b/test_e2e/run.yaml index 405da51..90500f7 100644 --- a/test_e2e/run.yaml +++ b/test_e2e/run.yaml @@ -5,13 +5,19 @@ init: pipeline: loop_over_tests: - range: 0..01 + range: 0..04 subPath: cases/test_${index} template: + setup_print: action: workflow:print message: "Running case ${index} on path $path" + print_test_case: + description: "Test $index README" + action: workflow:print + message: $Cat('cases/test_${index}/README.md') + run_test: action: workflow:run request: '@cases/test_${index}/test' From f5de4801d6d51fbb8bd9ab004bbb16de6d738ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=9Cmit=20Seren?= Date: Mon, 30 May 2022 16:38:07 +0200 Subject: [PATCH 2/4] Add support for SLURM < 21.08.x and improve error handling In older SLURM versions (< 21.08.x) the mail program is executed without setting any SLURM job environment variables (#4) We fall back to parsing the subject line that is passed to the mail program to retrieve the jobid and other information such as job state and mail type. Additionally, the functions for retrieving job related information via sacct and sstat now properly return error messages if the call fails. This fixes #7 We added some end2end tests to test the above fixes. 
--- cmd/goslmailer/goslmailer.go | 5 +- internal/slurmjob/getjobcontext.go | 102 ++++- internal/slurmjob/getjobcontext_test.go | 181 +++++--- internal/slurmjob/sacct.go | 426 +++++++++--------- internal/slurmjob/sacct_test.go | 212 +++++---- test_data/sacct.txt | 6 +- test_e2e/cases/test_00/sacct/sacct.txt | 6 +- test_e2e/cases/test_01/conf/gobler.conf | 2 +- test_e2e/cases/test_01/sacct/sacct.txt | 6 +- test_e2e/cases/test_01/test.yaml | 46 +- test_e2e/cases/test_02/test.yaml | 39 +- ...r@imba.oeaw.ac.at-1653378962712164702.json | 2 +- test_e2e/cases/test_03/sacct/sacct.txt | 6 +- test_e2e/cases/test_03/slurm_env/slurmenv.sh | 2 +- test_e2e/cases/test_03/test.yaml | 10 +- test_e2e/cases/test_04/sacct/sacct.txt | 6 +- test_e2e/cases/test_05/README.md | 6 + .../test_05/conf/adaptive_card_template.json | 237 ++++++++++ test_e2e/cases/test_05/conf/gobler.conf | 19 + test_e2e/cases/test_05/conf/goslmailer.conf | 22 + ...r@imba.oeaw.ac.at-1653378962712164702.json | 168 +++++++ test_e2e/cases/test_05/sacct/sacct | 3 + .../cases/test_05/sacct/sacct_1052477.txt | 3 + test_e2e/cases/test_05/sacct/sstat | 4 + .../cases/test_05/sacct/sstat_1052477.txt | 2 + test_e2e/cases/test_05/slurm_env/slurmenv.sh | 28 ++ test_e2e/cases/test_05/test.yaml | 65 +++ test_e2e/run.yaml | 4 +- 28 files changed, 1180 insertions(+), 438 deletions(-) create mode 100644 test_e2e/cases/test_05/README.md create mode 100644 test_e2e/cases/test_05/conf/adaptive_card_template.json create mode 100644 test_e2e/cases/test_05/conf/gobler.conf create mode 100644 test_e2e/cases/test_05/conf/goslmailer.conf create mode 100644 test_e2e/cases/test_05/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json create mode 100644 test_e2e/cases/test_05/sacct/sacct create mode 100644 test_e2e/cases/test_05/sacct/sacct_1052477.txt create mode 100644 test_e2e/cases/test_05/sacct/sstat create mode 100644 test_e2e/cases/test_05/sacct/sstat_1052477.txt create mode 100644 
test_e2e/cases/test_05/slurm_env/slurmenv.sh create mode 100644 test_e2e/cases/test_05/test.yaml diff --git a/cmd/goslmailer/goslmailer.go b/cmd/goslmailer/goslmailer.go index 3e6a2fa..822c24c 100644 --- a/cmd/goslmailer/goslmailer.go +++ b/cmd/goslmailer/goslmailer.go @@ -67,7 +67,10 @@ func main() { // get job statistics based on the SLURM_JOB_ID from slurmEnv struct // only if job is END or FAIL(?) - job.GetJobStats(log, ic.CmdParams.Subject, cfg.Binpaths) + err = job.GetJobStats(ic.CmdParams.Subject, cfg.Binpaths, log) + if err != nil { + log.Fatalf("Unable to retrieve job stats. Error: %v", err) + } // generate hints based on SlurmEnv and JobStats (e.g. "too much memory requested" or "walltime << requested queue") // only if job is END or fail(?) diff --git a/internal/slurmjob/getjobcontext.go b/internal/slurmjob/getjobcontext.go index ab7ba82..5e7bc19 100644 --- a/internal/slurmjob/getjobcontext.go +++ b/internal/slurmjob/getjobcontext.go @@ -1,9 +1,11 @@ package slurmjob import ( + "errors" "fmt" "log" "os" + "regexp" "sort" "strings" "time" @@ -111,7 +113,8 @@ func IsJobFinished(jobState string) bool { "FAILED", "COMPLETED", "OUT_OF_MEMORY", - "TIMEOUT": + "TIMEOUT", + "Mixed": return true } return false @@ -121,31 +124,113 @@ func (j *JobContext) IsJobFinished() bool { return IsJobFinished(j.SlurmEnvironment.SLURM_JOB_STATE) } +// Parse a subject line string and return a partially filled SlurmEnvironment struct +// returns an error if parsing fails +func parseSubjectLine(subject string) (*SlurmEnvironment, error) { + rJob, _ := regexp.Compile(`^Slurm Job_id=(?P\d+) Name=(?P.*) (?P\w+), \w+ time .+?(?:, (?P\w+), ExitCode (?P\d))?$`) + asJob, _ := regexp.Compile(`^Slurm Array Summary Job_id=.+ \((\d+)\) Name=(?P.*) (?P\w+)(?:, (?P\w+), \w+ \[(?P.+)\])?$`) + aJob, _ := regexp.Compile(`^Slurm Array Task Job_id=(?P\d+)_(?P\d+) \((?P\d+)\) Name=(?P.*) (?P\w+), \w+ time .+?(?:, (?P\w+), ExitCode (?P\d))?$`) + + env := new(SlurmEnvironment) + var jobId string + var 
jobState string + var mailType string + var jobName string + if strings.Contains(subject, "Slurm Array Summary Job_id=") { + matches := asJob.FindStringSubmatch(subject) + if matches == nil { + return nil, errors.New(("Invalid subject line: " + subject)) + } + jobId = matches[1] + jobName = matches [2] + mailType = matches[3] + jobState = matches[4] + if jobState == "" { + jobState = "PENDING" + } + } else if strings.Contains(subject, "Slurm Array Task Job_id") { + matches := aJob.FindStringSubmatch(subject) + if matches == nil { + return nil, errors.New(("Invalid subject line: " + subject)) + } + env.SLURM_ARRAY_JOB_ID = matches[1] + env.SLURM_ARRAY_TASK_ID = matches[2] + jobId = matches[3] + jobName = matches [4] + mailType = matches[5] + jobState = matches[6] + if (jobState == "") { + jobState = "RUNNING" + } + } else { + matches := rJob.FindStringSubmatch(subject) + if matches == nil { + return nil, errors.New(("Invalid subject line: " + subject)) + } + jobId = matches[1] + jobName = matches [2] + mailType = matches[3] + jobState = matches[4] + if jobState == "" { + jobState = "RUNNING" + } + + } + env.SLURM_JOBID = jobId + env.SLURM_JOB_ID = jobId + env.SLURM_JOB_MAIL_TYPE = mailType + env.SLURM_JOB_STATE = jobState + env.SLURM_JOB_NAME = jobName + return env, nil +} + +func (j *JobContext) UpdateEnvVarsFromSacct(subject string) error { + env, err := parseSubjectLine(subject) + if err != nil { + return err + } + j.SlurmEnvironment = *env + return nil +} + // Get additional job statistics from external source (e.g. 
jobinfo or sacct) -func (j *JobContext) GetJobStats(log *log.Logger, subject string, paths map[string]string) { +func (j *JobContext) GetJobStats(subject string, paths map[string]string, log *log.Logger) error { log.Print("Start retrieving job stats") log.Printf("%#v", j.SlurmEnvironment) + + // SLURM < 21.08.x don't have any SLURM envs set, we need to parse the mail subject line, retrieve the jobid and all other information from sacct + if j.SlurmEnvironment.SLURM_JOBID == "" { + err := j.UpdateEnvVarsFromSacct(subject) + if err != nil { + return err + } + } jobId := j.SlurmEnvironment.SLURM_JOBID if strings.Contains(subject, "Slurm Array Summary Job_id=") { - j.MailSubject = fmt.Sprintf("Job Array Summary %s (%s-%s)", j.SlurmEnvironment.SLURM_ARRAY_JOB_ID, j.SlurmEnvironment.SLURM_ARRAY_TASK_MIN, j.SlurmEnvironment.SLURM_ARRAY_TASK_MAX) - //jobId = fmt.Sprintf("%s_%s", j.SlurmEnvironment.SLURM_ARRAY_JOB_ID, j.SlurmEnvironment.SLURM_ARRAY_TASK_ID) + j.MailSubject = fmt.Sprintf("Job Array Summary %s_*", j.SlurmEnvironment.SLURM_ARRAY_JOB_ID) } else if strings.Contains(subject, "Slurm Array Task Job_id") { - jobId = j.SlurmEnvironment.SLURM_ARRAY_JOB_ID j.MailSubject = fmt.Sprintf("Job Array Task %s", jobId) } else { j.MailSubject = fmt.Sprintf("Job %s", jobId) - } if j.SlurmEnvironment.SLURM_ARRAY_JOB_ID != "" { jobId = j.SlurmEnvironment.SLURM_ARRAY_JOB_ID } log.Printf("Fetch job info %s", jobId) - j.JobStats = *GetSacctMetrics(jobId, log, paths) + jobStats, err := GetSacctMetrics(jobId, paths, log) + if err != nil { + return err + } + j.JobStats = *jobStats counter := 0 for !IsJobFinished(j.JobStats.State) && j.JobStats.State != j.SlurmEnvironment.SLURM_JOB_STATE && counter < 5 { time.Sleep(2 * time.Second) - j.JobStats = *GetSacctMetrics(jobId, log, paths) + jobStats, err = GetSacctMetrics(jobId, paths, log) + if err != nil { + return fmt.Errorf("Failed to Get job stats: %w", err) + } + j.JobStats = *jobStats counter += 1 } if j.JobStats.State == "RUNNING" { @@ 
-153,4 +238,5 @@ func (j *JobContext) GetJobStats(log *log.Logger, subject string, paths map[stri updateJobStatsWithLiveData(&j.JobStats, jobId, log, paths) } log.Printf("Finished retrieving job stats") + return nil } diff --git a/internal/slurmjob/getjobcontext_test.go b/internal/slurmjob/getjobcontext_test.go index dcff22e..6265593 100644 --- a/internal/slurmjob/getjobcontext_test.go +++ b/internal/slurmjob/getjobcontext_test.go @@ -1,81 +1,150 @@ package slurmjob import ( - "testing" + "testing" ) var qosMaxRuntimeMap = map[uint64]string{ - 3600: "RAPID", - 28800: "SHORT", - 172800: "MEDIUM", - 1209600: "LONG", + 3600: "RAPID", + 28800: "SHORT", + 172800: "MEDIUM", + 1209600: "LONG", } func TestCalculateOptimalQos(t *testing.T) { - qos := calculateOptimalQOS(qosMaxRuntimeMap, 600) - if qos != "RAPID" { - t.Errorf("Wrong QOS got: %s, want: RAPID", qos) - } - - qos = calculateOptimalQOS(qosMaxRuntimeMap, 3800) - if qos != "SHORT" { - t.Errorf("Wrong QOS got: %s, want: SHORT", qos) - } - - qos = calculateOptimalQOS(qosMaxRuntimeMap, 29000) - if qos != "MEDIUM" { - t.Errorf("Wrong QOS got: %s, want: MEDIUM", qos) - } - - qos = calculateOptimalQOS(qosMaxRuntimeMap, 175000) - if qos != "LONG" { - t.Errorf("Wrong QOS got: %s, want: LONG", qos) - } + qos := calculateOptimalQOS(qosMaxRuntimeMap, 600) + if qos != "RAPID" { + t.Errorf("Wrong QOS got: %s, want: RAPID", qos) + } + + qos = calculateOptimalQOS(qosMaxRuntimeMap, 3800) + if qos != "SHORT" { + t.Errorf("Wrong QOS got: %s, want: SHORT", qos) + } + + qos = calculateOptimalQOS(qosMaxRuntimeMap, 29000) + if qos != "MEDIUM" { + t.Errorf("Wrong QOS got: %s, want: MEDIUM", qos) + } + + qos = calculateOptimalQOS(qosMaxRuntimeMap, 175000) + if qos != "LONG" { + t.Errorf("Wrong QOS got: %s, want: LONG", qos) + } } func TestNoHintsWhenJobIsNotFinished(t *testing.T) { - jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "RUNNING"} - jobContext := JobContext{SlurmEnvironment: jobEnviron} - 
jobContext.GenerateHints(qosMaxRuntimeMap) - if len(jobContext.Hints) != 0 { - t.Error("Running jobs should have now hints") - } + jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "RUNNING"} + jobContext := JobContext{SlurmEnvironment: jobEnviron} + jobContext.GenerateHints(qosMaxRuntimeMap) + if len(jobContext.Hints) != 0 { + t.Error("Running jobs should have now hints") + } } func TestOOMHints(t *testing.T) { - jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "OUT_OF_MEMORY"} - jobContext := JobContext{SlurmEnvironment: jobEnviron} - jobContext.GenerateHints(qosMaxRuntimeMap) - if len(jobContext.Hints) != 1 { - t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 1) - } + jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "OUT_OF_MEMORY"} + jobContext := JobContext{SlurmEnvironment: jobEnviron} + jobContext.GenerateHints(qosMaxRuntimeMap) + if len(jobContext.Hints) != 1 { + t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 1) + } } func TestTimeOutHints(t *testing.T) { - jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "TIMEOUT"} - jobContext := JobContext{SlurmEnvironment: jobEnviron} - jobContext.GenerateHints(qosMaxRuntimeMap) - if len(jobContext.Hints) != 1 { - t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 1) - } + jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "TIMEOUT"} + jobContext := JobContext{SlurmEnvironment: jobEnviron} + jobContext.GenerateHints(qosMaxRuntimeMap) + if len(jobContext.Hints) != 1 { + t.Errorf("Wrong number of hints. 
got: %d, want %d", len(jobContext.Hints), 1) + } } func TestRegularHints(t *testing.T) { - jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "COMPLETED"} - metrics := SacctMetrics{MaxRSS: 1610612736, ReqMem: 4294967296, Runtime: 11000, Walltime: 29000, CPUTime: 79488, TotalCPU: 30000} - jobContext := JobContext{SlurmEnvironment: jobEnviron, JobStats: metrics} - jobContext.GenerateHints(qosMaxRuntimeMap) - if len(jobContext.Hints) != 3 { - t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 3) - } + jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "COMPLETED"} + metrics := SacctMetrics{MaxRSS: 1610612736, ReqMem: 4294967296, Runtime: 11000, Walltime: 29000, CPUTime: 79488, TotalCPU: 30000} + jobContext := JobContext{SlurmEnvironment: jobEnviron, JobStats: metrics} + jobContext.GenerateHints(qosMaxRuntimeMap) + if len(jobContext.Hints) != 3 { + t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 3) + } } func TestNoHints(t *testing.T) { - jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "COMPLETED"} - metrics := SacctMetrics{MaxRSS: 4294967296, ReqMem: 4294967296, Runtime: 22000, Walltime: 29000} - jobContext := JobContext{SlurmEnvironment: jobEnviron, JobStats: metrics} - jobContext.GenerateHints(qosMaxRuntimeMap) - if len(jobContext.Hints) != 0 { - t.Errorf("Wrong number of hints. got: %d, want %d", len(jobContext.Hints), 0) - } + jobEnviron := SlurmEnvironment{SLURM_JOB_STATE: "COMPLETED"} + metrics := SacctMetrics{MaxRSS: 4294967296, ReqMem: 4294967296, Runtime: 22000, Walltime: 29000} + jobContext := JobContext{SlurmEnvironment: jobEnviron, JobStats: metrics} + jobContext.GenerateHints(qosMaxRuntimeMap) + if len(jobContext.Hints) != 0 { + t.Errorf("Wrong number of hints. 
got: %d, want %d", len(jobContext.Hints), 0) + } +} + +func TestSubjectParsing(t *testing.T) { + // parse regular job BEGIN subject line + subject := "Slurm Job_id=39766384 Name=rMPCD-PS:3.5_0 Began, Queued time 2-00:04:18" + env, err := parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "39766384" || env.SLURM_JOB_MAIL_TYPE != "Began" || env.SLURM_JOB_STATE != "RUNNING" || env.SLURM_JOB_NAME != "rMPCD-PS:3.5_0" { + t.Errorf("jobid/MAIL_TYPE/JOB_STATE/JOB_NAME wrong. Got: %s/%s/%s/%s, want: 39766384/Began/RUNNING/rMPCD-PS:3.5_0", env.SLURM_JOBID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse task summary END job subject line + subject = "Slurm Job_id=39891831 Name=L_R38_3 Ended, Run time 1-11:30:27, COMPLETED, ExitCode 0" + env, err = parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "39891831" || env.SLURM_JOB_MAIL_TYPE != "Ended" || env.SLURM_JOB_STATE != "COMPLETED" || env.SLURM_JOB_NAME != "L_R38_3" { + t.Errorf("jobid/MAIL_TYPE/JOB_STATE/JOB_NAME wrong. Got: %s/%s/%s/%s, want: 39891831/Ended/COMPLETED/L_R38_3", env.SLURM_JOBID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse task summary BEGIN job subject line + subject = "Slurm Array Summary Job_id=39860384_* (39860384) Name=count Began" + env, err = parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "39860384" || env.SLURM_JOB_MAIL_TYPE != "Began" || env.SLURM_JOB_STATE != "PENDING" || env.SLURM_JOB_NAME != "count" { + t.Errorf("jobid/MAIL_TYPE wrong. 
Got: %s/%s/%s/%s, want: 39860384/Began/PENDING/count", env.SLURM_JOBID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse task summary END job subject line + subject = "Slurm Array Summary Job_id=39973135_* (39973135) Name=2022_PLANTEEN_SCHIMPER_01_FC1_analysis.sbatch Ended, Mixed, MaxSignal [9]" + env, err = parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "39973135" || env.SLURM_JOB_MAIL_TYPE != "Ended" || env.SLURM_JOB_STATE != "Mixed" || env.SLURM_JOB_NAME != "2022_PLANTEEN_SCHIMPER_01_FC1_analysis.sbatch" { + t.Errorf("jobid/MAIL_TYPE/JOB_STATE wrong. Got: %s/%s/%s/%s, want: 39973135/Ended/Mixed/2022_PLANTEEN_SCHIMPER_01_FC1_analysis.sbatch", env.SLURM_JOBID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse task array BEGIN job subject line + subject = "Slurm Array Task Job_id=1052478_1 (1052479) Name=wrap Began, Queued time 00:00:01" + env, err = parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "1052479" || env.SLURM_ARRAY_JOB_ID != "1052478" || env.SLURM_JOB_MAIL_TYPE != "Began" || env.SLURM_JOB_STATE != "RUNNING" || env.SLURM_JOB_NAME != "wrap" { + t.Errorf("jobid/jobarrayid/MAIL_TYPE wrong. 
Got: %s/%s/%s/%s/%s, want: 1052479/1052478/Began/RUNNING/wrap", env.SLURM_JOBID, env.SLURM_ARRAY_JOB_ID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse task array END job subject line + subject = "Slurm Array Task Job_id=1052478_1 (1052479) Name=wrap Ended, Run time 00:00:08, COMPLETED, ExitCode 0" + env, err = parseSubjectLine(subject) + if err != nil { + t.Fatalf("Error parsing subject: %s", err) + } + if env.SLURM_JOBID != "1052479" || env.SLURM_ARRAY_JOB_ID != "1052478" || env.SLURM_JOB_MAIL_TYPE != "Ended" || env.SLURM_JOB_STATE != "COMPLETED" || env.SLURM_JOB_NAME != "wrap" { + t.Errorf("jobid/jobarrayid/MAIL_TYPE/JOB_STATE wrong. Got: %s/%s/%s/%s/%s, want: 1052479/1052478/Ended/COMPLETED/wrap", env.SLURM_JOBID, env.SLURM_ARRAY_JOB_ID, env.SLURM_JOB_MAIL_TYPE, env.SLURM_JOB_STATE, env.SLURM_JOB_NAME) + } + + // parse error message when wrong subject line + subject = "slurm job x" + env, err = parseSubjectLine(subject) + if err == nil { + t.Fatalf("No error thrown for wrong subject") + } } diff --git a/internal/slurmjob/sacct.go b/internal/slurmjob/sacct.go index 57ee2e2..45d0e81 100644 --- a/internal/slurmjob/sacct.go +++ b/internal/slurmjob/sacct.go @@ -1,243 +1,265 @@ package slurmjob import ( - "log" - "math" - "os/exec" - "regexp" - "strconv" - "strings" + "fmt" + "log" + "math" + "os/exec" + "regexp" + "strconv" + "strings" ) type SacctMetrics struct { - State string - Ncpus int64 - Nodes int - Submittime string - Starttime string - Endtime string - CPUTimeStr string - CPUTime float64 - TotalCPU float64 - TotalCPUStr string - UserCPU float64 - SystemCPU float64 - ReqMem uint64 - MaxRSS uint64 - Walltime uint64 - WalltimeStr string - Runtime uint64 - RuntimeStr string - MaxDiskWrite uint64 - MaxDiskRead uint64 + JobName string + User string + Account string + Partition string + State string + Ncpus int64 + Nodes int + NodeList string + Submittime string + Starttime string + Endtime string + CPUTimeStr string + CPUTime 
float64 + TotalCPU float64 + TotalCPUStr string + UserCPU float64 + SystemCPU float64 + ReqMem uint64 + MaxRSS uint64 + Walltime uint64 + WalltimeStr string + Runtime uint64 + RuntimeStr string + MaxDiskWrite uint64 + MaxDiskRead uint64 } type SstatMetrics struct { - MaxRSS uint64 - MaxDiskWrite uint64 - MaxDiskRead uint64 + MaxRSS uint64 + MaxDiskWrite uint64 + MaxDiskRead uint64 } func parseTime(input string) (float64, uint64, uint64, uint64) { - reg := `^(((?P\d+)-)?(?P\d\d):)?(?P\d\d):(?P\d\d(\.\d+)?)$` - r := regexp.MustCompile(reg) - matches := r.FindStringSubmatch(input) - var ss float64 - var mm, hh, dd uint64 - if matches != nil { - ss, _ = strconv.ParseFloat(matches[r.SubexpIndex("seconds")], 64) - mm, _ = strconv.ParseUint(matches[r.SubexpIndex("minutes")], 10, 32) - hh, _ = strconv.ParseUint(matches[r.SubexpIndex("hours")], 10, 32) - dd, _ = strconv.ParseUint(matches[r.SubexpIndex("days")], 10, 32) - } - return ss, mm, hh, dd + reg := `^(((?P\d+)-)?(?P\d\d):)?(?P\d\d):(?P\d\d(\.\d+)?)$` + r := regexp.MustCompile(reg) + matches := r.FindStringSubmatch(input) + var ss float64 + var mm, hh, dd uint64 + if matches != nil { + ss, _ = strconv.ParseFloat(matches[r.SubexpIndex("seconds")], 64) + mm, _ = strconv.ParseUint(matches[r.SubexpIndex("minutes")], 10, 32) + hh, _ = strconv.ParseUint(matches[r.SubexpIndex("hours")], 10, 32) + dd, _ = strconv.ParseUint(matches[r.SubexpIndex("days")], 10, 32) + } + return ss, mm, hh, dd } func parseByteSize(input string) uint64 { - if input == "" || input == "16?" { - return 0.0 - } - m := map[string]float64{"K": 10, "M": 20, "G": 30, "T": 40, "P": 50, "E": 60} - var value = 0.0 - var scale = 1.0 - if exp, found := m[input[len(input)-1:]]; found { - scale = math.Pow(2, exp) - value, _ = strconv.ParseFloat(input[:len(input)-1], 64) - } else { - value, _ = strconv.ParseFloat(input, 64) - } - return uint64(value * scale) + if input == "" || input == "16?" 
{ + return 0.0 + } + m := map[string]float64{"K": 10, "M": 20, "G": 30, "T": 40, "P": 50, "E": 60} + var value = 0.0 + var scale = 1.0 + if exp, found := m[input[len(input)-1:]]; found { + scale = math.Pow(2, exp) + value, _ = strconv.ParseFloat(input[:len(input)-1], 64) + } else { + value, _ = strconv.ParseFloat(input, 64) + } + return uint64(value * scale) } func parseCpuTime(input string) float64 { - ss, mm, hh, dd := parseTime(input) - return float64(dd*24*60*60+hh*60*60+mm*60) + ss + ss, mm, hh, dd := parseTime(input) + return float64(dd*24*60*60+hh*60*60+mm*60) + ss } -func ParseSstatMetrics(input []byte) *SstatMetrics { - var metrics SstatMetrics - lines := strings.Split(string(input), "\n") - for _, line := range lines { - if line == "" { - continue - } - split := strings.Split(line, "|") - if split[0] != "" { - maxRSS := parseByteSize(split[1]) - if metrics.MaxRSS < maxRSS { - metrics.MaxRSS = maxRSS - } - } - if split[1] != "" { - maxDiskWrite := parseByteSize(split[2]) - if metrics.MaxDiskWrite < maxDiskWrite { - metrics.MaxDiskWrite = maxDiskWrite - } - } - if split[2] != "" { - maxDiskRead := parseByteSize(split[3]) - if metrics.MaxDiskRead < maxDiskRead { - metrics.MaxDiskRead = maxDiskRead - } - } - - } - return &metrics +func ParseSstatMetrics(input []byte) (*SstatMetrics, error) { + var metrics SstatMetrics + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if line == "" { + continue + } + split := strings.Split(line, "|") + if split[0] != "" { + maxRSS := parseByteSize(split[1]) + if metrics.MaxRSS < maxRSS { + metrics.MaxRSS = maxRSS + } + } + if split[1] != "" { + maxDiskWrite := parseByteSize(split[2]) + if metrics.MaxDiskWrite < maxDiskWrite { + metrics.MaxDiskWrite = maxDiskWrite + } + } + if split[2] != "" { + maxDiskRead := parseByteSize(split[3]) + if metrics.MaxDiskRead < maxDiskRead { + metrics.MaxDiskRead = maxDiskRead + } + } + + } + return &metrics, nil } -func ParseSacctMetrics(input []byte) *SacctMetrics 
{ - var metrics SacctMetrics - if len(input) == 0 { - return &metrics - } - lines := strings.Split(string(input), "\n") - for _, line := range lines { - if line == "" { - continue - } - split := strings.Split(line, "|") - ncpus, _ := strconv.ParseInt(strings.TrimSpace(split[4]), 10, 16) - if metrics.Ncpus < ncpus { - metrics.Ncpus = ncpus - } - if split[16] != "" { - maxRSS := parseByteSize(split[16]) - if metrics.MaxRSS < maxRSS { - metrics.MaxRSS = maxRSS - } - } - if split[17] != "" { - maxDiskWrite := parseByteSize(split[17]) - if metrics.MaxDiskWrite < maxDiskWrite { - metrics.MaxDiskWrite = maxDiskWrite - } - } - if split[18] != "" { - maxDiskRead := parseByteSize(split[18]) - if metrics.MaxDiskRead < maxDiskRead { - metrics.MaxDiskRead = maxDiskRead - } - } - - } - // retrieve information for entire job allocation (NodeList, ReqMem) - allocation := lines[0] - split := strings.Split(allocation, "|") - metrics.State = split[5] - cpuTimeStr := split[11] - cpuTime := parseCpuTime(cpuTimeStr) - if metrics.CPUTime < cpuTime { - metrics.CPUTime = cpuTime - metrics.CPUTimeStr = cpuTimeStr - } - totalCpuTimeStr := split[12] - totalCpuTime := parseCpuTime(totalCpuTimeStr) - if metrics.TotalCPU < totalCpuTime { - metrics.TotalCPU = totalCpuTime - metrics.TotalCPUStr = totalCpuTimeStr - } - userCpuTime := parseCpuTime(split[13]) - if metrics.UserCPU < userCpuTime { - metrics.UserCPU = userCpuTime - } - systemCpuTime := parseCpuTime(split[14]) - if metrics.SystemCPU < systemCpuTime { - metrics.SystemCPU = systemCpuTime - } - metrics.Nodes = len(strings.Split(split[3], ",")) - reqMem := strings.TrimSpace(split[15]) - if strings.HasSuffix(reqMem, "n") { - metrics.ReqMem = uint64(metrics.Nodes) * parseByteSize(reqMem[:len(reqMem)-1]) - - } else if strings.HasSuffix(reqMem, "c") { - metrics.ReqMem = uint64(metrics.Ncpus) * parseByteSize(reqMem[:len(reqMem)-1]) - } else { - metrics.ReqMem = parseByteSize(reqMem) - } - metrics.Submittime = split[6] - metrics.Starttime = 
split[7] - metrics.Endtime = split[8] - metrics.WalltimeStr = split[9] - metrics.Walltime = uint64(parseCpuTime(split[9])) - metrics.RuntimeStr = split[10] - metrics.Runtime = uint64(parseCpuTime(split[10])) - - log.Printf("Metrics: %#v", metrics) - return &metrics +func ParseSacctMetrics(input []byte) (*SacctMetrics, error) { + var metrics SacctMetrics + if len(input) == 0 { + return &metrics, nil + } + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if line == "" { + continue + } + split := strings.Split(line, "|") + ncpus, _ := strconv.ParseInt(strings.TrimSpace(split[5]), 10, 16) + if metrics.Ncpus < ncpus { + metrics.Ncpus = ncpus + } + if split[17] != "" { + maxRSS := parseByteSize(split[17]) + if metrics.MaxRSS < maxRSS { + metrics.MaxRSS = maxRSS + } + } + if split[18] != "" { + maxDiskWrite := parseByteSize(split[18]) + if metrics.MaxDiskWrite < maxDiskWrite { + metrics.MaxDiskWrite = maxDiskWrite + } + } + if split[19] != "" { + maxDiskRead := parseByteSize(split[19]) + if metrics.MaxDiskRead < maxDiskRead { + metrics.MaxDiskRead = maxDiskRead + } + } + + } + // retrieve information for entire job allocation (NodeList, ReqMem) + allocation := lines[0] + split := strings.Split(allocation, "|") + metrics.JobName = split[0] + metrics.User = split[1] + metrics.Account = split[2] + metrics.Partition = split[3] + metrics.NodeList = split[4] + metrics.State = split[6] + cpuTimeStr := split[12] + cpuTime := parseCpuTime(cpuTimeStr) + if metrics.CPUTime < cpuTime { + metrics.CPUTime = cpuTime + metrics.CPUTimeStr = cpuTimeStr + } + totalCpuTimeStr := split[13] + totalCpuTime := parseCpuTime(totalCpuTimeStr) + if metrics.TotalCPU < totalCpuTime { + metrics.TotalCPU = totalCpuTime + metrics.TotalCPUStr = totalCpuTimeStr + } + userCpuTime := parseCpuTime(split[14]) + if metrics.UserCPU < userCpuTime { + metrics.UserCPU = userCpuTime + } + systemCpuTime := parseCpuTime(split[15]) + if metrics.SystemCPU < systemCpuTime { + metrics.SystemCPU 
= systemCpuTime + } + metrics.Nodes = len(strings.Split(split[4], ",")) + reqMem := strings.TrimSpace(split[16]) + if strings.HasSuffix(reqMem, "n") { + metrics.ReqMem = uint64(metrics.Nodes) * parseByteSize(reqMem[:len(reqMem)-1]) + + } else if strings.HasSuffix(reqMem, "c") { + metrics.ReqMem = uint64(metrics.Ncpus) * parseByteSize(reqMem[:len(reqMem)-1]) + } else { + metrics.ReqMem = parseByteSize(reqMem) + } + metrics.Submittime = split[7] + metrics.Starttime = split[8] + metrics.Endtime = split[9] + metrics.WalltimeStr = split[10] + metrics.Walltime = uint64(parseCpuTime(split[10])) + metrics.RuntimeStr = split[11] + metrics.Runtime = uint64(parseCpuTime(split[11])) + + log.Printf("Metrics: %#v", metrics) + return &metrics, nil } func (m SacctMetrics) CalcUserComputePercentage() float64 { - if m.TotalCPU != 0 { - return (float64(m.UserCPU) / float64(m.TotalCPU)) * 100 - } - return 0.0 + if m.TotalCPU != 0 { + return (float64(m.UserCPU) / float64(m.TotalCPU)) * 100 + } + return 0.0 } func (m SacctMetrics) CalcSystemComputePercentage() float64 { - if m.TotalCPU != 0 { - return (float64(m.SystemCPU) / float64(m.TotalCPU)) * 100 - } - return 0.0 + if m.TotalCPU != 0 { + return (float64(m.SystemCPU) / float64(m.TotalCPU)) * 100 + } + return 0.0 } -func GetSacctMetrics(jobId string, log *log.Logger, paths map[string]string) *SacctMetrics { - return ParseSacctMetrics(GetSacctData(jobId, log, paths)) +func GetSacctMetrics(jobId string, paths map[string]string, log *log.Logger) (*SacctMetrics, error) { + sacctMetrics, err := GetSacctData(jobId, paths, log) + if err != nil { + return nil, err + } + return ParseSacctMetrics(sacctMetrics) } -func GetSstatMetrics(jobId string, log *log.Logger, paths map[string]string) *SstatMetrics { - return ParseSstatMetrics(GetSstatData(jobId, log, paths)) +func GetSstatMetrics(jobId string, paths map[string]string, log *log.Logger) (*SstatMetrics, error) { + sstatMetrics, err := GetSacctData(jobId, paths, log) + if err != nil { + 
return nil, err + } + return ParseSstatMetrics(sstatMetrics) } func updateJobStatsWithLiveData(metrics *SacctMetrics, jobId string, log *log.Logger, paths map[string]string) { - liveMetrics := GetSstatMetrics(jobId, log, paths) - if liveMetrics.MaxRSS > 0 { - metrics.MaxRSS = liveMetrics.MaxRSS - } - if liveMetrics.MaxDiskWrite > 0 { - metrics.MaxDiskWrite = liveMetrics.MaxDiskWrite - } - if liveMetrics.MaxDiskRead > 0 { - metrics.MaxDiskRead = liveMetrics.MaxDiskRead - } + liveMetrics, err := GetSstatMetrics(jobId, paths, log) + if err == nil { + + if liveMetrics.MaxRSS > 0 { + metrics.MaxRSS = liveMetrics.MaxRSS + } + if liveMetrics.MaxDiskWrite > 0 { + metrics.MaxDiskWrite = liveMetrics.MaxDiskWrite + } + if liveMetrics.MaxDiskRead > 0 { + metrics.MaxDiskRead = liveMetrics.MaxDiskRead + } + } } // Execute the saccct command and return its output -func GetSacctData(jobId string, log *log.Logger, paths map[string]string) []byte { - formatLine := "JobName,User,Partition,NodeList,ncpus,State,Submit,start,end,timelimit,elapsed,CPUTime,TotalCPU,UserCPU,SystemCPU,ReqMem,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode,Comment" - cmd := exec.Command(paths["sacct"], "-j", jobId, "-n", "-p", "--format", formatLine) - output, err := cmd.CombinedOutput() - if err != nil { - log.Fatal(output) - } - return output +func GetSacctData(jobId string, paths map[string]string, log *log.Logger) ([]byte, error) { + formatLine := "JobName,User,Account,Partition,NodeList,ncpus,State,Submit,start,end,timelimit,elapsed,CPUTime,TotalCPU,UserCPU,SystemCPU,ReqMem,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode,Comment" + cmd := exec.Command(paths["sacct"], "-j", jobId, "-n", "-p", "--format", formatLine) + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("Failed to execute sacct command: %w", err) + } + return output, nil } -func GetSstatData(jobId string, log *log.Logger, paths map[string]string) []byte { - 
formatLine := "JobID,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode" - cmd := exec.Command(paths["sstat"], "-a", "-j", jobId, "-n", "-p", "--format", formatLine) - output, err := cmd.CombinedOutput() - if err != nil { - log.Fatal(output) - } - return output +func GetSstatData(jobId string, paths map[string]string, log *log.Logger) ([]byte, error) { + formatLine := "JobID,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode" + cmd := exec.Command(paths["sstat"], "-a", "-j", jobId, "-n", "-p", "--format", formatLine) + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("Failed to execute sstat command: %w", err) + } + return output, nil } diff --git a/internal/slurmjob/sacct_test.go b/internal/slurmjob/sacct_test.go index ffaebdf..5cdd5ce 100644 --- a/internal/slurmjob/sacct_test.go +++ b/internal/slurmjob/sacct_test.go @@ -1,114 +1,130 @@ package slurmjob import ( - "io/ioutil" - "os" - "testing" + "io/ioutil" + "os" + "testing" ) func TestParseSacctMetrics(t *testing.T) { - // Read the input data from a file - file, err := os.Open("../../test_data/sacct.txt") - if err != nil { - t.Fatalf("Can not open test data: %v", err) - } - data, _ := ioutil.ReadAll(file) - metrics := ParseSacctMetrics(data) - t.Logf("%+v", metrics) + // Read the input data from a file + file, err := os.Open("../../test_data/sacct.txt") + if err != nil { + t.Fatalf("Can not open test data: %v", err) + } + data, _ := ioutil.ReadAll(file) + metrics, _ := ParseSacctMetrics(data) + t.Logf("%+v", metrics) - if metrics.State != "COMPLETED" { - t.Errorf("State is incorrect. got: %s, want: %s", metrics.State, "COMPLETED") - } - if metrics.Nodes != 1 { - t.Errorf("Nodes is incorrect. got: %d, want: %d", metrics.Nodes, 1) - } - if metrics.Ncpus != 4 { - t.Errorf("Ncpus is incorrect. got: %d, want: %d", metrics.Ncpus, 4) - } - if metrics.Submittime != "2022-02-16T20:40:15" { - t.Errorf("Submittime is incorrect. 
got: %s, want: %s", metrics.Submittime, "2022-02-16T20:40:15") - } - if metrics.Starttime != "2022-02-16T20:40:15" { - t.Errorf("Starttime is incorrect. got: %s, want: %s", metrics.Starttime, "2022-02-16T20:40:15") - } - if metrics.Endtime != "2022-02-17T01:11:04" { - t.Errorf("Endtime is incorrect. got: %s, want: %s", metrics.Endtime, "2022-02-17T01:11:04") - } - if metrics.CPUTimeStr != "18:03:16" { - t.Errorf("CPUTimeStr is incorrect. got: %s, want: %s", metrics.CPUTimeStr, "18:03:16") - } - if metrics.CPUTime != 64996 { - t.Errorf("CPUTime is incorrect. got: %f, want: %f", metrics.CPUTime, 64996.0) - } - if metrics.TotalCPUStr != "01:57.511" { - t.Errorf("TotalCPUStr is incorrect. got: %s, want: %s", metrics.TotalCPUStr, "01:57.511") - } - if metrics.TotalCPU != 117.511 { - t.Errorf("TotalCPU is incorrect. got: %f, want: %f", metrics.TotalCPU, 117.511) - } - if metrics.UserCPU != 102.011 { - t.Errorf("UserCPU is incorrect. got: %f, want: %f", metrics.UserCPU, 102.011) - } - if metrics.SystemCPU != 15.5 { - t.Errorf("SystemCPU is incorrect. got: %f, want: %f", metrics.SystemCPU, 15.5) - } - if metrics.ReqMem != 34359738368 { - t.Errorf("ReqMem is incorrect. got: %d, want: %d", metrics.ReqMem, 34359738368) - } - if metrics.MaxRSS != 1133199360 { - t.Errorf("MaxRSS is incorrect. got: %d, want: %d", metrics.MaxRSS, 1133199360) - } - if metrics.WalltimeStr != "08:00:00" { - t.Errorf("WalltimeStr is incorrect. got: %s, want: %s", metrics.WalltimeStr, "08:00:00") - } - if metrics.Walltime != 28800 { - t.Errorf("Walltime is incorrect. got: %d, want: %d", metrics.Walltime, 28800) - } - if metrics.RuntimeStr != "04:30:49" { - t.Errorf("RuntimeStr is incorrect. got: %s, want: %s", metrics.RuntimeStr, "04:30:49") - } - if metrics.Runtime != 16249 { - t.Errorf("Runtime is incorrect. got: %d, want: %d", metrics.Runtime, 16249) - } - if metrics.MaxRSS != 1133199360 { - t.Errorf("MaxRSS is incorrect. 
got: %d, want: %d", metrics.MaxRSS, 1133199360) - } - if metrics.MaxDiskWrite != 10485 { - t.Errorf("MaxDiskWrite is incorrect. got: %d, want: %d", metrics.MaxDiskWrite, 10485) - } - if metrics.MaxDiskRead != 136314 { - t.Errorf("MaxDiskRead is incorrect. got: %d, want: %d", metrics.MaxDiskRead, 136314) - } + if metrics.JobName != "JobName" { + t.Errorf("JobName is incorrect. got: %s, want: %s", metrics.JobName, "JobName") + } + + if metrics.User != "username" { + t.Errorf("User is incorrect. got: %s, want: %s", metrics.User, "username") + } + if metrics.Account != "account" { + t.Errorf("Account is incorrect. got: %s, want: %s", metrics.Account, "account") + } + if metrics.Partition != "c" { + t.Errorf("Partition is incorrect. got: %s, want: %s", metrics.Partition, "c") + } + if metrics.NodeList != "clip-c2-10" { + t.Errorf("Partition is incorrect. got: %s, want: %s", metrics.NodeList, "clip-c2-10") + } + if metrics.State != "COMPLETED" { + t.Errorf("State is incorrect. got: %s, want: %s", metrics.State, "COMPLETED") + } + if metrics.Nodes != 1 { + t.Errorf("Nodes is incorrect. got: %d, want: %d", metrics.Nodes, 1) + } + if metrics.Ncpus != 4 { + t.Errorf("Ncpus is incorrect. got: %d, want: %d", metrics.Ncpus, 4) + } + if metrics.Submittime != "2022-02-16T20:40:15" { + t.Errorf("Submittime is incorrect. got: %s, want: %s", metrics.Submittime, "2022-02-16T20:40:15") + } + if metrics.Starttime != "2022-02-16T20:40:15" { + t.Errorf("Starttime is incorrect. got: %s, want: %s", metrics.Starttime, "2022-02-16T20:40:15") + } + if metrics.Endtime != "2022-02-17T01:11:04" { + t.Errorf("Endtime is incorrect. got: %s, want: %s", metrics.Endtime, "2022-02-17T01:11:04") + } + if metrics.CPUTimeStr != "18:03:16" { + t.Errorf("CPUTimeStr is incorrect. got: %s, want: %s", metrics.CPUTimeStr, "18:03:16") + } + if metrics.CPUTime != 64996 { + t.Errorf("CPUTime is incorrect. 
got: %f, want: %f", metrics.CPUTime, 64996.0) + } + if metrics.TotalCPUStr != "01:57.511" { + t.Errorf("TotalCPUStr is incorrect. got: %s, want: %s", metrics.TotalCPUStr, "01:57.511") + } + if metrics.TotalCPU != 117.511 { + t.Errorf("TotalCPU is incorrect. got: %f, want: %f", metrics.TotalCPU, 117.511) + } + if metrics.UserCPU != 102.011 { + t.Errorf("UserCPU is incorrect. got: %f, want: %f", metrics.UserCPU, 102.011) + } + if metrics.SystemCPU != 15.5 { + t.Errorf("SystemCPU is incorrect. got: %f, want: %f", metrics.SystemCPU, 15.5) + } + if metrics.ReqMem != 34359738368 { + t.Errorf("ReqMem is incorrect. got: %d, want: %d", metrics.ReqMem, 34359738368) + } + if metrics.MaxRSS != 1133199360 { + t.Errorf("MaxRSS is incorrect. got: %d, want: %d", metrics.MaxRSS, 1133199360) + } + if metrics.WalltimeStr != "08:00:00" { + t.Errorf("WalltimeStr is incorrect. got: %s, want: %s", metrics.WalltimeStr, "08:00:00") + } + if metrics.Walltime != 28800 { + t.Errorf("Walltime is incorrect. got: %d, want: %d", metrics.Walltime, 28800) + } + if metrics.RuntimeStr != "04:30:49" { + t.Errorf("RuntimeStr is incorrect. got: %s, want: %s", metrics.RuntimeStr, "04:30:49") + } + if metrics.Runtime != 16249 { + t.Errorf("Runtime is incorrect. got: %d, want: %d", metrics.Runtime, 16249) + } + if metrics.MaxRSS != 1133199360 { + t.Errorf("MaxRSS is incorrect. got: %d, want: %d", metrics.MaxRSS, 1133199360) + } + if metrics.MaxDiskWrite != 10485 { + t.Errorf("MaxDiskWrite is incorrect. got: %d, want: %d", metrics.MaxDiskWrite, 10485) + } + if metrics.MaxDiskRead != 136314 { + t.Errorf("MaxDiskRead is incorrect. 
got: %d, want: %d", metrics.MaxDiskRead, 136314) + } } func TestParseSstatMetrics(t *testing.T) { - // Read the input data from a file - file, err := os.Open("../../test_data/sstat.txt") - if err != nil { - t.Fatalf("Can not open test data: %v", err) - } - data, _ := ioutil.ReadAll(file) - metrics := ParseSstatMetrics(data) - t.Logf("%+v", metrics) - if metrics.MaxRSS != 1850245120 { - t.Errorf("MaxRSS is incorrect. got: %d, want: %d", metrics.MaxRSS, 1850245120) - } - if metrics.MaxDiskWrite != 70 { - t.Errorf("MaxDiskWrite is incorrect. got: %d, want: %d", metrics.MaxDiskWrite, 70) - } - if metrics.MaxDiskRead != 205384 { - t.Errorf("MaxDiskRead is incorrect. got: %d, want: %d", metrics.MaxDiskRead, 205384) - } + // Read the input data from a file + file, err := os.Open("../../test_data/sstat.txt") + if err != nil { + t.Fatalf("Can not open test data: %v", err) + } + data, _ := ioutil.ReadAll(file) + metrics, _ := ParseSstatMetrics(data) + t.Logf("%+v", metrics) + if metrics.MaxRSS != 1850245120 { + t.Errorf("MaxRSS is incorrect. got: %d, want: %d", metrics.MaxRSS, 1850245120) + } + if metrics.MaxDiskWrite != 70 { + t.Errorf("MaxDiskWrite is incorrect. got: %d, want: %d", metrics.MaxDiskWrite, 70) + } + if metrics.MaxDiskRead != 205384 { + t.Errorf("MaxDiskRead is incorrect. 
got: %d, want: %d", metrics.MaxDiskRead, 205384) + } } func TestParseSacctMetricsEmptyInput(t *testing.T) { - // Read the input data from a file - metrics := ParseSacctMetrics([]byte("")) - var emptyMetrics SacctMetrics - t.Logf("%+v", metrics) + // Read the input data from a file + metrics, _ := ParseSacctMetrics([]byte("")) + var emptyMetrics SacctMetrics + t.Logf("%+v", metrics) - if *metrics != emptyMetrics { - t.Error("Empty input should return empty metrics") - } + if *metrics != emptyMetrics { + t.Error("Empty input should return empty metrics") + } } diff --git a/test_data/sacct.txt b/test_data/sacct.txt index 4d66e61..ab21699 100644 --- a/test_data/sacct.txt +++ b/test_data/sacct.txt @@ -1,3 +1,3 @@ -JobName|username|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| -batch|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| -extern|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| +JobName|username|account|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| +batch||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| +extern||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| diff --git a/test_e2e/cases/test_00/sacct/sacct.txt b/test_e2e/cases/test_00/sacct/sacct.txt index 4d66e61..ab21699 100644 
--- a/test_e2e/cases/test_00/sacct/sacct.txt +++ b/test_e2e/cases/test_00/sacct/sacct.txt @@ -1,3 +1,3 @@ -JobName|username|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| -batch|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| -extern|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| +JobName|username|account|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| +batch||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| +extern||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| diff --git a/test_e2e/cases/test_01/conf/gobler.conf b/test_e2e/cases/test_01/conf/gobler.conf index 876fe78..f54f8bf 100755 --- a/test_e2e/cases/test_01/conf/gobler.conf +++ b/test_e2e/cases/test_01/conf/gobler.conf @@ -1,5 +1,5 @@ { - "logfile": "", + "logfile": "/tmp/gobler_test05.log", "defaultconnector": "msteams", "connectors": { "msteams": { diff --git a/test_e2e/cases/test_01/sacct/sacct.txt b/test_e2e/cases/test_01/sacct/sacct.txt index 4d66e61..ab21699 100644 --- a/test_e2e/cases/test_01/sacct/sacct.txt +++ b/test_e2e/cases/test_01/sacct/sacct.txt @@ -1,3 +1,3 @@ 
-JobName|username|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| -batch|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| -extern|||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| +JobName|username|account|c|clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04|08:00:00|04:30:49|18:03:16|01:57.511|01:42.011|00:15.500|32Gn|||||||| +batch||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|01:57.510|01:42.011|00:15.498|32Gn|1106640K|0.01M|0.13M|clip-c2-10|clip-c2-10|clip-c2-10|| +extern||account||clip-c2-10|4|COMPLETED|2022-02-16T20:40:15|2022-02-16T20:40:15|2022-02-17T01:11:04||04:30:49|18:03:16|00:00.001|00:00:00|00:00.001|32Gn|0|0|0.00M|clip-c2-10|clip-c2-10|clip-c2-10|| diff --git a/test_e2e/cases/test_01/test.yaml b/test_e2e/cases/test_01/test.yaml index 3bddedb..27c3555 100644 --- a/test_e2e/cases/test_01/test.yaml +++ b/test_e2e/cases/test_01/test.yaml @@ -31,49 +31,65 @@ pipeline: checkError: true env: GOSLMAILER_CONF: /tmp/goslmailer.conf - commands: + commands: - source $twd/slurm_env/slurmenv.sh - - goslmailer -s "slurm job x" pja + - goslmailer -s "Slurm Job_id=39766384 Name=job Began, Queued time 2-00:04:18" pja test_assert_goslmailer: action: validator:assert - expect: + expect: - '/Deposit gob OK!/' - actual: + actual: - $run_goslmailer.Output + clear_gobler_log: + action: exec:run + checkError: false + commands: + - truncate -s0 /tmp/gobler_test05.log + run_gobler: action: process:start watch: true immuneToHangups: true - command: gobler -c /tmp/gobler.conf - timeoutMs: 5000 + command: 
gobler + arguments: + - -c + - /tmp/gobler.conf run_sleep: action: exec:run checkError: true - commands: + commands: - sleep 5 stop_gobler: action: process:stop pid: $run_gobler.Pid - # debug_gobler: - # action: workflow:print - # message: $AsJSON($run_gobler) + + # debug_gobler: + # action: workflow:print + # message: $AsJSON($run_gobler) + + read_gobler_log: + action: exec:run + checkError: true + commands: + - cat /tmp/gobler_test05.log + # https://github.com/viant/assertly#validation test_assert_gobler: action: validator:assert - expect: + expect: - '~/Send successful to file: rendered-1051492-pja-/' - '~/SENDER msteams#\d: Gob deleted/' - actual: - - $run_gobler.Stdout - - $run_gobler.Stdout + actual: + - $read_gobler_log.Cmd[0].Stdout + - $read_gobler_log.Cmd[0].Stdout - # todo: + # todo: # add test: # jq . rendered.json >/dev/null || echo FAILED diff --git a/test_e2e/cases/test_02/test.yaml b/test_e2e/cases/test_02/test.yaml index 31748c8..688502c 100644 --- a/test_e2e/cases/test_02/test.yaml +++ b/test_e2e/cases/test_02/test.yaml @@ -29,43 +29,16 @@ pipeline: run_goslmailer: action: exec:run - checkError: true + checkError: false env: GOSLMAILER_CONF: /tmp/goslmailer.conf - commands: + commands: - source $twd/slurm_env/slurmenv.sh - - goslmailer -s "slurm job x" pja + - goslmailer -s "Slurm Job_id=39766384 Name=job Began, Queued time 2-00:04:18" pja test_assert_goslmailer: action: validator:assert - expect: - - '/Deposit gob OK!/' - actual: + expect: + - '/Unable to retrieve job stats. 
Error: Failed to execute sacct command: exit status 1/' + actual: - $run_goslmailer.Output - - run_gobler: - action: process:start - watch: true - immuneToHangups: true - command: gobler -c /tmp/gobler.conf - timeoutMs: 5000 - - run_sleep: - action: exec:run - checkError: true - commands: - - sleep 5 - - stop_gobler: - action: process:stop - pid: $run_gobler.Pid - - # https://github.com/viant/assertly#validation - test_assert_gobler: - action: validator:assert - expect: - - '~/Send successful to file: rendered-\d*-pja-/' - - '~/SENDER msteams#\d: Gob deleted/' - actual: - - $run_gobler.Stdout - - $run_gobler.Stdout diff --git a/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json b/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json index 8f35305..2a8f784 100644 --- a/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json +++ b/test_e2e/cases/test_03/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json @@ -62,7 +62,7 @@ { "type":"Fact", "title":"Job name", - "value":"endlyJobFail" + "value":"endlyJobStart" }, { "type":"Fact", diff --git a/test_e2e/cases/test_03/sacct/sacct.txt b/test_e2e/cases/test_03/sacct/sacct.txt index 1139010..d79ebad 100644 --- a/test_e2e/cases/test_03/sacct/sacct.txt +++ b/test_e2e/cases/test_03/sacct/sacct.txt @@ -1,3 +1,3 @@ -endlyJobFail|petar.jager|c|stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown|08:00:00|00:00:14|00:00:14|00:00:00|00:00:00|00:00:00|4G|||||||| -batch|||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| -extern|||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| 
+endlyJobStart|petar.jager|account|c|stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown|08:00:00|00:00:14|00:00:14|00:00:00|00:00:00|00:00:00|4G|||||||| +batch||account||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| +extern||account||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| diff --git a/test_e2e/cases/test_03/slurm_env/slurmenv.sh b/test_e2e/cases/test_03/slurm_env/slurmenv.sh index 155853d..a98bba8 100755 --- a/test_e2e/cases/test_03/slurm_env/slurmenv.sh +++ b/test_e2e/cases/test_03/slurm_env/slurmenv.sh @@ -18,7 +18,7 @@ export SLURM_JOB_GROUP="is.grp" export SLURM_JOBID="1052477" export SLURM_JOB_ID="1052477" export SLURM_JOB_MAIL_TYPE="Began" -export SLURM_JOB_NAME="endlyJobFail" +export SLURM_JOB_NAME="endlyJobStart" export SLURM_JOB_NODELIST="stg-c2-0" export SLURM_JOB_PARTITION="c" export SLURM_JOB_QUEUED_TIME="00:00:00" diff --git a/test_e2e/cases/test_03/test.yaml b/test_e2e/cases/test_03/test.yaml index 8b2dee0..5800247 100644 --- a/test_e2e/cases/test_03/test.yaml +++ b/test_e2e/cases/test_03/test.yaml @@ -32,9 +32,9 @@ pipeline: checkError: true env: GOSLMAILER_CONF: /tmp/goslmailer.conf - commands: + commands: - command: source $twd/slurm_env/slurmenv.sh - - command: goslmailer -s "Slurm Job_id=1052477 Name=endlyJobFail Failed, Run time 00:00:30, FAILED, ExitCode 1" petar.jager@imba.oeaw.ac.at + - command: goslmailer -s "Slurm Job_id=1052477 Name=endlyJobStart Began, Queued time 2-00:04:18" petar.jager@imba.oeaw.ac.at extract: - key: rfile regExpr: 'Send successful to file: (rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json)' @@ -53,13 +53,13 @@ pipeline: test_assert_goslmailer: action: validator:assert - expect: + expect: - '~/Send successful to file: rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json/' - actual: + actual: - $run_goslmailer.Output - # todo: + # todo: # add 
test: # jq . rendered.json >/dev/null || echo FAILED diff --git a/test_e2e/cases/test_04/sacct/sacct.txt b/test_e2e/cases/test_04/sacct/sacct.txt index 8605721..a1fa0cf 100644 --- a/test_e2e/cases/test_04/sacct/sacct.txt +++ b/test_e2e/cases/test_04/sacct/sacct.txt @@ -1,3 +1,3 @@ -endlyJobFail|petar.jager|c|stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37|08:00:00|00:00:30|00:00:30|00:00.007|00:00.002|00:00.005|4G|||||||| -batch|||stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.006|00:00.002|00:00.003||344K|0.00M|0.01M|stg-c2-0|stg-c2-0|stg-c2-0|| -extern|||stg-c2-0|1|COMPLETED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.001|00:00:00|00:00.001||0|0|0.00M|stg-c2-0|stg-c2-0|stg-c2-0|| +endlyJobFail|petar.jager|account|c|stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37|08:00:00|00:00:30|00:00:30|00:00.007|00:00.002|00:00.005|4G|||||||| +batch||account||stg-c2-0|1|FAILED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.006|00:00.002|00:00.003||344K|0.00M|0.01M|stg-c2-0|stg-c2-0|stg-c2-0|| +extern||account||stg-c2-0|1|COMPLETED|2022-05-24T07:43:07|2022-05-24T07:43:07|2022-05-24T07:43:37||00:00:30|00:00:30|00:00.001|00:00:00|00:00.001||0|0|0.00M|stg-c2-0|stg-c2-0|stg-c2-0|| diff --git a/test_e2e/cases/test_05/README.md b/test_e2e/cases/test_05/README.md new file mode 100644 index 0000000..b8b1297 --- /dev/null +++ b/test_e2e/cases/test_05/README.md @@ -0,0 +1,6 @@ +## test_05 +--- + +Test goslmailer on SLURM versions (<21.8.x) that don't set the job information in as env variables + +--- diff --git a/test_e2e/cases/test_05/conf/adaptive_card_template.json b/test_e2e/cases/test_05/conf/adaptive_card_template.json new file mode 100644 index 0000000..a7cf9e3 --- /dev/null +++ b/test_e2e/cases/test_05/conf/adaptive_card_template.json @@ -0,0 +1,237 @@ +{ + "type":"message", + 
"attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"{{ .Job.MailSubject }} {{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}", + "wrap":true, + "size":"Large", + {{ if or (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "TIMEOUT") ((eq .Job.SlurmEnvironment.SLURM_JOB_STATE "OUT_OF_MEMORY")) }}"color":"Attention"{{ else }}"color":"Good"{{ end }} + }, + { + "type":"TextBlock", + "spacing":"none", + "text":"Created {{ .Created }}", + + "isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + {{ if ne .Job.PrunedMessageCount 0 }} + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"WARNING: Rate limiting triggered. 
{{ .Job.PrunedMessageCount }} additional notifications have been suppressed", + "style":"heading", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_NAME }}" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_ID }}" + }, + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.JobStats.User }}" + }, + { + "type":"Fact", + "title":"Partition", + "value":"{{ .Job.JobStats.Partition }}" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"{{ .Job.JobStats.NodeList }}" + }, + { + "type":"Fact", + "title":"Cores", + "value":"{{ .Job.JobStats.Ncpus }}" + }, + { + "type":"Fact", + "title":"Job state", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"Exit Code", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Submit", + "value":"{{ .Job.JobStats.Submittime }}" + }, + { + "type":"Fact", + "title":"Start", + "value":"{{ .Job.JobStats.Starttime }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING"}} + { + "type":"Fact", + "title":"End", + "value":"{{ .Job.JobStats.Endtime }}" + }, + {{ end }} + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"{{ .Job.JobStats.WalltimeStr }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Used Walltime", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }} + { + "type":"Fact", + "title":"Used CPU time", + "value":"{{ .Job.JobStats.TotalCPUStr }}" + }, + { + "type":"Fact", + "title":"% User (Computation)", + "value":'{{ printf "%5.2f%%" 
.Job.JobStats.CalcUserComputePercentage }}' + }, + { + "type":"Fact", + "title":"% System (I/O)", + "value":'{{ printf "%5.2f%%" .Job.JobStats.CalcSystemComputePercentage }}' + }, + {{ end }} + {{ end }} + { + "type":"Fact", + "title":"Memory Requested", + "value":"{{ .Job.JobStats.ReqMem | humanBytes }}" + }, + {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }} + { + "type":"Fact", + "title":"Max Memory Used", + "value":"{{ .Job.JobStats.MaxRSS | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Write", + "value":"{{ .Job.JobStats.MaxDiskWrite | humanBytes }}" + }, + { + "type":"Fact", + "title":"Max Disk Read", + "value":"{{ .Job.JobStats.MaxDiskRead | humanBytes }}" + } + {{ end }} + ] + }, + {{ range .Job.Hints }} + { + "type":"TextBlock", + "text":"{{ . }}", + "wrap":true, + "color":"Attention" + }, + {{ end }} + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"{{ .Job.JobStats.User }}" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"{{ .Job.JobStats.User }}", + "mentioned":{ + "id":"{{ .UserID }}", + "name":"{{ .Job.JobStats.User }}" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_05/conf/gobler.conf b/test_e2e/cases/test_05/conf/gobler.conf new file mode 100644 index 0000000..876fe78 --- /dev/null +++ b/test_e2e/cases/test_05/conf/gobler.conf @@ -0,0 +1,19 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "connectors": { + "msteams": { + "name": "dev channel", + "renderToFile": "yes", + "spoolDir": "/tmp", + "adaptiveCardTemplate": 
"/tmp/adaptive_card_template.json", + "url": "http://localhost:9999/", + "useLookup": "no", + "monitorT": "10000ms", + "pickerT": "1000ms", + "psBufLen": "3", + "numSenders": "3", + "maxMsgPU": "6" + } + } +} diff --git a/test_e2e/cases/test_05/conf/goslmailer.conf b/test_e2e/cases/test_05/conf/goslmailer.conf new file mode 100644 index 0000000..7315c4e --- /dev/null +++ b/test_e2e/cases/test_05/conf/goslmailer.conf @@ -0,0 +1,22 @@ +{ + "logfile": "", + "defaultconnector": "msteams", + "binpaths": { + "sacct": "/tmp/sacct", + "sstat": "/tmp/sstat" + }, + "connectors": { + "msteams": { + "renderToFile": "yes", + "adaptiveCardTemplate": "/tmp/adaptive_card_template.json", + "spoolDir": "/tmp", + "useLookup": "no" + } + }, + "qosmap": { + "3600": "RAPID", + "28800": "SHORT", + "172800": "MEDIUM", + "1209600": "LONG" + } +} diff --git a/test_e2e/cases/test_05/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json b/test_e2e/cases/test_05/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json new file mode 100644 index 0000000..2a8f784 --- /dev/null +++ b/test_e2e/cases/test_05/results/rendered-1052477-petar.jager@imba.oeaw.ac.at-1653378962712164702.json @@ -0,0 +1,168 @@ +{ + "type":"message", + "attachments":[ + { + "contentType":"application/vnd.microsoft.card.adaptive", + "content":{ + "type":"AdaptiveCard", + "body":[ + { + "type":"TextBlock", + "size":"medium", + "weight":"bolder", + "text":"CBE Slurm job info", + "style":"heading" + }, + { + "type":"ColumnSet", + "columns":[ + { + "type":"Column", + "items":[ + { + "type":"Image", + "style":"person", + "url":"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/590px-Slurm_logo.svg.png", + "size":"small" + } + ], + "width":"auto" + }, + { + "type":"Column", + "items":[ + { + "type":"TextBlock", + "weight":"bolder", + "text":"Job 1052477 Began", + "wrap":true, + "size":"Large", + "color":"Good" + }, + { + "type":"TextBlock", + "spacing":"none", + + 
"isSubtle":true, + "wrap":true + } + ], + "width":"stretch" + } + ] + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"true", + "id":"ExternalData", + "facts":[ + { + "type":"Fact", + "title":"Job name", + "value":"endlyJobStart" + }, + { + "type":"Fact", + "title":"Job ID", + "value":"1052477" + }, + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"Partition", + "value":"c" + }, + { + "type":"Fact", + "title":"Compute Nodes Used", + "value":"stg-c2-0" + }, + { + "type":"Fact", + "title":"Cores", + "value":"1" + }, + { + "type":"Fact", + "title":"Job state", + "value":"RUNNING" + }, + + { + "type":"Fact", + "title":"Submit", + "value":"2022-05-24T07:43:07" + }, + { + "type":"Fact", + "title":"Start", + "value":"2022-05-24T07:43:07" + }, + + { + "type":"Fact", + "title":"Reserved Walltime", + "value":"08:00:00" + }, + + { + "type":"Fact", + "title":"Memory Requested", + "value":"4.3 GB" + }, + + ] + }, + + { + "type":"FactSet", + "separator":true, + "spacing":"large", + "isVisible":"false", + "id":"InternalData", + "facts":[ + { + "type":"Fact", + "title":"User", + "value":"petar.jager" + }, + { + "type":"Fact", + "title":"JobStatus", + "value":"Began" + } + ] + } + ], + "actions":[ + { + "type":"Action.OpenUrl", + "title":"View Google", + "url":"https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + ], + "$schema":"http://adaptivecards.io/schemas/adaptive-card.json", + "version":"1.2", + "msteams":{ + "entities":[ + { + "type":"mention", + "text":"petar.jager", + "mentioned":{ + "id":"petar.jager@imba.oeaw.ac.at", + "name":"petar.jager" + } + } + ] + } + } + } + ] +} diff --git a/test_e2e/cases/test_05/sacct/sacct b/test_e2e/cases/test_05/sacct/sacct new file mode 100644 index 0000000..1e7e19a --- /dev/null +++ b/test_e2e/cases/test_05/sacct/sacct @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +cwd=`dirname $0` +cat ${cwd}/sacct_$2.txt diff --git a/test_e2e/cases/test_05/sacct/sacct_1052477.txt 
b/test_e2e/cases/test_05/sacct/sacct_1052477.txt new file mode 100644 index 0000000..d79ebad --- /dev/null +++ b/test_e2e/cases/test_05/sacct/sacct_1052477.txt @@ -0,0 +1,3 @@ +endlyJobStart|petar.jager|account|c|stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown|08:00:00|00:00:14|00:00:14|00:00:00|00:00:00|00:00:00|4G|||||||| +batch||account||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| +extern||account||stg-c2-0|1|RUNNING|2022-05-24T07:43:07|2022-05-24T07:43:07|Unknown||00:00:14|00:00:14|00:00:00|00:00:00|00:00:00||||||||| diff --git a/test_e2e/cases/test_05/sacct/sstat b/test_e2e/cases/test_05/sacct/sstat new file mode 100644 index 0000000..55250bd --- /dev/null +++ b/test_e2e/cases/test_05/sacct/sstat @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +cwd=`dirname $0` +cat ${cwd}/sstat_$2.txt diff --git a/test_e2e/cases/test_05/sacct/sstat_1052477.txt b/test_e2e/cases/test_05/sacct/sstat_1052477.txt new file mode 100644 index 0000000..4146297 --- /dev/null +++ b/test_e2e/cases/test_05/sacct/sstat_1052477.txt @@ -0,0 +1,2 @@ +1052477.extern|0|0|2012|stg-c2-0|stg-c2-0|stg-c2-0| +1052477.batch|344K|36|8267|stg-c2-0|stg-c2-0|stg-c2-0| diff --git a/test_e2e/cases/test_05/slurm_env/slurmenv.sh b/test_e2e/cases/test_05/slurm_env/slurmenv.sh new file mode 100644 index 0000000..9135535 --- /dev/null +++ b/test_e2e/cases/test_05/slurm_env/slurmenv.sh @@ -0,0 +1,28 @@ +#!/usr/bin/bash + +unset SLURM_JOB_NAME +unset SLURM_JOB_GROUP +unset SLURM_JOB_STATE +unset SLURM_ARRAY_JOB_ID +unset SLURM_JOB_WORK_DIR +unset SLURM_JOB_MAIL_TYPE +unset SLURM_JOBID +unset SLURM_ARRAY_TASK_ID +unset SLURM_JOB_RUN_TIME +unset SLURM_ARRAY_TASK_COUNT +unset SLURM_JOB_EXIT_CODE2 +unset SLURM_JOB_DERIVED_EC +unset SLURM_JOB_ID +unset SLURM_JOB_USER +unset SLURM_ARRAY_TASK_MAX +unset SLURM_JOB_EXIT_CODE +unset SLURM_JOB_UID +unset SLURM_JOB_NODELIST +unset SLURM_ARRAY_TASK_MIN +unset SLURM_JOB_STDIN +unset 
SLURM_ARRAY_TASK_STEP +unset SLURM_JOB_EXIT_CODE_MAX +unset SLURM_JOB_GID +unset SLURM_CLUSTER_NAME +unset SLURM_JOB_PARTITION +unset SLURM_JOB_ACCOUNT diff --git a/test_e2e/cases/test_05/test.yaml b/test_e2e/cases/test_05/test.yaml new file mode 100644 index 0000000..5800247 --- /dev/null +++ b/test_e2e/cases/test_05/test.yaml @@ -0,0 +1,65 @@ +init: + test_readme: '${twd}/README.md' + +defaults: + message: "Running test $i from $twd" + systempaths: + - $bwd + +pipeline: + + print_welcome: + description: "Current test" + action: workflow:print + style: 1 + + deploy_conf_files: + action: storage:copy + source: + URL: $twd/conf + dest: + URL: /tmp + + deploy_sacct_files: + action: storage:copy + source: + URL: $twd/sacct + dest: + URL: /tmp + + run_goslmailer: + action: exec:extract + checkError: true + env: + GOSLMAILER_CONF: /tmp/goslmailer.conf + commands: + - command: source $twd/slurm_env/slurmenv.sh + - command: goslmailer -s "Slurm Job_id=1052477 Name=endlyJobStart Began, Queued time 2-00:04:18" petar.jager@imba.oeaw.ac.at + extract: + - key: rfile + regExpr: 'Send successful to file: (rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json)' + required: true + + debug_extract: + action: workflow:print + message: "GOT: $rfile" + + test_diff: + action: exec:run + checkError: true + commands: + - sed -i -e '/"text":"Created /d' $WorkingDirectory()/$rfile + - diff $WorkingDirectory()/$rfile $twd/results/*.json && echo RESULTS MATCH + + test_assert_goslmailer: + action: validator:assert + expect: + - '~/Send successful to file: rendered-1052477-petar.jager@imba.oeaw.ac.at-\d+.json/' + actual: + - $run_goslmailer.Output + + + # todo: + # add test: + # jq . 
rendered.json >/dev/null || echo FAILED + diff --git a/test_e2e/run.yaml b/test_e2e/run.yaml index 90500f7..01ec07c 100644 --- a/test_e2e/run.yaml +++ b/test_e2e/run.yaml @@ -5,14 +5,14 @@ init: pipeline: loop_over_tests: - range: 0..04 + range: 0..05 subPath: cases/test_${index} template: setup_print: action: workflow:print message: "Running case ${index} on path $path" - + print_test_case: description: "Test $index README" action: workflow:print From 9dc76f297bb8acbd70922f89eeca9bf7a0465455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=9Cmit=20Seren?= Date: Wed, 1 Jun 2022 10:42:43 +0200 Subject: [PATCH 3/4] Add more information to the README and move structs to job_data.go. --- README.md | 28 ++++--- internal/slurmjob/getjobcontext.go | 14 ++-- internal/slurmjob/job_data.go | 34 ++++++++ internal/slurmjob/sacct.go | 34 -------- templates/README.md | 78 ++++++++++--------- test_e2e/README.md | 9 +++ .../test_05/conf/adaptive_card_template.json | 2 +- 7 files changed, 107 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index d82289a..388d8bd 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,7 @@ # goslmailer -> **Warning** -> Currently goslmailer will only work on SLURM >= 21.08.x -> -> Work in progress to support older versions of SLURM is tracked here: -https://github.com/CLIP-HPC/goslmailer/issues/4 +> **Info** +> In SLURM < 21.08.x, only a subset of job related information are available as SLURM environment variables in the `adaptive_card_template.json` and `telegramTemplate.html` templates. Instead of the SLURM environment variables (i.e `Job.SlurmEnvironment.SLURM_JOB_USER`) the variables from `SacctMetrics` can be used (i.e. `.Job.JobStats.User`) instead. See the [adaptive_card_template.json](test_e2e/cases/test_05/conf/adaptive_card_template.json) in the `test_e2e/cases/test_05` test case as an example. ## Drop-in notification delivery solution for slurm that can do... 
@@ -21,7 +18,7 @@ https://github.com/CLIP-HPC/goslmailer/issues/4 **Goslmailer** (GoSlurmMailer) is a drop-in replacement [MailProg](https://slurm.schedmd.com/slurm.conf.html#OPT_MailProg) for [slurm](https://slurm.schedmd.com/). -With goslmailer configured as as the slurm mailer, +With goslmailer configured as as the slurm mailer, ``` MailProg = /usr/bin/goslmailer @@ -49,7 +46,7 @@ To support future additional receiver schemes, a [connector package](connectors/ * place [goslmailer.conf](cmd/goslmailer/goslmailer.conf.annotated_example) here: `/etc/slurm/goslmailer.conf` * point slurm `MailProg` to the binary -### gobler +### gobler * place binary to the path of your liking * place [gobler.conf](cmd/gobler/gobler.conf) to the path of your liking @@ -74,7 +71,7 @@ See each connector details below... ## Spooling and throttling of messages - gobler service -In high-throughput clusters or in situations where job/message spikes are common, it might not be advisable to try to send all of the incoming messages as they arrive. +In high-throughput clusters or in situations where job/message spikes are common, it might not be advisable to try to send all of the incoming messages as they arrive. For these environments goslmailer can be configured to spool messages from certain connectors on disk, to be later processed by the **gobler** service. @@ -86,7 +83,7 @@ On startup, gobler reads its config file and spins-up a `connector monitor` for `connector monitor` in turn spins up 3 goroutines: `monitor`, `picker` and `numSenders` x `sender`. 
-* **monitor** : +* **monitor** : * every `monitorT` seconds (or milliseconds) scans the `spoolDir` for new messages and sends them to the **picker** * **picker** : @@ -105,7 +102,7 @@ On startup, gobler reads its config file and spins-up a `connector monitor` for ## Connectors -### default connector +### default connector Specifies which receiver scheme is the default one, in case when user didn't specify `--mail-user` and slurm sent a bare username. @@ -125,6 +122,13 @@ With connector parameters, you can: * template message body * allowList the recipients +To make sure that mutt properly renders the HTML email, add following lines to `/etc/Muttrc.local` + +``` +# Local configuration for Mutt. +set content_type="text/html" +``` + See [annotated configuration example](cmd/goslmailer/goslmailer.conf.annotated_example) --- @@ -135,9 +139,9 @@ Sends **1on1** or **group chat** messages about jobs via [telegram messenger app ![Telegram card](./images/telegram.png) -Prerequisites for the telegram connector: +Prerequisites for the telegram connector: -1. a telegram bot must be created and +1. a telegram bot must be created and 2. the bot daemon service **tgslumbot** must be running. Site admins can [create a telegram bot](https://core.telegram.org/bots#6-botfather) by messaging [botfather](https://t.me/botfather). 
diff --git a/internal/slurmjob/getjobcontext.go b/internal/slurmjob/getjobcontext.go index 5e7bc19..9ef552c 100644 --- a/internal/slurmjob/getjobcontext.go +++ b/internal/slurmjob/getjobcontext.go @@ -142,13 +142,13 @@ func parseSubjectLine(subject string) (*SlurmEnvironment, error) { return nil, errors.New(("Invalid subject line: " + subject)) } jobId = matches[1] - jobName = matches [2] + jobName = matches[2] mailType = matches[3] jobState = matches[4] if jobState == "" { jobState = "PENDING" } - } else if strings.Contains(subject, "Slurm Array Task Job_id") { + } else if strings.Contains(subject, "Slurm Array Task Job_id") { matches := aJob.FindStringSubmatch(subject) if matches == nil { return nil, errors.New(("Invalid subject line: " + subject)) @@ -156,10 +156,10 @@ func parseSubjectLine(subject string) (*SlurmEnvironment, error) { env.SLURM_ARRAY_JOB_ID = matches[1] env.SLURM_ARRAY_TASK_ID = matches[2] jobId = matches[3] - jobName = matches [4] + jobName = matches[4] mailType = matches[5] jobState = matches[6] - if (jobState == "") { + if jobState == "" { jobState = "RUNNING" } } else { @@ -168,7 +168,7 @@ func parseSubjectLine(subject string) (*SlurmEnvironment, error) { return nil, errors.New(("Invalid subject line: " + subject)) } jobId = matches[1] - jobName = matches [2] + jobName = matches[2] mailType = matches[3] jobState = matches[4] if jobState == "" { @@ -184,7 +184,7 @@ func parseSubjectLine(subject string) (*SlurmEnvironment, error) { return env, nil } -func (j *JobContext) UpdateEnvVarsFromSacct(subject string) error { +func (j *JobContext) UpdateEnvVarsFromMailSubject(subject string) error { env, err := parseSubjectLine(subject) if err != nil { return err @@ -200,7 +200,7 @@ func (j *JobContext) GetJobStats(subject string, paths map[string]string, log *l // SLURM < 21.08.x don't have any SLURM envs set, we need to parse the mail subject line, retrieve the jobid and all other information from sacct if j.SlurmEnvironment.SLURM_JOBID == "" { - err 
:= j.UpdateEnvVarsFromSacct(subject) + err := j.UpdateEnvVarsFromMailSubject(subject) if err != nil { return err } diff --git a/internal/slurmjob/job_data.go b/internal/slurmjob/job_data.go index b383f2b..5bd5747 100644 --- a/internal/slurmjob/job_data.go +++ b/internal/slurmjob/job_data.go @@ -38,3 +38,37 @@ type JobContext struct { MailSubject string PrunedMessageCount uint32 } + +type SacctMetrics struct { + JobName string + User string + Account string + Partition string + State string + Ncpus int64 + Nodes int + NodeList string + Submittime string + Starttime string + Endtime string + CPUTimeStr string + CPUTime float64 + TotalCPU float64 + TotalCPUStr string + UserCPU float64 + SystemCPU float64 + ReqMem uint64 + MaxRSS uint64 + Walltime uint64 + WalltimeStr string + Runtime uint64 + RuntimeStr string + MaxDiskWrite uint64 + MaxDiskRead uint64 +} + +type SstatMetrics struct { + MaxRSS uint64 + MaxDiskWrite uint64 + MaxDiskRead uint64 +} diff --git a/internal/slurmjob/sacct.go b/internal/slurmjob/sacct.go index 45d0e81..a8c0ed9 100644 --- a/internal/slurmjob/sacct.go +++ b/internal/slurmjob/sacct.go @@ -10,40 +10,6 @@ import ( "strings" ) -type SacctMetrics struct { - JobName string - User string - Account string - Partition string - State string - Ncpus int64 - Nodes int - NodeList string - Submittime string - Starttime string - Endtime string - CPUTimeStr string - CPUTime float64 - TotalCPU float64 - TotalCPUStr string - UserCPU float64 - SystemCPU float64 - ReqMem uint64 - MaxRSS uint64 - Walltime uint64 - WalltimeStr string - Runtime uint64 - RuntimeStr string - MaxDiskWrite uint64 - MaxDiskRead uint64 -} - -type SstatMetrics struct { - MaxRSS uint64 - MaxDiskWrite uint64 - MaxDiskRead uint64 -} - func parseTime(input string) (float64, uint64, uint64, uint64) { reg := `^(((?P\d+)-)?(?P\d\d):)?(?P\d\d):(?P\d\d(\.\d+)?)$` r := regexp.MustCompile(reg) diff --git a/templates/README.md b/templates/README.md index 81a547a..ba0c091 100644 --- 
a/templates/README.md +++ b/templates/README.md @@ -1,14 +1,16 @@ # Templating guide +> **Info** +> In SLURM < 21.08.x, only a subset of job related information are available as SLURM environment variables in the `adaptive_card_template.json` and `telegramTemplate.html` templates. Instead of the SLURM environment variables (i.e `Job.SlurmEnvironment.SLURM_JOB_USER`) the variables from `SacctMetrics` can be used (i.e. `.Job.JobStats.User`) instead. See the [adaptive_card_template.json](test_e2e/cases/test_05/conf/adaptive_card_template.json) in the `test_e2e/cases/test_05` test case as an example. + Goslmailer uses golang [text/template](https://pkg.go.dev/text/template) and [html/template](https://pkg.go.dev/html/template) libraries. -The connectors call `renderer.RenderTemplate` function. +The connectors call `renderer.RenderTemplate` function. Data structure you can reference in the template can be found in: * [rendererer.go](../internal/renderer/renderer.go) * [job_data.go](../internal/slurmjob/job_data.go) -* [sacct.go](../internal/slurmjob/sacct.go) Example: @@ -21,48 +23,48 @@ Structures: ``` struct { - Job slurmjob.JobContext - UserID string - Created string + Job slurmjob.JobContext + UserID string + Created string } type JobContext struct { - SlurmEnvironment - JobStats SacctMetrics - Hints []string - MailSubject string - PrunedMessageCount uint32 + SlurmEnvironment + JobStats SacctMetrics + Hints []string + MailSubject string + PrunedMessageCount uint32 } type SlurmEnvironment struct { - SLURM_ARRAY_JOB_ID string - SLURM_ARRAY_TASK_COUNT string - SLURM_ARRAY_TASK_ID string - SLURM_ARRAY_TASK_MAX string - SLURM_ARRAY_TASK_MIN string - SLURM_ARRAY_TASK_STEP string - SLURM_CLUSTER_NAME string - SLURM_JOB_ACCOUNT string - SLURM_JOB_DERIVED_EC string - SLURM_JOB_EXIT_CODE string - SLURM_JOB_EXIT_CODE2 string - SLURM_JOB_EXIT_CODE_MAX string - SLURM_JOB_EXIT_CODE_MIN string - SLURM_JOB_GID string - SLURM_JOB_GROUP string - SLURM_JOBID string - SLURM_JOB_ID 
string - SLURM_JOB_MAIL_TYPE string - SLURM_JOB_NAME string - SLURM_JOB_NODELIST string - SLURM_JOB_PARTITION string - SLURM_JOB_QUEUED_TIME string - SLURM_JOB_RUN_TIME string - SLURM_JOB_STATE string - SLURM_JOB_STDIN string - SLURM_JOB_UID string - SLURM_JOB_USER string - SLURM_JOB_WORK_DIR string + SLURM_ARRAY_JOB_ID string + SLURM_ARRAY_TASK_COUNT string + SLURM_ARRAY_TASK_ID string + SLURM_ARRAY_TASK_MAX string + SLURM_ARRAY_TASK_MIN string + SLURM_ARRAY_TASK_STEP string + SLURM_CLUSTER_NAME string + SLURM_JOB_ACCOUNT string + SLURM_JOB_DERIVED_EC string + SLURM_JOB_EXIT_CODE string + SLURM_JOB_EXIT_CODE2 string + SLURM_JOB_EXIT_CODE_MAX string + SLURM_JOB_EXIT_CODE_MIN string + SLURM_JOB_GID string + SLURM_JOB_GROUP string + SLURM_JOBID string + SLURM_JOB_ID string + SLURM_JOB_MAIL_TYPE string + SLURM_JOB_NAME string + SLURM_JOB_NODELIST string + SLURM_JOB_PARTITION string + SLURM_JOB_QUEUED_TIME string + SLURM_JOB_RUN_TIME string + SLURM_JOB_STATE string + SLURM_JOB_STDIN string + SLURM_JOB_UID string + SLURM_JOB_USER string + SLURM_JOB_WORK_DIR string } type SacctMetrics struct { diff --git a/test_e2e/README.md b/test_e2e/README.md index 0c1c3e4..efe6c05 100644 --- a/test_e2e/README.md +++ b/test_e2e/README.md @@ -3,6 +3,9 @@ 1. [test_00](./cases/test_00/README.md) 2. [test_01](./cases/test_01/README.md) 3. [test_02](./cases/test_02/README.md) +4. [test_03](./cases/test_03/README.md) +5. [test_04](./cases/test_04/README.md) +6. 
[test_05](./cases/test_05/README.md) --- @@ -38,4 +41,10 @@ Job start goslmailer render msteams json to file (actual data) Job end - fail + +## test_05 +--- + +Test goslmailer on SLURM versions (< 21.08.x) that don't set the job information as environment variables + --- diff --git a/test_e2e/cases/test_05/conf/adaptive_card_template.json b/test_e2e/cases/test_05/conf/adaptive_card_template.json index a7cf9e3..42f6c42 100644 --- a/test_e2e/cases/test_05/conf/adaptive_card_template.json +++ b/test_e2e/cases/test_05/conf/adaptive_card_template.json @@ -138,7 +138,7 @@ { "type":"Fact", "title":"Used Walltime", - "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}" + "value":"{{ .Job.JobStats.RuntimeStr }}" }, {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }} { From b738a6a2d403fa084148cf43461902a5694a00d5 Mon Sep 17 00:00:00 2001 From: "Jager,Petar" Date: Wed, 1 Jun 2022 14:35:41 +0200 Subject: [PATCH 4/4] minor fixes, readme, logger, test case 05 --- README.md | 37 ++++++++++++++++++++++++++++-- VERSION | 2 +- internal/slurmjob/getjobcontext.go | 20 ++++++++-------- internal/slurmjob/job_data.go | 16 ++++++------- internal/slurmjob/sacct.go | 26 ++++++++++----------- internal/slurmjob/sacct_test.go | 8 +++++-- templates/README.md | 2 +- test_e2e/cases/test_02/test.yaml | 2 +- test_e2e/cases/test_05/sacct/sacct | 0 test_e2e/cases/test_05/sacct/sstat | 0 10 files changed, 75 insertions(+), 38 deletions(-) mode change 100644 => 100755 test_e2e/cases/test_05/sacct/sacct mode change 100644 => 100755 test_e2e/cases/test_05/sacct/sstat diff --git a/README.md b/README.md index 388d8bd..83117e8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # goslmailer > **Info** -> In SLURM < 21.08.x, only a subset of job related information are available as SLURM environment variables in the `adaptive_card_template.json` and `telegramTemplate.html` templates. 
Instead of the SLURM environment variables (i.e `Job.SlurmEnvironment.SLURM_JOB_USER`) the variables from `SacctMetrics` can be used (i.e. `.Job.JobStats.User`) instead. See the [adaptive_card_template.json](test_e2e/cases/test_05/conf/adaptive_card_template.json) in the `test_e2e/cases/test_05` test case as an example. +> Now also works with SLURM < 21.08 +> +> For templating differences between SLURM >= 21.08 and SLURM < 21.08 see [templating guide](./templates/README.md) ## Drop-in notification delivery solution for slurm that can do... @@ -40,10 +42,41 @@ To support future additional receiver schemes, a [connector package](connectors/ ## Installation +### Build + +#### Quick version, without end to end testing + +``` +git clone https://github.com/CLIP-HPC/goslmailer.git && cd goslmailer +make test +make build +make install +``` + +#### Slightly more involved, with end to end testing: + +Prerequisites: + +1. generated RSA keypair (passwordless) (`ssh-keygen -t rsa`) +2. `ssh $USER@localhost` must work without password + +Known caveats: + +* redhat/centos: must have lsb_release binary installed, package: `redhat-lsb-core` +* ubuntu 22: `set enable-bracketed-paste off` present in `~/.inputrc` +* maybe/maybe not, depends if you see failed tests: `export TERM=dumb` in `~/.bashrc` :) + +``` +# downloads endly binary and runs endly tests +make +``` + + ### goslmailer * place binary to the path of your liking -* place [goslmailer.conf](cmd/goslmailer/goslmailer.conf.annotated_example) here: `/etc/slurm/goslmailer.conf` +* place [goslmailer.conf](cmd/goslmailer/goslmailer.conf.annotated_example) here: `/etc/slurm/goslmailer.conf` (default path) + * OR: anywhere else, but then run the binary with `GOSLMAILER_CONF=/path/to/gosl.conf` in environment * point slurm `MailProg` to the binary ### gobler diff --git a/VERSION b/VERSION index ef4b6a2..a4b6ac3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v2.1.5 +v2.2.0 diff --git a/internal/slurmjob/getjobcontext.go 
b/internal/slurmjob/getjobcontext.go index 9ef552c..54dbaad 100644 --- a/internal/slurmjob/getjobcontext.go +++ b/internal/slurmjob/getjobcontext.go @@ -194,9 +194,9 @@ func (j *JobContext) UpdateEnvVarsFromMailSubject(subject string) error { } // Get additional job statistics from external source (e.g. jobinfo or sacct) -func (j *JobContext) GetJobStats(subject string, paths map[string]string, log *log.Logger) error { - log.Print("Start retrieving job stats") - log.Printf("%#v", j.SlurmEnvironment) +func (j *JobContext) GetJobStats(subject string, paths map[string]string, l *log.Logger) error { + l.Print("Start retrieving job stats") + l.Printf("%#v", j.SlurmEnvironment) // SLURM < 21.08.x don't have any SLURM envs set, we need to parse the mail subject line, retrieve the jobid and all other information from sacct if j.SlurmEnvironment.SLURM_JOBID == "" { @@ -217,8 +217,8 @@ func (j *JobContext) GetJobStats(subject string, paths map[string]string, log *l if j.SlurmEnvironment.SLURM_ARRAY_JOB_ID != "" { jobId = j.SlurmEnvironment.SLURM_ARRAY_JOB_ID } - log.Printf("Fetch job info %s", jobId) - jobStats, err := GetSacctMetrics(jobId, paths, log) + l.Printf("Fetch job info %s", jobId) + jobStats, err := GetSacctMetrics(jobId, paths, l) if err != nil { return err } @@ -226,17 +226,17 @@ func (j *JobContext) GetJobStats(subject string, paths map[string]string, log *l counter := 0 for !IsJobFinished(j.JobStats.State) && j.JobStats.State != j.SlurmEnvironment.SLURM_JOB_STATE && counter < 5 { time.Sleep(2 * time.Second) - jobStats, err = GetSacctMetrics(jobId, paths, log) + jobStats, err = GetSacctMetrics(jobId, paths, l) if err != nil { - return fmt.Errorf("Failed to Get job stats: %w", err) + return fmt.Errorf("failed to get job stats: %w", err) } j.JobStats = *jobStats counter += 1 } if j.JobStats.State == "RUNNING" { - log.Print("Update job with live stats") - updateJobStatsWithLiveData(&j.JobStats, jobId, log, paths) + l.Print("Update job with live stats") + 
updateJobStatsWithLiveData(&j.JobStats, jobId, paths, l) } - log.Printf("Finished retrieving job stats") + l.Printf("Finished retrieving job stats") return nil } diff --git a/internal/slurmjob/job_data.go b/internal/slurmjob/job_data.go index 5bd5747..e3f3c80 100644 --- a/internal/slurmjob/job_data.go +++ b/internal/slurmjob/job_data.go @@ -1,5 +1,13 @@ package slurmjob +type JobContext struct { + SlurmEnvironment + JobStats SacctMetrics + Hints []string + MailSubject string + PrunedMessageCount uint32 +} + type SlurmEnvironment struct { SLURM_ARRAY_JOB_ID string SLURM_ARRAY_TASK_COUNT string @@ -31,14 +39,6 @@ type SlurmEnvironment struct { SLURM_JOB_WORK_DIR string } -type JobContext struct { - SlurmEnvironment - JobStats SacctMetrics - Hints []string - MailSubject string - PrunedMessageCount uint32 -} - type SacctMetrics struct { JobName string User string diff --git a/internal/slurmjob/sacct.go b/internal/slurmjob/sacct.go index a8c0ed9..f3f89eb 100644 --- a/internal/slurmjob/sacct.go +++ b/internal/slurmjob/sacct.go @@ -77,7 +77,7 @@ func ParseSstatMetrics(input []byte) (*SstatMetrics, error) { return &metrics, nil } -func ParseSacctMetrics(input []byte) (*SacctMetrics, error) { +func ParseSacctMetrics(input []byte, l *log.Logger) (*SacctMetrics, error) { var metrics SacctMetrics if len(input) == 0 { return &metrics, nil @@ -159,7 +159,7 @@ func ParseSacctMetrics(input []byte) (*SacctMetrics, error) { metrics.RuntimeStr = split[11] metrics.Runtime = uint64(parseCpuTime(split[11])) - log.Printf("Metrics: %#v", metrics) + l.Printf("Metrics: %#v", metrics) return &metrics, nil } @@ -177,24 +177,24 @@ func (m SacctMetrics) CalcSystemComputePercentage() float64 { return 0.0 } -func GetSacctMetrics(jobId string, paths map[string]string, log *log.Logger) (*SacctMetrics, error) { - sacctMetrics, err := GetSacctData(jobId, paths, log) +func GetSacctMetrics(jobId string, paths map[string]string, l *log.Logger) (*SacctMetrics, error) { + sacctMetrics, err := 
GetSacctData(jobId, paths, l) if err != nil { return nil, err } - return ParseSacctMetrics(sacctMetrics) + return ParseSacctMetrics(sacctMetrics, l) } -func GetSstatMetrics(jobId string, paths map[string]string, log *log.Logger) (*SstatMetrics, error) { - sstatMetrics, err := GetSacctData(jobId, paths, log) +func GetSstatMetrics(jobId string, paths map[string]string, l *log.Logger) (*SstatMetrics, error) { + sstatMetrics, err := GetSacctData(jobId, paths, l) if err != nil { return nil, err } return ParseSstatMetrics(sstatMetrics) } -func updateJobStatsWithLiveData(metrics *SacctMetrics, jobId string, log *log.Logger, paths map[string]string) { - liveMetrics, err := GetSstatMetrics(jobId, paths, log) +func updateJobStatsWithLiveData(metrics *SacctMetrics, jobId string, paths map[string]string, l *log.Logger) { + liveMetrics, err := GetSstatMetrics(jobId, paths, l) if err == nil { if liveMetrics.MaxRSS > 0 { @@ -210,22 +210,22 @@ func updateJobStatsWithLiveData(metrics *SacctMetrics, jobId string, log *log.Lo } // Execute the saccct command and return its output -func GetSacctData(jobId string, paths map[string]string, log *log.Logger) ([]byte, error) { +func GetSacctData(jobId string, paths map[string]string, l *log.Logger) ([]byte, error) { formatLine := "JobName,User,Account,Partition,NodeList,ncpus,State,Submit,start,end,timelimit,elapsed,CPUTime,TotalCPU,UserCPU,SystemCPU,ReqMem,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode,Comment" cmd := exec.Command(paths["sacct"], "-j", jobId, "-n", "-p", "--format", formatLine) output, err := cmd.CombinedOutput() if err != nil { - return nil, fmt.Errorf("Failed to execute sacct command: %w", err) + return nil, fmt.Errorf("failed to execute sacct command: %w", err) } return output, nil } -func GetSstatData(jobId string, paths map[string]string, log *log.Logger) ([]byte, error) { +func GetSstatData(jobId string, paths map[string]string, l *log.Logger) ([]byte, error) { formatLine := 
"JobID,MaxRSS,MaxDiskWrite,MaxDiskRead,MaxRSSNode,MaxDiskWriteNode,MaxDiskReadNode" cmd := exec.Command(paths["sstat"], "-a", "-j", jobId, "-n", "-p", "--format", formatLine) output, err := cmd.CombinedOutput() if err != nil { - return nil, fmt.Errorf("Failed to execute sstat command: %w", err) + return nil, fmt.Errorf("failed to execute sstat command: %w", err) } return output, nil } diff --git a/internal/slurmjob/sacct_test.go b/internal/slurmjob/sacct_test.go index 5cdd5ce..d0296df 100644 --- a/internal/slurmjob/sacct_test.go +++ b/internal/slurmjob/sacct_test.go @@ -1,7 +1,9 @@ package slurmjob import ( + "bytes" "io/ioutil" + "log" "os" "testing" ) @@ -13,7 +15,8 @@ func TestParseSacctMetrics(t *testing.T) { t.Fatalf("Can not open test data: %v", err) } data, _ := ioutil.ReadAll(file) - metrics, _ := ParseSacctMetrics(data) + l := log.New(&bytes.Buffer{}, "Testing: ", log.Llongfile) + metrics, _ := ParseSacctMetrics(data, l) t.Logf("%+v", metrics) if metrics.JobName != "JobName" { @@ -119,7 +122,8 @@ func TestParseSstatMetrics(t *testing.T) { func TestParseSacctMetricsEmptyInput(t *testing.T) { // Read the input data from a file - metrics, _ := ParseSacctMetrics([]byte("")) + l := log.New(&bytes.Buffer{}, "Testing: ", log.Llongfile) + metrics, _ := ParseSacctMetrics([]byte(""), l) var emptyMetrics SacctMetrics t.Logf("%+v", metrics) diff --git a/templates/README.md b/templates/README.md index ba0c091..f952ef1 100644 --- a/templates/README.md +++ b/templates/README.md @@ -9,8 +9,8 @@ The connectors call `renderer.RenderTemplate` function. 
Data structure you can reference in the template can be found in: -* [rendererer.go](../internal/renderer/renderer.go) * [job_data.go](../internal/slurmjob/job_data.go) +* [rendererer.go](../internal/renderer/renderer.go) Example: diff --git a/test_e2e/cases/test_02/test.yaml b/test_e2e/cases/test_02/test.yaml index 688502c..9f1556e 100644 --- a/test_e2e/cases/test_02/test.yaml +++ b/test_e2e/cases/test_02/test.yaml @@ -39,6 +39,6 @@ pipeline: test_assert_goslmailer: action: validator:assert expect: - - '/Unable to retrieve job stats. Error: Failed to execute sacct command: exit status 1/' + - '/Unable to retrieve job stats. Error: failed to execute sacct command: exit status 1/' actual: - $run_goslmailer.Output diff --git a/test_e2e/cases/test_05/sacct/sacct b/test_e2e/cases/test_05/sacct/sacct old mode 100644 new mode 100755 diff --git a/test_e2e/cases/test_05/sacct/sstat b/test_e2e/cases/test_05/sacct/sstat old mode 100644 new mode 100755