From c916e7dfca8337549c2648cacd32b61be1e05353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Tue, 10 Oct 2023 17:42:26 +0100 Subject: [PATCH] Workaround: avoid sleep(1) call by crypt_r MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using an off-cpu flamegraph I identified that concurrent PAM calls are slow due to a call to `sleep(1)`. `pam_authenticate` calls `crypt_r` which calls `NSSLOW_Init` which on first use will try to initialize the just `dlopen`-ed library. If it encounters a race condition it does a `sleep(1)`. This race condition can be quite reliably reproduced when performing a lot of PAM authentications from multiple threads in parallel. GDB can also be used to confirm this by putting a breakpoint on `sleep`: ``` #0 __sleep (seconds=seconds@entry=1) at ../sysdeps/unix/sysv/linux/sleep.c:42 #1 0x00007ffff1548e22 in freebl_RunLoaderOnce () at lowhash_vector.c:122 #2 0x00007ffff1548f31 in freebl_InitVector () at lowhash_vector.c:131 #3 NSSLOW_Init () at lowhash_vector.c:148 #4 0x00007ffff1b8f09a in __sha512_crypt_r (key=key@entry=0x7fffd8005a60 "pamtest-edvint", salt=0x7ffff31e17b8 "dIJbsXKc0", #5 0x00007ffff1b8d070 in __crypt_r (key=key@entry=0x7fffd8005a60 "pamtest-edvint", salt=, #6 0x00007ffff1dc9abc in verify_pwd_hash (p=p@entry=0x7fffd8005a60 "pamtest-edvint", hash=, nullok=nullok@entry=0) at passverify.c:111 #7 0x00007ffff1dc9139 in _unix_verify_password (pamh=pamh@entry=0x7fffd8002910, name=0x7fffd8002ab0 "pamtest-edvint", p=0x7fffd8005a60 "pamtest-edvint", ctrl=ctrl@entry=8389156) at support.c:777 #8 0x00007ffff1dc6556 in pam_sm_authenticate (pamh=0x7fffd8002910, flags=, argc=, argv=) at pam_unix_auth.c:178 #9 0x00007ffff7bcef1a in _pam_dispatch_aux (use_cached_chain=, resumed=, h=, flags=1, pamh=0x7fffd8002910) at pam_dispatch.c:110 #10 _pam_dispatch (pamh=pamh@entry=0x7fffd8002910, flags=1, choice=choice@entry=1) at pam_dispatch.c:426 #11 0x00007ffff7bce7e0 in pam_authenticate 
(pamh=0x7fffd8002910, flags=flags@entry=1) at pam_auth.c:34 #12 0x00000000005ae567 in XA_mh_authorize (username=username@entry=0x7fffd80028d0 "pamtest-edvint", password=password@entry=0x7fffd80028f0 "pamtest-edvint", error=error@entry=0x7ffff31e1be8) at xa_auth.c:83 #13 0x00000000005adf20 in stub_XA_mh_authorize (username=, password=) at xa_auth_stubs.c:42 #14 0x00000000004a0a6a in camlDune__exe__Bench_pam__pam_authenticate$27_320 () at ocaml/tests/bench/pam/bench_pam.ml:63 #15 0x00000000004a1113 in camlEzbechamel_concurrent__worker_loop_359 () at ocaml/tests/bench/lib/concurrent/ezbechamel_concurrent.ml:36 #16 0x00000000005935b9 in camlStdlib__Fun__protect_317 () #17 0x00000000004a1955 in camlThread__fun_850 () #18 0x00000000005d6401 in caml_start_program () #19 0x00000000005cd0fd in caml_callback_exn () #20 0x00000000005af810 in caml_thread_start () #21 0x00007ffff79b7e25 in start_thread (arg=0x7ffff31e2700) at pthread_create.c:308 #22 0x00007ffff71dbbad in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113 ``` `pam_start` and `pam_end` don't help here, because on `pam_end` the library is `dlclose`-ed, so on the next `pam_authenticate` it will have to go through the initialization code again. (This initialization code should have belonged in `pam_start`, not `pam_authenticate`, but there are several layers here, including a call to `crypt_r`.) To avoid this, link with `libcrypt` and call `crypt_r` once ourselves (ensuring that it loads `libfreeblpriv3` by using the sha512 prefix). That way the library will stay loaded (we'll hold a reference count on it), and the `dlclose` done by PAM won't unload it. 
Confirmed that there are no `sleep` calls now; the improvement is also visible when running the benchmark against the code with and without the fix: ``` ╭─────────────────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├─────────────────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ concurrent authenticate (sleep fix, actual):8 │ 0.0000 mjw/run│ 50.0000 mnw/run│ 27043467.0000 ns/run│ ╰─────────────────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ ╭────────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├────────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ concurrent authenticate (no reuse):8 │ 0.0000 mjw/run│ 50.0000 mnw/run│ 1029831372.0000 ns/run│ ╰────────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ ``` Without this fix, using 2 threads to perform PAM authentication would result in a 38x slowdown compared to using no threads at all (which is what XAPI currently does). 
Signed-off-by: Edwin Török --- ocaml/auth/dune | 2 +- ocaml/auth/pam.ml | 2 ++ ocaml/auth/xa_auth_stubs.c | 22 ++++++++++++++++++++-- ocaml/tests/bench/pam/bench_pam.ml | 7 ++----- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/ocaml/auth/dune b/ocaml/auth/dune index b2bdfd78b1d..0cbeb9156db 100644 --- a/ocaml/auth/dune +++ b/ocaml/auth/dune @@ -4,7 +4,7 @@ (names xa_auth xa_auth_stubs) ) (name pam) - (c_library_flags -lpam) + (c_library_flags -lpam -lcrypt) (libraries threads.posix) (wrapped false) ) \ No newline at end of file diff --git a/ocaml/auth/pam.ml b/ocaml/auth/pam.ml index 963f155e580..6650d9453c6 100644 --- a/ocaml/auth/pam.ml +++ b/ocaml/auth/pam.ml @@ -50,3 +50,5 @@ let authorize_stop t = let authorize_run t username password = let handle = check_handle t in authorize_run handle username password + +external workaround : unit -> unit = "stub_XA_workaround" diff --git a/ocaml/auth/xa_auth_stubs.c b/ocaml/auth/xa_auth_stubs.c index 86850e181b8..ba8673646c9 100644 --- a/ocaml/auth/xa_auth_stubs.c +++ b/ocaml/auth/xa_auth_stubs.c @@ -11,8 +11,9 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. */ -/* - */ + +/* must be at the beginning, it affects defines in other headers that cannot be reenabled later */ +#define _GNU_SOURCE #include #include @@ -142,6 +143,23 @@ CAMLprim value stub_XA_mh_authorize_run(value ml_handle, value username, value p caml_failwith(error ? error : "Unknown error"); CAMLreturn(ret); } + +#include +CAMLprim value stub_XA_workaround(value u) +{ + CAMLparam1(u); + struct crypt_data data; + memset(&data, 0, sizeof(data)); + + /* When called with '$6$' it will call sha512_crypt_r which will call NSSLOW_Init, which initializes the library, + and avoids the sleep() call that would otherwise happen when the library is initialized in parallel. + We don't want to link with libfreebl3 directly, because in the future we might switch to using libxcrypt. 
+ */ + crypt_r("", "$6$", &data); + + CAMLreturn(Val_unit); +} + /* * Local variables: * mode: C diff --git a/ocaml/tests/bench/pam/bench_pam.ml b/ocaml/tests/bench/pam/bench_pam.ml index 0a440b50b4f..db5775b7b21 100644 --- a/ocaml/tests/bench/pam/bench_pam.ml +++ b/ocaml/tests/bench/pam/bench_pam.ml @@ -38,15 +38,12 @@ let pam_run h = Pam.authorize_run h username password let sleepfix_start () = + (* FIXME: this adds a 5s pause on startup, any way to initialize the code but not incurr the fail delay? *) (* To avoid the sleep(1) in the libgcrypt/NSS initialization code that gets called from Pam.authorize_run create and run a fake auth command, and keep the handle open *) + Pam.workaround (); let h = Pam.authorize_start () in - let () = - try Pam.authorize_run h "" "" - with Failure _ -> () - in - (* h is not valid to use anymore! *) h let sleepfix_stop = Pam.authorize_stop