From 1b1554cd72c0476f78ad4e718b8181df191fabe3 Mon Sep 17 00:00:00 2001 From: Alexander Barth Date: Fri, 5 Jan 2024 16:13:46 +0100 Subject: [PATCH] update benchmarks --- test/perf/test_perf_cdm.jl | 30 ++++++++----- test/perf/test_perf_xarray.py | 80 ++++++++++++++++++----------------- 2 files changed, 61 insertions(+), 49 deletions(-) diff --git a/test/perf/test_perf_cdm.jl b/test/perf/test_perf_cdm.jl index 249907d..f4b2e4c 100644 --- a/test/perf/test_perf_cdm.jl +++ b/test/perf/test_perf_cdm.jl @@ -1,27 +1,37 @@ +# Benchmark to be run on Linux as root + using BenchmarkTools using NCDatasets using Dates using CommonDataModel: @groupby +using CommonDataModel fname = expanduser("~/sample_perf2.nc") ds = NCDataset(fname) -v = ds[:data] +data_f64 = Float64.(ds[:data][:,:,:]) + +println("runtime") +gm = @btime begin + write("/proc/sys/vm/drop_caches","3") + mean(@groupby(ds[:data],Dates.Month(time)))[:,:,:]; +end + +# Welford +gs = @btime begin + write("/proc/sys/vm/drop_caches","3") + std(@groupby(ds[:data],Dates.Month(time)))[:,:,:]; +end + +println("accuracy") mean_ref = cat( - [mean(v[:,:,findall(Dates.month.(ds[:time][:]) .== m)],dims=3) + [mean(data_f64[:,:,findall(Dates.month.(ds[:time][:]) .== m)],dims=3) for m in 1:12]...,dims=3); std_ref = cat( - [std(v[:,:,findall(Dates.month.(ds[:time][:]) .== m)],dims=3) + [std(data_f64[:,:,findall(Dates.month.(ds[:time][:]) .== m)],dims=3) for m in 1:12]...,dims=3); - -gm = @btime mean(@groupby(ds[:data],Dates.Month(time)))[:,:,:]; -# 1.005 s (523137 allocations: 2.67 GiB) - @show sqrt(mean((gm - mean_ref).^2)) - -# Welford -gs = @btime std(@groupby(ds[:data],Dates.Month(time)))[:,:,:]; @show sqrt(mean((gs - std_ref).^2)) diff --git a/test/perf/test_perf_xarray.py b/test/perf/test_perf_xarray.py index 315f42b..5ee1cec 100644 --- a/test/perf/test_perf_xarray.py +++ b/test/perf/test_perf_xarray.py @@ -1,63 +1,65 @@ +# Benchmark to be run on Linux as root +# dropping file cache is OS specific and requires root privileges + 
import timeit import xarray as xr import numpy +import os +import sys -# xarray-2023.12.0 -# Python 3.10.12 - -# mean -# minimum runtime of 30 trials -# 0.7370511470362544 seconds -# std -# 3.9330708980560303 seconds -tests = [ - """vm = ds["data"].groupby("time.month").mean().to_numpy();""", - """vm = ds["data"].groupby("time.month").std().to_numpy();""", - ] +def mean_no_cache(ds): + with open("/proc/sys/vm/drop_caches","w") as f: + f.write("3") + vm = ds["data"].groupby("time.month").mean().to_numpy(); +def std_no_cache(ds): + with open("/proc/sys/vm/drop_caches","w") as f: + f.write("3") -print("runtime") + vm = ds["data"].groupby("time.month").std().to_numpy(); -for tt in tests: - t = timeit.repeat(tt, - setup=""" -import xarray as xr -fname = "/home/abarth/sample_perf2.nc" +fname = os.path.expanduser("~/sample_perf2.nc") ds = xr.open_dataset(fname) -""", - number=1, - repeat=30, - ) - print("timeit ",min(t),tt) +print("python: ",sys.version) +print("xarray: ",xr.__version__) +print("numpy: ",numpy.__version__) -fname = "/home/abarth/sample_perf2.nc" -ds = xr.open_dataset(fname) +if __name__ == "__main__": + print("runtime") + + for test_fun in [mean_no_cache, std_no_cache]: + t = timeit.repeat(lambda: test_fun(ds), + setup="""from __main__ import ds""", + number=1, + repeat=30, + ) + + print(" minimum time of ",test_fun,": ",min(t)) -month = ds["time.month"].to_numpy() -print("accuracy") + month = ds["time.month"].to_numpy() + print("accuracy") -mean_ref = numpy.stack( - [ds["data"].data[(month == mm).nonzero()[0],:,:].mean(axis=0) for mm in range(1,13)],axis=0) + data_f64 = ds["data"].data.astype(dtype="f8") -std_ref = numpy.stack( - [ds["data"].data[(month == mm).nonzero()[0],:,:].std(axis=0,ddof=1) for mm in range(1,13)],axis=0) + mean_ref = numpy.stack( + [data_f64[(month == mm).nonzero()[0],:,:].mean(axis=0) for mm in range(1,13)],axis=0) + std_ref = numpy.stack( + [data_f64[(month == mm).nonzero()[0],:,:].std(axis=0,ddof=1) for mm in 
range(1,13)],axis=0) -vm = ds["data"].groupby("time.month").mean().to_numpy(); -print("accuracy of mean", - numpy.sqrt(numpy.mean((mean_ref - vm)**2))) -# output 0 + vm = ds["data"].groupby("time.month").mean().to_numpy(); -vs = ds["data"].groupby("time.month").std() + print(" accuracy of mean", + numpy.sqrt(numpy.mean((mean_ref - vm)**2))) + vs = ds["data"].groupby("time.month").std().to_numpy() -print("accuracy of std", - numpy.sqrt(numpy.mean((std_ref - vs)**2))) -# 0.00053720415 + print(" accuracy of std", + numpy.sqrt(numpy.mean((std_ref - vs)**2)))