From 6dd8183a8603c5eedd09007553058f1d36ee7ae7 Mon Sep 17 00:00:00 2001 From: Alexander Demin Date: Sat, 27 Jan 2024 00:00:06 +0300 Subject: [PATCH] upd fglm benchmarks --- Project.toml | 1 + benchmark/CI-scripts/run_benchmarks.jl | 7 + experimental/example-maybe-bug.jl | 14 +- experimental/fglmtest.jl | 17 + experimental/lexcmp.jl | 217 ++++++++++++ experimental/native_vec4-assume.txt | 93 +++++ experimental/native_vec4.txt | 93 +++++ experimental/test_fglm.jl | 22 ++ src/Groebner.jl | 20 +- src/arithmetic/Zp.jl | 47 +-- src/f4/f4.jl | 1 + src/f4/hashtable.jl | 2 +- src/f4/linalg/backend.jl | 34 +- src/fglm/fglm.jl | 54 +++ src/fglm/fglm_internal.jl | 111 +++++- src/fglm/linear.jl | 92 +++-- src/input-output/input-output.jl | 17 +- src/interface.jl | 27 ++ src/monomials/exponentvector.jl | 72 ++-- src/utils/invariants.jl | 2 +- src/utils/keywords.jl | 19 +- src/utils/simd.jl | 460 +++++++++++++++++++++++++ test/arithmetic/Zp.jl | 31 +- test/monoms/monom_orders.jl | 45 ++- test/reconstruction/ratrec.jl | 47 +-- 25 files changed, 1281 insertions(+), 264 deletions(-) create mode 100644 experimental/fglmtest.jl create mode 100644 experimental/lexcmp.jl create mode 100644 experimental/native_vec4-assume.txt create mode 100644 experimental/native_vec4.txt create mode 100644 experimental/test_fglm.jl create mode 100644 src/utils/simd.jl diff --git a/Project.toml b/Project.toml index c0335cec..10beedaa 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ AbstractAlgebra = "c3fe647b-3220-5bb0-a1ea-a7954cac585d" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +HostCPUFeatures = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MultivariatePolynomials = "102ac46a-7ee4-5c85-9060-abc95bfdeaa3" Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a" diff --git a/benchmark/CI-scripts/run_benchmarks.jl b/benchmark/CI-scripts/run_benchmarks.jl index dac5ed00..97c96187 100644 --- a/benchmark/CI-scripts/run_benchmarks.jl +++ b/benchmark/CI-scripts/run_benchmarks.jl @@ -67,6 +67,13 @@ push!( result=compute_gb(Groebner.katsuran(10, ordering=:degrevlex, k=GF(2^27 + 29)), 5) ) ) +push!( + suite, + ( + problem_name="groebner, AA, GF(2^30+3), katsura 11", + result=compute_gb(Groebner.katsuran(11, ordering=:degrevlex, k=GF(2^30 + 3)), 3) + ) +) push!( suite, ( diff --git a/experimental/example-maybe-bug.jl b/experimental/example-maybe-bug.jl index 73de135e..a7074665 100644 --- a/experimental/example-maybe-bug.jl +++ b/experimental/example-maybe-bug.jl @@ -11,11 +11,11 @@ end @info "" nthreads() @show ENV["JULIA_NUM_THREADS"] -Groebner.logging_enabled() = false +Groebner.logging_enabled() = true Groebner.invariants_enabled() = false Groebner.performance_counters_enabled() = false -s = Groebner.katsuran(9, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3)); +s = Groebner.noonn(8, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3)); trace, gb = Groebner.groebner_learn(s); @btime Groebner.groebner($s); @btime Groebner.groebner_apply!($trace, $s); @@ -24,15 +24,15 @@ trace, gb = Groebner.groebner_learn(s); @btime Groebner.groebner_apply!($trace, $((s, s, s, s, s, s, s, s))); #= - 113.912 ms (34041 allocations: 43.78 MiB) + 1.048 s (418167 allocations: 258.02 MiB) - 44.803 ms (18887 allocations: 24.46 MiB) + 142.140 ms (126985 allocations: 87.63 MiB) - 52.580 ms (20607 allocations: 35.38 MiB) + 185.635 ms (134610 allocations: 134.37 MiB) - 64.618 ms (23276 allocations: 59.42 MiB) + 283.075 ms (146988 allocations: 224.29 MiB) - 125.247 ms (28610 allocations: 107.47 MiB) + 433.356 ms (171889 allocations: 404.00 MiB) =# @profview Groebner.groebner_apply!(trace, ((s, s, s, s))); diff --git a/experimental/fglmtest.jl b/experimental/fglmtest.jl new file mode 100644 index 00000000..66a2e8b1 --- /dev/null +++ b/experimental/fglmtest.jl @@ -0,0 +1,17 @@ +using AbstractAlgebra, Groebner + +R, (x, y, z, t) = AbstractAlgebra.polynomial_ring(AbstractAlgebra.QQ, ["x", "y", "z", "t"]) +sys = [ + y^2 * z + 2 * x * y * t - 2 * x - z, + -x^3 * z + 4 * x * y^2 * z + 4 * x^2 * y * t + 2 * y^3 * t + 4 * x^2 - 10 * y^2 + + 4 * x * z - 10 * y * t + 2, + 2 * y * z * t + x * t^2 - x - 2 * z, + -x * z^3 + 4 * y * z^2 * t + 4 * x * z * t^2 + 2 * y * t^3 + 4 * x * z + 4 * z^2 - + 10 * y * t - 10 * t^2 + 2 +] + +gb_lex = Groebner.groebner(sys, ordering=Groebner.Lex()) +gb_drl = Groebner.groebner(sys, ordering=Groebner.DegRevLex()) +gb_fglm = Groebner.fglm(gb_drl, Groebner.DegRevLex(), Groebner.Lex()) + +@info "" gb_fglm diff --git a/experimental/lexcmp.jl b/experimental/lexcmp.jl new file mode 100644 index 00000000..7058b77b --- /dev/null +++ b/experimental/lexcmp.jl @@ -0,0 +1,217 @@ +using BenchmarkTools + +using HostCPUFeatures +using HostCPUFeatures: + register_size, + pick_vector_width, + pick_vector_width_shift, + simd_integer_register_size, + fma_fast, + has_feature, + register_count, + cpu_name, + register_size + +######### + +_setup1(n) = begin + x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + x, y +end +_setup2(n) = begin + x = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n)) + y = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n)) + x, y +end +_setup3(T, n) = begin + s = rand(T.([0, 0, 0, 1, 2, 3]), 3) + x = Groebner.monom_construct_from_vector( + Groebner.ExponentVector{T}, + vcat(zeros(T, n), s) + ) + y = Groebner.monom_construct_from_vector( + Groebner.ExponentVector{T}, + vcat(zeros(T, n), reverse(s)) + ) + z = similar(x) + @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y) + z, x, y +end + +######### + +begin + n, step = 1, 5 + while n < 500 + @info "n = $n" + print("Groebner.monom_is_equal\t\t") + @btime Groebner.monom_is_equal(xx, yy) setup = begin + cc, xx, yy = _setup3(Int8, max(1, $n)) + end + print("Groebner.monom_copy\t\t") + @btime Groebner.monom_copy(xx) setup = begin + cc, xx, yy = _setup3(Int8, max(1, $n)) + end + print("Groebner.monom_is_divisible\t") + @btime Groebner.monom_is_divisible(xx, yy) setup = begin + cc, xx, yy = _setup3(Int8, $n) + end + print("Groebner.monom_product!\t\t") + @btime Groebner.monom_product!(cc, xx, yy) setup = begin + cc, xx, yy = _setup3(Int8, max(1, $n)) + end + print("Groebner.monom_lcm!\t\t") + @btime Groebner.monom_lcm!(cc, xx, yy) setup = begin + cc, xx, yy = _setup3(Int8, max(1, $n)) + end + print("Groebner.monom_is_gcd_const\t") + @btime Groebner.monom_is_gcd_const(xx, yy) setup = begin + cc, xx, yy = _setup3(Int8, $n) + end + print("Groebner.monom_isless:lex\t") + @btime Groebner.monom_isless(xx, yy, _ord) setup = begin + cc, xx, yy = _setup3(Int8, max(1, $n)) + _ord = Groebner._Lex{true}(ones(Int, length(xx))) + end + print("Groebner.monom_isless:drl\t") + @btime Groebner.monom_isless(xx, yy, _ord) setup = begin + cc, xx, yy = map(reverse, _setup3(Int8, max(1, $n))) + xx[1] = xx[end] + yy[1] = yy[end] + @assert Groebner.monom_totaldeg(xx) == Groebner.monom_totaldeg(yy) + _ord = Groebner._DegRevLex{true}(ones(Int, length(xx))) + end + n += step + step = ceil(Int, step * 1.2) + end +end + +_setup4(n) = begin + s = rand(UInt8.([0, 0, 0, 1, 2, 3]), 2) + a, b = vcat(zeros(UInt8, n), s), vcat(zeros(UInt8, n), reverse(s)) + a, b = reverse(a), reverse(b) + x = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, a) + y = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, b) + vT(n) = + if n + 2 < 8 + Groebner.PackedTuple1 + elseif n + 2 < 16 + Groebner.PackedTuple2 + elseif n + 2 < 24 + Groebner.PackedTuple3 + end + xpacked = Groebner.monom_construct_from_vector(vT(n){UInt64, UInt8}, a) + ypacked = Groebner.monom_construct_from_vector(vT(n){UInt64, UInt8}, b) + x, y, xpacked, ypacked +end +begin + n, step = 1, 3 + while n < 22 + @info "n = $n" + print("Groebner.monom_isless:drl:packed\t") + @btime Groebner.monom_isless(xpacked, ypacked, _ord) setup = begin + x, y, xpacked, ypacked = _setup4($n) + _ord = Groebner._DegRevLex{true}(ones(Int, length(x))) + @assert Groebner.monom_totaldeg(xpacked) == Groebner.monom_totaldeg(ypacked) + tmp1, tmp2 = + Vector{Int8}(undef, length(x) - 1), Vector{Int8}(undef, length(x) - 1) + @assert Groebner.monom_to_vector!(tmp1, x) == + Groebner.monom_to_vector!(tmp2, xpacked) + @assert Groebner.monom_to_vector!(tmp1, y) == + Groebner.monom_to_vector!(tmp2, ypacked) + end + print("Groebner.monom_isless:drl:expvect\t") + @btime Groebner.monom_isless(x, y, _ord) setup = begin + x, y, xpacked, ypacked = _setup4($n) + _ord = Groebner._DegRevLex{true}(ones(Int, length(x))) + @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y) + end + n += step + end +end + +begin + n, step = 1, 5 + while n < 500 + @info "n = $n" + for _ in 1:1_000 + x, y = _setup3(n) + res1 = vector_are_orth(x, y) + res2 = _vec_check_orth(x, y) + @assert res1 == res2 + end + @btime vector_are_orth(xx, yy) setup = begin + xx, yy = _setup3($n) + end + @btime _vec_check_orth(xx, yy) setup = begin + xx, yy = _setup3($n) + end + n += step + step = ceil(Int, step * 1.2) + end +end + +begin + _setup1(n) = begin + x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + x, y + end + _setup2(n) = begin + x = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n)) + y = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n)) + x, y + end + _setup3(n) = begin + x = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n)) + y = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n)) + x, y + end + n, step = 1, 5 + while n < 500 + @info "n = $n" + n += step + step = ceil(Int, step * 1.2) + for _ in 1:100 + x, y = _setup3(n) + res1 = vector_any_lt(x, y) + res2 = vector_any_lt_simd(x, y) + @assert res1 == res2 + end + @btime vector_any_lt(xx, yy) setup = begin + xx, yy = _setup3($n) + end + @btime vector_any_lt_simd(xx, yy) setup = begin + xx, yy = _setup3($n) + end + end +end + +######### + +begin + _setup1(n) = begin + x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n) + similar(x), x, y + end + n, step = 1, 5 + while n < 500 + @info "n = $n" + n += step + step = ceil(Int, step * 1.2) + for _ in 1:100 + c, x, y = _setup1(n) + res1 = vector_emax_1!(copy(c), x, y) + res2 = vector_emax_2!(copy(c), x, y) + @assert res1 == res2 + end + @btime vector_emax_1!(cc, xx, yy) setup = begin + cc, xx, yy = _setup1($n) + end + @btime vector_emax_2!(cc, xx, yy) setup = begin + cc, xx, yy = _setup1($n) + end + end +end diff --git a/experimental/native_vec4-assume.txt b/experimental/native_vec4-assume.txt new file mode 100644 index 00000000..47865572 --- /dev/null +++ b/experimental/native_vec4-assume.txt @@ -0,0 +1,93 @@ + .text + .file "mod_p" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function julia_mod_p_61774 +.LCPI0_0: + .quad -9223372036854775808 # 0x8000000000000000 + .text + .globl julia_mod_p_61774 + .p2align 4, 0x90 + .type julia_mod_p_61774,@function +julia_mod_p_61774: # @julia_mod_p_61774 + .cfi_startproc +# %bb.0: # %top + push rbp + .cfi_def_cfa_offset 16 + .cfi_offset rbp, -16 + mov rbp, rsp + .cfi_def_cfa_register rbp + push rsi + push rdi + .cfi_offset rdi, -32 + .cfi_offset rsi, -24 + mov r9, rdx + mov r10, rcx + vpmovsxbq ymm2, dword ptr [r8 + 96] + vpmovzxbq ymm1, dword ptr [r8 + 100] # ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero + mov r11, qword ptr [r8 + 80] + mov rcx, qword ptr [r8 + 72] + mov rax, qword ptr [r8 + 64] + imul qword ptr [rdx] + mov rdi, rdx + mov rax, rcx + imul qword ptr [r9 + 8] + mov rcx, rdx + mov rax, r11 + imul qword ptr [r9 + 16] + mov rsi, rdx + vmovdqu ymm0, ymmword ptr [r9] + mov rax, qword ptr [r8 + 88] + imul qword ptr [r9 + 24] + vmovq xmm3, rdx + vmovq xmm4, rsi + vpunpcklqdq xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0] + vmovq xmm4, rcx + vmovq xmm5, rdi + vpunpcklqdq xmm4, xmm5, xmm4 # xmm4 = xmm5[0],xmm4[0] + vinserti128 ymm3, ymm4, xmm3, 1 + vpsrlq ymm4, ymm0, 32 + vpmuludq ymm4, ymm4, ymm2 + vpsrlq ymm5, ymm2, 32 + vpmuludq ymm5, ymm0, ymm5 + vpaddq ymm4, ymm5, ymm4 + vpsllq ymm4, ymm4, 32 + vpmuludq ymm2, ymm0, ymm2 + vpaddq ymm2, ymm2, ymm3 + vpaddq ymm2, ymm4, ymm2 + vpxor xmm3, xmm3, xmm3 + vpsrlvq ymm4, ymm2, ymm1 + movabs rax, offset .LCPI0_0 + vpbroadcastq ymm5, qword ptr [rax] + vpsrlvq ymm1, ymm5, ymm1 + vpxor ymm4, ymm4, ymm1 + vpsubq ymm1, ymm4, ymm1 + vpsrlq ymm2, ymm2, 63 + vpaddq ymm1, ymm1, ymm2 + vmovdqu ymm2, ymmword ptr [r8] + vpsrlq ymm4, ymm2, 32 + vpmuludq ymm4, ymm1, ymm4 + vpsrlq ymm5, ymm1, 32 + vpmuludq ymm5, ymm5, ymm2 + vpaddq ymm4, ymm4, ymm5 + vpsllq ymm4, ymm4, 32 + vpmuludq ymm1, ymm1, ymm2 + vpaddq ymm1, ymm1, ymm4 + vpsubq ymm0, ymm0, ymm1 + vpcmpgtq ymm1, ymm3, ymm0 + vpand ymm1, ymm1, ymm2 + vpaddq ymm0, ymm1, ymm0 + vpcmpgtq ymm1, ymm0, ymm2 + vpand ymm1, ymm1, ymm2 + vpsubq ymm0, ymm0, ymm1 + vmovdqu ymmword ptr [r10], ymm0 + mov rax, r10 + pop rdi + pop rsi + pop rbp + vzeroupper + ret +.Lfunc_end0: + .size julia_mod_p_61774, .Lfunc_end0-julia_mod_p_61774 + .cfi_endproc + # -- End function + .section ".note.GNU-stack","",@progbits diff --git a/experimental/native_vec4.txt b/experimental/native_vec4.txt new file mode 100644 index 00000000..9ca239cb --- /dev/null +++ b/experimental/native_vec4.txt @@ -0,0 +1,93 @@ + .text + .file "mod_p" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function julia_mod_p_67625 +.LCPI0_0: + .quad -9223372036854775808 # 0x8000000000000000 + .text + .globl julia_mod_p_67625 + .p2align 4, 0x90 + .type julia_mod_p_67625,@function +julia_mod_p_67625: # @julia_mod_p_67625 + .cfi_startproc +# %bb.0: # %top + push rbp + .cfi_def_cfa_offset 16 + .cfi_offset rbp, -16 + mov rbp, rsp + .cfi_def_cfa_register rbp + push rsi + push rdi + .cfi_offset rdi, -32 + .cfi_offset rsi, -24 + mov r9, rdx + mov r10, rcx + vpmovsxbq ymm2, dword ptr [r8 + 96] + vpmovzxbq ymm1, dword ptr [r8 + 100] # ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero + mov r11, qword ptr [r8 + 80] + mov rcx, qword ptr [r8 + 72] + mov rax, qword ptr [r8 + 64] + imul qword ptr [rdx] + mov rdi, rdx + mov rax, rcx + imul qword ptr [r9 + 8] + mov rcx, rdx + mov rax, r11 + imul qword ptr [r9 + 16] + mov rsi, rdx + vmovdqu ymm0, ymmword ptr [r9] + mov rax, qword ptr [r8 + 88] + imul qword ptr [r9 + 24] + vmovq xmm3, rdx + vmovq xmm4, rsi + vpunpcklqdq xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0] + vmovq xmm4, rcx + vmovq xmm5, rdi + vpunpcklqdq xmm4, xmm5, xmm4 # xmm4 = xmm5[0],xmm4[0] + vinserti128 ymm3, ymm4, xmm3, 1 + vpsrlq ymm4, ymm0, 32 + vpmuludq ymm4, ymm4, ymm2 + vpsrlq ymm5, ymm2, 32 + vpmuludq ymm5, ymm0, ymm5 + vpaddq ymm4, ymm5, ymm4 + vpsllq ymm4, ymm4, 32 + vpmuludq ymm2, ymm0, ymm2 + vpaddq ymm2, ymm2, ymm3 + vpaddq ymm2, ymm4, ymm2 + vpxor xmm3, xmm3, xmm3 + vpsrlvq ymm4, ymm2, ymm1 + movabs rax, offset .LCPI0_0 + vpbroadcastq ymm5, qword ptr [rax] + vpsrlvq ymm1, ymm5, ymm1 + vpxor ymm4, ymm4, ymm1 + vpsubq ymm1, ymm4, ymm1 + vpsrlq ymm2, ymm2, 63 + vpaddq ymm1, ymm1, ymm2 + vmovdqu ymm2, ymmword ptr [r8] + vpsrlq ymm4, ymm2, 32 + vpmuludq ymm4, ymm1, ymm4 + vpsrlq ymm5, ymm1, 32 + vpmuludq ymm5, ymm5, ymm2 + vpaddq ymm4, ymm4, ymm5 + vpsllq ymm4, ymm4, 32 + vpmuludq ymm1, ymm1, ymm2 + vpaddq ymm1, ymm1, ymm4 + vpsubq ymm0, ymm0, ymm1 + vpcmpgtq ymm1, ymm3, ymm0 + vpand ymm1, ymm1, ymm2 + vpaddq ymm0, ymm1, ymm0 + vpcmpgtq ymm1, ymm0, ymm2 + vpand ymm1, ymm1, ymm2 + vpsubq ymm0, ymm0, ymm1 + vmovdqu ymmword ptr [r10], ymm0 + mov rax, r10 + pop rdi + pop rsi + pop rbp + vzeroupper + ret +.Lfunc_end0: + .size julia_mod_p_67625, .Lfunc_end0-julia_mod_p_67625 + .cfi_endproc + # -- End function + .section ".note.GNU-stack","",@progbits diff --git a/experimental/test_fglm.jl b/experimental/test_fglm.jl new file mode 100644 index 00000000..bd8b80e1 --- /dev/null +++ b/experimental/test_fglm.jl @@ -0,0 +1,22 @@ +if !isdefined(Main, :Groebner) + using Groebner +end +using AbstractAlgebra + +sys = Groebner.noonn(8, k=GF(2^26 + 15)) + +R, (x, y) = polynomial_ring(GF(2^26 + 15), ["x", "y"]) +sys = [x * y + 1, y^200 + 1] + +# gb_lex = Groebner.groebner(sys, ordering=Groebner.Lex()) +gb_drl = Groebner.groebner(sys, ordering=Groebner.DegRevLex()); + +minpoly_last = Groebner.fglm_residuals_in_batch( + gb_drl, + Groebner.DegRevLex(), + Groebner.Lex(), + statistics=:timings, + loglevel=0 +) + +gb_fglm = Groebner.fglm(gb_drl, Groebner.DegRevLex(), Groebner.Lex(), statistics=:timings); diff --git a/src/Groebner.jl b/src/Groebner.jl index 19e6ded1..6481c725 100644 --- a/src/Groebner.jl +++ b/src/Groebner.jl @@ -1,7 +1,7 @@ -module Groebner -# Groebner is a package for computing Gröbner bases. This is the main file. -# Groebner is distributed under GNU GPL v2 starting from Groebner v0.6.0. +# This file is a part of Groebner.jl. License is GNU GPL v2. +# Groebner.jl is a package for computing Gröbner bases. This is the main file. +module Groebner # Groebner works over integers modulo a prime and over the rationals. At its # heart, Groebner implements F4, multi-modular techniques, and tracing. @@ -33,11 +33,11 @@ invariants_enabled() = false logging_enabled() -> Bool Specifies if logging is enabled. If `false`, then all logging in Groebner is -disabled, and entails no runtime overhead. +disabled, and entails **(almost)** no runtime overhead. See also `@log` in `src/utils/logging.jl`. """ -logging_enabled() = false +logging_enabled() = true """ performance_counters_enabled() -> Bool @@ -45,9 +45,9 @@ logging_enabled() = false If performance-tracking macro `@timeit` should be enabled in Groebner. When this is `false`, all performance counters in Groebner are disabled and -entail no runtime overhead. +entail **(almost)** no runtime overhead. """ -performance_counters_enabled() = false +performance_counters_enabled() = true ### # Imports @@ -70,6 +70,9 @@ import Combinatorics using ExprTools +import HostCPUFeatures: + cpu_name, register_count, register_size, has_feature, pick_vector_width, fma_fast + using Logging import MultivariatePolynomials @@ -124,7 +127,8 @@ include("utils/invariants.jl") include("utils/timeit.jl") # Provides the macro `@stat` for collecting statistics include("utils/statistics.jl") - +# For fast and specific vector arithmetic +include("utils/simd.jl") # include("utils/versioninfo.jl") # Minimalistic plotting with Unicode diff --git a/src/arithmetic/Zp.jl b/src/arithmetic/Zp.jl index 8882cc50..cb3f90c6 100644 --- a/src/arithmetic/Zp.jl +++ b/src/arithmetic/Zp.jl @@ -7,8 +7,8 @@ # The arithmetic is parametrized by the type of the accumulator and the type of # the coefficient. Generally, AccumType should be picked so that the result of # a + b*c -# is representable exactly in AccumType for feasible a,b,c of type CoeffType. -# One example is AccumType = UInt64 and CoeffType = UInt32 with prime moduli +# is representable exactly in AccumType for feasible a,b,c of type CoeffType. +# One example is AccumType = UInt64 and CoeffType = UInt32 with prime moduli. abstract type AbstractArithmetic{AccumType, CoeffType} end # All implementations of arithmetic in Zp are a subtype of this. @@ -48,7 +48,7 @@ end divisor(arithm::ArithmeticZp) = arithm.magic.divisor -mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = a % arithm.magic +@inline mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = a % arithm.magic inv_mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = invmod(a, divisor(arithm)) @@ -86,7 +86,7 @@ struct SpecializedArithmeticZp{AccumType, CoeffType, Add} <: uinv::UnsignedMultiplicativeInverse{AccumType} ) where {AccumType <: CoeffZp, CoeffType <: CoeffZp} # Further in the code we need the guarantee that the shift is < 64 - @invariant uinv.shift < 64 + @invariant uinv.shift < 8 * sizeof(AccumType) new{AccumType, CoeffType, uinv.add}(uinv.multiplier, uinv.shift, uinv.divisor) end end @@ -94,11 +94,11 @@ end divisor(arithm::SpecializedArithmeticZp) = arithm.divisor # Returns the higher half of the product a*b -function _mul_high(a::T, b::T) where {T <: Union{Signed, Unsigned}} +@inline function _mul_high(a::T, b::T) where {T <: Union{Signed, Unsigned}} ((widen(a) * b) >>> (sizeof(a) * 8)) % T end -function _mul_high(a::UInt128, b::UInt128) +@inline function _mul_high(a::UInt128, b::UInt128) shift = sizeof(a) * 4 mask = typemax(UInt128) >> shift a1, a2 = a >>> shift, a & mask @@ -111,18 +111,19 @@ end # NOTE: the compiler sometimes fails to simd this operation for T ≥ UInt32. # # a modulo p (addition specialization) -function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, true}) where {T, C} +@inline function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, true}) where {T, C} x = _mul_high(a, mod.multiplier) x = convert(T, convert(T, (convert(T, a - x) >>> UInt8(1))) + x) - # Bit shifts in Julia check that the shift is less than 64. - # The assumption allows us to bypass this check. - unsafe_assume(mod.shift < 64) + # Bit shifts in Julia check that the shift value is less than the bitsize of + # the argument. + # The assumption here allows us to bypass this check. + unsafe_assume(mod.shift < 8 * sizeof(T)) a - (x >>> mod.shift) * mod.divisor end # a modulo p (no addition specialization) -function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, false}) where {T, C} +@inline function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, false}) where {T, C} x = _mul_high(a, mod.multiplier) - unsafe_assume(mod.shift < 64) + unsafe_assume(mod.shift < 8 * sizeof(T)) a - (x >>> mod.shift) * mod.divisor end @@ -159,7 +160,7 @@ struct DelayedArithmeticZp{AccumType, CoeffType, Add} <: ::Type{CoeffType}, uinv::UnsignedMultiplicativeInverse{AccumType} ) where {AccumType <: CoeffZp, CoeffType <: CoeffZp} - @invariant uinv.shift < 64 + @invariant uinv.shift < 8 * sizeof(AccumType) new{AccumType, CoeffType, uinv.add}(uinv.multiplier, uinv.shift, uinv.divisor) end end @@ -177,16 +178,16 @@ function n_safe_consecutive_additions(arithm::DelayedArithmeticZp{T}) where {T < end # a modulo p (addition specialization) -function mod_p(a::A, mod::DelayedArithmeticZp{A, T, true}) where {A, T} +@inline function mod_p(a::A, mod::DelayedArithmeticZp{A, T, true}) where {A, T} x = _mul_high(a, mod.multiplier) x = convert(A, convert(A, (convert(A, a - x) >>> UInt8(1))) + x) - unsafe_assume(mod.shift < 64) + unsafe_assume(mod.shift < 8 * sizeof(T)) a - (x >>> mod.shift) * mod.divisor end # a modulo p (no addition specialization) -function mod_p(a::A, mod::DelayedArithmeticZp{A, T, false}) where {A, T} +@inline function mod_p(a::A, mod::DelayedArithmeticZp{A, T, false}) where {A, T} x = _mul_high(a, mod.multiplier) - unsafe_assume(mod.shift < 64) + unsafe_assume(mod.shift < 8 * sizeof(T)) a - (x >>> mod.shift) * mod.divisor end @@ -275,17 +276,17 @@ struct SignedArithmeticZp{AccumType, CoeffType} <: @invariant Primes.isprime(p) pa = convert(AccumType, p) magic = Base.MultiplicativeInverses.SignedMultiplicativeInverse(pa) - @invariant magic.shift < 64 + @invariant magic.shift < 8 * sizeof(AccumType) new{AccumType, CoeffType}(pa, pa * pa, magic.multiplier, magic.addmul, magic.shift) end end divisor(arithm::SignedArithmeticZp) = arithm.p -function mod_p(a::T, mod::SignedArithmeticZp{T}) where {T} +@inline function mod_p(a::T, mod::SignedArithmeticZp{T}) where {T} x = _mul_high(a, mod.multiplier) x += a * mod.addmul - unsafe_assume(mod.shift < 64) + unsafe_assume(mod.shift < 8 * sizeof(T)) d = (signbit(x) + (x >> mod.shift)) % T res = a - d * mod.p ifelse(res >= zero(T), res, res + mod.p) @@ -317,7 +318,7 @@ struct SignedCompositeArithmeticZp{AccumType, CoeffType, T, N} <: multipliers = map(a -> a.multiplier, arithms) addmuls = map(a -> a.addmul, arithms) shifts = map(a -> a.shift, arithms) - @invariant all(x -> x < 64, shifts) + @invariant all(x -> x < 8 * sizeof(AT), shifts) p2s = CompositeInt{N, AT}(ps) * CompositeInt{N, AT}(ps) new{CompositeInt{N, AT}, CompositeInt{N, CT}, AT, N}( CompositeInt{N, AT}(ps), @@ -331,13 +332,13 @@ end divisor(arithm::SignedCompositeArithmeticZp) = arithm.ps -function mod_p( +@inline function mod_p( a::CompositeInt{N, T}, arithm::SignedCompositeArithmeticZp{CompositeInt{N, T}, U, W, N} ) where {N, T, U, W} x = _mul_high.(a.data, arithm.multipliers.data) x = x .+ a.data .* arithm.addmuls.data - # unsafe_assume(all(x -> x < 64, arithm.shifts.data)) + unsafe_assume(all(arithm.shifts.data .< (8 * sizeof(T)))) d = (signbit.(x) .+ (x .>> arithm.shifts.data)) .% T res = a.data .- d .* arithm.ps.data res = ifelse.(res .>= T(0), res, res .+ arithm.ps.data) diff --git a/src/f4/f4.jl b/src/f4/f4.jl index 76ff3fe5..0ea94093 100644 --- a/src/f4/f4.jl +++ b/src/f4/f4.jl @@ -99,6 +99,7 @@ end # make sure the pairset has enough space pairset_resize_if_needed!(pairset, npairs) pairset_size = length(pairset.pairs) + # update pairset: # for each new element in basis.. diff --git a/src/f4/hashtable.jl b/src/f4/hashtable.jl index ef67c495..7a2a54d0 100644 --- a/src/f4/hashtable.jl +++ b/src/f4/hashtable.jl @@ -128,7 +128,7 @@ function hashtable_initialize( dmbits = 8 * sizeof(DivisionMask) compress_divmask = nvars > dmbits @assert dmbits == 32 - @log level = -5 "Using division masks: $use_divmask of $dmbits bits. Compressed: $compress_divmask" + @log level = -5 "Using division masks: $use_divmask. Using $dmbits bits. Compressed: $compress_divmask" ndivbits = div(dmbits, nvars) # division mask stores at least 1 bit # per each of first ndivvars variables diff --git a/src/f4/linalg/backend.jl b/src/f4/linalg/backend.jl index a96271cf..78cdc4c4 100644 --- a/src/f4/linalg/backend.jl +++ b/src/f4/linalg/backend.jl @@ -819,14 +819,14 @@ function linalg_normalize_row!( arithmetic::AbstractArithmeticZp{A, T}, first_nnz_index::Int=1 ) where {A <: Union{CoeffZp, CompositeCoeffZp}, T <: Union{CoeffZp, CompositeCoeffZp}} - @invariant @inbounds !iszero(row[first_nnz_index]) + @invariant !iszero(row[first_nnz_index]) lead = row[first_nnz_index] isone(lead) && return row @inbounds pinv = inv_mod_p(A(lead), arithmetic) % T - @inbounds row[1] = one(T) - @inbounds for i in 2:length(row) + @inbounds row[first_nnz_index] = one(T) + @inbounds for i in (first_nnz_index + 1):length(row) row[i] = mod_p(A(row[i]) * A(pinv), arithmetic) % T end @@ -839,7 +839,7 @@ function linalg_normalize_row!( arithmetic::AbstractArithmeticQQ{T}, first_nnz_index::Int=1 ) where {T <: CoeffQQ} - @invariant @inbounds !iszero(row[first_nnz_index]) + @invariant !iszero(row[first_nnz_index]) lead = row[first_nnz_index] isone(lead) && return row @@ -865,8 +865,6 @@ function linalg_vector_addmul_sparsedense_mod_p!( @invariant length(indices) == length(coeffs) @invariant !isempty(indices) - # unsafe_assume(!isempty(indices)) - @inbounds mul = divisor(arithmetic) - row[indices[1]] @inbounds for j in 1:length(indices) idx = indices[j] @@ -890,8 +888,6 @@ function linalg_vector_addmul_sparsedense!( @invariant length(indices) == length(coeffs) @invariant !isempty(indices) - # unsafe_assume(!isempty(indices)) - @inbounds mul = divisor(arithmetic) - row[indices[1]] @inbounds for j in 1:length(indices) idx = indices[j] @@ -915,8 +911,6 @@ function linalg_vector_addmul_sparsedense!( @invariant length(indices) == length(coeffs) @invariant !isempty(indices) - # unsafe_assume(!isempty(indices)) - # NOTE: mul is guaranteed to be < typemax(T) p2 = arithmetic.p2 @inbounds mul = row[indices[1]] % T @@ -943,8 +937,6 @@ function linalg_vector_addmul_sparsedense!( @invariant length(indices) == length(coeffs) @invariant !isempty(indices) - # unsafe_assume(!isempty(indices)) - # NOTE: mul is guaranteed to be < typemax(T) p2 = arithmetic.p2s @inbounds mul = row[indices[1]].data .% T @@ -1108,21 +1100,3 @@ function linalg_extract_sparse_row!( j - 1 end - -# TODO: not really important atm -# function extract_sparse_row_2!(vals::Vector{UInt}, dense::Vector{UInt}) -# j = 1 -# z = Vec{4, UInt}((0, 0, 0, 0)) -# @inbounds for i in 1:8:length(dense) -# v1 = SIMD.vload(Vec{4, UInt}, dense, i) -# v2 = SIMD.vload(Vec{4, UInt}, dense, i + 4) -# mask1 = !iszero(v1) -# mask2 = !iszero(v2) -# j1 = sum(mask1) -# j2 = sum(mask2) -# vstorec(v1, vals, j, mask1) -# vstorec(v2, vals, j + j1, mask2) -# j += j1 + j2 -# end -# nothing -# end diff --git a/src/fglm/fglm.jl b/src/fglm/fglm.jl index 100aff82..a8fb4c5b 100644 --- a/src/fglm/fglm.jl +++ b/src/fglm/fglm.jl @@ -48,3 +48,57 @@ function _fglm1( new_basis, _, new_ht = fglm_main!(ring, basis, ht, ordering_to, params) basis_export_data(new_basis, new_ht) end + +### +### + +function _fglm_residuals_in_batch(polynomials, ordering_from, ordering_to, kws) + ordering_from == ordering_to && return polynomials + + representation = + io_select_polynomial_representation(polynomials, kws, hint=:large_exponents) + ring, var_to_index, monoms, coeffs = + io_convert_to_internal(representation, polynomials, kws) + params = + AlgorithmParameters(ring, representation, kws, orderings=(ring.ord, ordering_from)) + if isempty(monoms) + @log level = -2 "Input consisting of zero polynomials." + throw(DomainError("Input consisting of zero polynomials to Groebner.fglm.")) + return io_convert_to_output(ring, polynomials, monoms, coeffs, params) + end + if kws.check + @log level = -2 "Checking if input is a Grobner basis" + # TODO this is, perhaps, broken! + if !isgroebner(polynomials, ordering=ordering_from, certify=false) + throw(DomainError("Input is not a Groebner basis.")) + end + end + + ring, _ = io_set_monomial_ordering!(ring, var_to_index, monoms, coeffs, params) + params = AlgorithmParameters( + ring, + representation, + kws, + orderings=(ordering_from, ordering_to) + ) + + _ordering_to = io_convert_to_internal_monomial_ordering(var_to_index, ordering_to) + new_monoms, new_coeffs = + _fglm_residuals_in_batch_1(ring, monoms, coeffs, _ordering_to, params) + + res = io_convert_to_output(ring, polynomials, new_monoms, new_coeffs, params) + + res +end + +@timeit function _fglm_residuals_in_batch_1( + ring::PolyRing, + monoms::Vector{Vector{M}}, + coeffs::Vector{Vector{C}}, + ordering_to, + params::AlgorithmParameters +) where {M <: Monom, C <: Coeff} + basis, _, ht = f4_initialize_structs(ring, monoms, coeffs, params) + new_basis, _, new_ht = _fglm_residuals_in_batch!(ring, basis, ht, ordering_to, params) + basis_export_data(new_basis, new_ht) +end diff --git a/src/fglm/fglm_internal.jl b/src/fglm/fglm_internal.jl index fcefc4d9..b7827eae 100644 --- a/src/fglm/fglm_internal.jl +++ b/src/fglm/fglm_internal.jl @@ -12,9 +12,10 @@ end Base.isempty(m::MonomialEnumerator) = m.load == 0 function enumerator_initialize(ht::MonomialHashtable{M}, ord) where {M} + hashtable_resize_if_needed!(ht, 1 << 6) zz = monom_construct_const_monom(M, ht.nvars) vidx = hashtable_insert!(ht, zz) - monoms = Vector{MonomId}(undef, 2^3) + monoms = Vector{MonomId}(undef, 1 << 3) monoms[1] = vidx load = 1 MonomialEnumerator{typeof(ord)}(monoms, load, Dict{MonomId, Int}(vidx => 1), ord) @@ -26,10 +27,9 @@ function enumerator_produce_next_monomials!( monom::MonomId ) where {M} while m.load + ht.nvars >= length(m.monoms) - resize!(m.monoms, length(m.monoms) * 2) + resize!(m.monoms, max(ht.nvars * 2, length(m.monoms) * 2)) end - # TODO: maybe an error, uncomment this line later - # hashtable_resize_if_needed!(ht, ht.nvars) + hashtable_resize_if_needed!(ht, ht.nvars) emonom = ht.monoms[monom] @inbounds for i in 1:(ht.nvars) @@ -51,6 +51,7 @@ function enumerator_produce_next_monomials!( end function enumerator_next_monomial!(m::MonomialEnumerator) + @assert m.load > 0 monom = m.monoms[m.load] m.load -= 1 monom @@ -82,9 +83,7 @@ function staircase_divides_monom(monom, staircase, ht) false end -const _seen = Dict() - -function fglm_main!( +@timeit function fglm_main!( ring::PolyRing, basis::Basis{C}, ht::MonomialHashtable{M}, @@ -96,7 +95,6 @@ function fglm_main!( from ordering: $(ring.ord) to ordering: $ord""" - empty!(_seen) monom_enumerator = enumerator_initialize(ht, ord) new_basis = basis_initialize(ring, basis.nfilled, C) matrix = wide_matrix_initialize(basis) @@ -115,20 +113,16 @@ function fglm_main!( continue end - if haskey(_seen, ht.monoms[monom]) - throw("This monomial has been processed before !") - end - _seen[ht.monoms[monom]] = 1 # Compute the normal form of the monomial w.r.t. by constructing and # echelonizing the F4 matrix. - to_be_reduced = basis_initialize(ring, [[monom]], [C[1]]) + to_be_reduced = basis_initialize(ring, [[monom]], [[C(1)]]) to_be_reduced.nfilled = 1 # ! This call takes most the time f4_normalform!(ring, basis, to_be_reduced, ht, params.arithmetic) # Search for a linear relation between all the computed normal forms relation_exists, relation = - find_linear_relation!(ring, matrix, monom, to_be_reduced, ht, params.arithmetic) + find_linear_relation!(matrix, monom, to_be_reduced, params.arithmetic) if relation_exists # Add the monomial to the staircase, and add the pre-image of the @@ -145,3 +139,92 @@ function fglm_main!( linbasis = extract_linear_basis(ring, matrix) new_basis, linbasis, ht end + +const _batchsize = Ref{Int}(100) + +function _fglm_residuals_in_batch!( + ring::PolyRing, + basis::Basis{C}, + ht::MonomialHashtable{M}, + ord::AbstractInternalOrdering, + params::AlgorithmParameters +) where {M <: Monom, C <: Coeff} + @log level = -2 """ + Applying FGLM to convert a basis + from ordering: $(ring.ord) + to ordering: $ord""" + + new_basis = basis_initialize(ring, basis.nfilled, C) + matrix = wide_matrix_initialize(basis) + monom_staircase = Vector{MonomId}() + + I = 0 + + # For "each" monomial, from the smallest to the largest + while true + # Get the next monomial + mons, coefs = Vector{Vector{MonomId}}(), Vector{Vector{C}}() + + filled = 0 + + for i in 1:_batchsize[] + ev = zeros(UInt32, ht.nvars) + ev[end] = I + I += 1 + # print("$I, ") + monom_ev = monom_construct_from_vector(M, ev) + monom = hashtable_insert!(ht, monom_ev) + + # If the monomial is divisible by any of the elements of the existing + # staircase, then throw the monomial away + if staircase_divides_monom(monom, monom_staircase, ht) + @log level = -4 "Monomial discarded, id = $monom" + continue + end + + push!(mons, [monom]) + push!(coefs, [C(1)]) + filled += 1 + end + # println() + iszero(filled) && continue + mons_copy = deepcopy(mons) + to_be_reduced = basis_initialize(ring, mons, coefs) + to_be_reduced.nfilled = filled + + # ! This call takes most the time + f4_normalform!(ring, basis, to_be_reduced, ht, params.arithmetic) + + # Search for a linear relation between all the computed normal forms + + for i in 1:filled + _to_be_reduced = + basis_initialize(ring, [to_be_reduced.monoms[i]], [to_be_reduced.coeffs[i]]) + _to_be_reduced.nfilled = 1 + relation_exists, relation = find_linear_relation!( + matrix, + mons_copy[i][1], + _to_be_reduced, + params.arithmetic + ) + + # println("i=$i -- monom=$(ht.monoms[mons_copy[i][1]]) -- $relation_exists") + # println([to_be_reduced.monoms[i]], [to_be_reduced.coeffs[i]]) + + if relation_exists + # Add the monomial to the staircase, and add the pre-image of the + # relation to the new basis + push!(monom_staircase, mons_copy[i][1]) + basis_add_generator!(new_basis, matrix, relation, ht, ord) + @goto End + end + end + end + + @label End + + basis_standardize!(ring, new_basis, ht, ord, params.arithmetic) + + linbasis = extract_linear_basis(ring, matrix) + new_basis, linbasis, ht +end diff --git a/src/fglm/linear.jl b/src/fglm/linear.jl index 32f7141b..eca2d398 100644 --- a/src/fglm/linear.jl +++ b/src/fglm/linear.jl @@ -1,18 +1,16 @@ # This file is a part of Groebner.jl. License is GNU GPL v2. +# Left part -- linear combinations of residuals (normal forms) +# Right part -- linear combinations of monomials (preimages) mutable struct WideMacaulayMatrix{C} - # pivot -> row leftpivs::Vector{Vector{Int}} leftcoeffs::Vector{Vector{C}} - # idx -> row rightrows::Vector{Vector{Int}} rightcoeffs::Vector{Vector{C}} - # pivot -> idx pivot2idx::Vector{Int} - # lefthash2col::Dict{Int, Int} leftcolumn_to_monom::Vector{Int} @@ -70,7 +68,7 @@ function wide_matrix_initialize(basis::Basis{C}) where {C <: Coeff} ) end -function convert_to_wide_dense_row(matrix, monom, vector::Basis{C}, ht) where {C <: Coeff} +function convert_to_wide_dense_row(matrix, monom, vector::Basis{C}) where {C <: Coeff} if matrix.nrcols >= length(matrix.rightcolumn_to_monom) resize!(matrix.rightcolumn_to_monom, 2 * length(matrix.rightcolumn_to_monom)) end @@ -113,9 +111,6 @@ function reduce_by_pivot_simultaneous!( rightcfs, arithmetic ) where {T <: CoeffZp} - - # mul = -densecoeffs[i] - # actually.. not bad! mul = mod_p(divisor(arithmetic) - leftrow[leftexps[1]], arithmetic) @inbounds for j in 1:length(leftexps) @@ -131,8 +126,6 @@ function reduce_by_pivot_simultaneous!( mul end -# -# Finite field magic specialization function normalize_wide_row_sparse!( leftcfs::Vector{T}, rightcfs, @@ -149,8 +142,6 @@ function normalize_wide_row_sparse!( end end -# -# Finite field magic specialization function normalize_wide_row_sparse!( leftcfs::Vector{T}, rightcfs, @@ -158,7 +149,6 @@ function normalize_wide_row_sparse!( ) where {T <: CoeffQQ} pinv = inv(leftcfs[1]) @inbounds for i in 2:length(leftcfs) - # row[i] *= pinv leftcfs[i] = leftcfs[i] * pinv end @inbounds leftcfs[1] = one(leftcfs[1]) @@ -180,10 +170,6 @@ function reduce_by_pivot_simultaneous!( rightcfs, arithmetic ) where {T <: CoeffQQ} - - # mul = -densecoeffs[i] - # actually.. not bad! - mul = -leftrow[leftexps[1]] @inbounds for j in 1:length(leftexps) @@ -217,11 +203,6 @@ function reduce_wide_dense_row_by_known_pivots_sparse!( # new pivot index np = -1 - # if debug() - # # @warn "in reduce" matrix.nlcols matrix.nrcols matrix.leftpivs - # # @warn "hmm" leftrow - # end - for i in 1:(matrix.nlcols) # if row element zero - no reduction @@ -246,7 +227,7 @@ function reduce_wide_dense_row_by_known_pivots_sparse!( rightexps = rightrows[pivot2idx[i]] rightcfs = matrix.rightcoeffs[pivot2idx[i]] - mul = reduce_by_pivot_simultaneous!( + reduce_by_pivot_simultaneous!( leftrow, leftexps, leftcfs, @@ -275,7 +256,7 @@ function extract_sparse_row(row::Vector{C}, np, k) where {C} # where k - number of structural nonzeros in new reduced row, k > 0 j = 1 @inbounds for i in np:length(row) # from new pivot - @inbounds if row[i] != 0 + if row[i] != 0 newrow[j] = i newcfs[j] = row[i] j += 1 @@ -285,35 +266,19 @@ function extract_sparse_row(row::Vector{C}, np, k) where {C} newrow, newcfs, j - 1 end -function find_linear_relation!( - ring, +const _sparisty_factors = [] + +@timeit function find_linear_relation!( matrix::WideMacaulayMatrix, monom::MonomId, vector::Basis{C}, - ht, arithmetic ) where {C <: Coeff} - leftrow, rightrow = convert_to_wide_dense_row(matrix, monom, vector, ht) - - # if debug() - # # @warn "start" - # println(monom) - # println(vector.monoms, " ", vector.coeffs) - # println(leftrow) - # println(rightrow) - # println(matrix) - # end + leftrow, rightrow = convert_to_wide_dense_row(matrix, monom, vector) reduced, np, k = reduce_wide_dense_row_by_known_pivots_sparse!(matrix, leftrow, rightrow, arithmetic) - # if debug() - # # @warn "reduced" - # println(reduced, " ", np, " ", k) - # println(leftrow) - # println(rightrow) - # end - if reduced # pass else @@ -322,12 +287,6 @@ function find_linear_relation!( normalize_wide_row_sparse!(lcoeffs, rcoeffs, arithmetic) - # if debug() - # # @warn "extracted" - # println(lexps, " ", lcoeffs) - # println(rexps, " ", rcoeffs) - # end - while np >= matrix.nlsize matrix.nlsize *= 2 resize!(matrix.leftpivs, matrix.nlsize) @@ -349,6 +308,39 @@ function find_linear_relation!( matrix.rightcoeffs[matrix.nrrows] = rcoeffs end + nnz_left = 0 + nnz_left_rows = 0 + for i in 1:(matrix.nlsize) + if isassigned(matrix.leftpivs, i) + nnz_left += length(matrix.leftpivs[i]) + nnz_left_rows += 1 + end + end + nnz_right = 0 + nnz_right_rows = 0 + for i in 1:(matrix.nrsize) + if isassigned(matrix.rightrows, i) + nnz_right += length(matrix.rightrows[i]) + nnz_right_rows += 1 + end + end + dimsleft = (nnz_left_rows, matrix.nlcols) + dimsright = (nnz_right_rows, matrix.nrcols) + spleft = nnz_left / prod(dimsleft) + spright = nnz_right / prod(dimsright) + + push!( + _sparisty_factors, + ( + nnz_left=nnz_left, + dimsleft=dimsleft, + spleft=spleft, + nnz_right=nnz_right, + dimsright=dimsright, + spright=spright + ) + ) + return reduced, rightrow end diff --git a/src/input-output/input-output.jl b/src/input-output/input-output.jl index 3216e7ff..cdd1c44b 100644 --- a/src/input-output/input-output.jl +++ b/src/input-output/input-output.jl @@ -222,7 +222,15 @@ function io_get_tight_unsigned_int_type(x::T) where {T <: Integer} end end -function io_select_coefftype(char, npolys, nvars, ordering, kws, hint) +function io_select_coefftype( + char, + npolys, + nvars, + ordering, + kws, + hint; + using_wide_type_for_coeffs=false +) @log level = -1 """ Selecting coefficient representation. Given hint hint=$hint. Keyword argument arithmetic=$(kws.arithmetic)""" @@ -239,7 +247,6 @@ function io_select_coefftype(char, npolys, nvars, ordering, kws, hint) ) end - using_wide_type_for_coeffs = true tight_signed_type = io_get_tight_signed_int_type(char) # If the requested arithmetic requires a signed representation @@ -256,11 +263,13 @@ function io_select_coefftype(char, npolys, nvars, ordering, kws, hint) end tight_unsigned_type = io_get_tight_unsigned_int_type(char) - if !using_wide_type_for_coeffs + tight_unsigned_type = if !using_wide_type_for_coeffs tight_unsigned_type else widen(tight_unsigned_type) - end, using_wide_type_for_coeffs + end + + tight_unsigned_type, using_wide_type_for_coeffs end ### diff --git a/src/interface.jl b/src/interface.jl index 41487a07..7ac1cd54 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -497,3 +497,30 @@ function fglm( result end + +""" + fglm(basis, ordering_from, ordering_to; options...) + +Converts a Groebner basis from one monomial ordering to another. +""" +function fglm_residuals_in_batch( + basis::AbstractVector, + ordering_from::AbstractMonomialOrdering, + ordering_to::AbstractMonomialOrdering; + options... +) + Base.require_one_based_indexing(basis) + + keywords = KeywordsHandler(:fglm, options) + + logging_setup(keywords) + statistics_setup(keywords) + + result = + _fglm_residuals_in_batch(basis, ordering_from, ordering_to, keywords)::typeof(basis) + + performance_counters_print(keywords) + statistics_print(keywords) + + result +end diff --git a/src/monomials/exponentvector.jl b/src/monomials/exponentvector.jl index ef226b91..2876710e 100644 --- a/src/monomials/exponentvector.jl +++ b/src/monomials/exponentvector.jl @@ -85,35 +85,27 @@ function monom_is_supported_ordering(::Type{ExponentVector{T}}, ::O) where {T, O true end -# DegRevLex monomial comparison +# DegRevLex monomial comparison function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_DegRevLex{true}) @invariant length(ea) == length(eb) @invariant length(ea) > 1 - if @inbounds monom_totaldeg(ea) < monom_totaldeg(eb) + if monom_totaldeg(ea) < monom_totaldeg(eb) return true - elseif @inbounds monom_totaldeg(ea) != monom_totaldeg(eb) + elseif monom_totaldeg(ea) != monom_totaldeg(eb) return false end - i = length(ea) - @inbounds while i > 2 && ea[i] == eb[i] - i -= 1 - end - @inbounds if ea[i] <= eb[i] - return false - else - return true - end + _vec_cmp_revlex(ea, eb) end -# DegRevLex monomial comparison, but on a range of variables +# DegRevLex monomial comparison (shuffled variables) function monom_isless( ea::ExponentVector{T}, eb::ExponentVector{T}, ord::_DegRevLex{false} ) where {T} - indices = variable_indices(ord) @invariant length(ea) == length(eb) @invariant length(ea) > 1 + indices = variable_indices(ord) eatotaldeg = zero(T) ebtotaldeg = zero(T) @inbounds for i in indices @@ -136,30 +128,27 @@ function monom_isless( end end -# DegLex exponent vector comparison +# DegLex monomial comparison function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_DegLex{true}) @invariant length(ea) == length(eb) @invariant length(ea) > 1 - if @inbounds monom_totaldeg(ea) < monom_totaldeg(eb) + if monom_totaldeg(ea) < monom_totaldeg(eb) return true - elseif @inbounds monom_totaldeg(ea) != monom_totaldeg(eb) + elseif monom_totaldeg(ea) != monom_totaldeg(eb) return false end - i = 2 - @inbounds while i < length(ea) && ea[i] == eb[i] - i += 1 - end - @inbounds return ea[i] < eb[i] ? true : false + _vec_cmp_lex(ea, eb) end +# DegLex monomial comparison (shuffled variables) function monom_isless( ea::ExponentVector{T}, eb::ExponentVector{T}, ord::_DegLex{false} ) where {T} - indices = variable_indices(ord) @invariant length(ea) == length(eb) @invariant length(ea) > 1 + indices = variable_indices(ord) eatotaldeg = zero(T) ebtotaldeg = zero(T) @inbounds for i in indices @@ -182,14 +171,10 @@ end function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_Lex{true}) @invariant length(ea) == length(eb) @invariant length(ea) > 1 - i = 2 - @inbounds while i < length(ea) && ea[i] == eb[i] - i += 1 - end - @inbounds return ea[i] < eb[i] ? true : false + _vec_cmp_lex(ea, eb) end -# Lex monomial comparison +# Lex monomial comparison (shuffled variables) function monom_isless(ea::ExponentVector, eb::ExponentVector, ord::_Lex{false}) indices = variable_indices(ord) @invariant length(ea) == length(eb) @@ -264,11 +249,12 @@ function monom_lcm!( eb::ExponentVector{T} ) where {T} @invariant length(ec) == length(ea) == length(eb) - @inbounds ec[1] = zero(T) + s = zero(T) @inbounds for i in 2:length(ec) ec[i] = max(ea[i], eb[i]) - ec[1] += ec[i] + s += ec[i] end + ec[1] = s _monom_overflow_check(ec) ec end @@ -276,12 +262,7 @@ end # Checks if the gcd of monomials is constant. function monom_is_gcd_const(ea::ExponentVector{T}, eb::ExponentVector{T}) where {T} @invariant length(ea) == length(eb) - @inbounds for i in 2:length(ea) - if !iszero(ea[i]) && !iszero(eb[i]) - return false - end - end - true + _vec_check_orth(ea, eb) end # Returns the product of monomials. Also writes the result to ec. @@ -315,12 +296,7 @@ end # Checks monomial divisibility function monom_is_divisible(ea::ExponentVector{T}, eb::ExponentVector{T}) where {T} @invariant length(ea) == length(eb) - @inbounds for j in 1:length(ea) - if ea[j] < eb[j] - return false - end - end - true + _vec_not_any_lt(ea, eb) end # Checks monomial divisibility, AND performs division @@ -330,12 +306,8 @@ function monom_is_divisible!( eb::ExponentVector{T} ) where {T} @invariant length(ec) == length(ea) == length(eb) - @inbounds for j in 1:length(ec) - if ea[j] < eb[j] - return false, ec - end - ec[j] = ea[j] - eb[j] - end + !monom_is_divisible(ea, eb) && return false, ec + monom_division!(ec, ea, eb) true, ec end @@ -360,6 +332,7 @@ function monom_create_divmask( res = zero(Mask) o = one(Mask) if !compressed + # if the number of variables is <= 32 @inbounds for i in 1:ndivvars for _ in 1:ndivbits if e[i + 1] >= divmap[ctr] @@ -369,6 +342,7 @@ function monom_create_divmask( end end else + # if the number of variables is > 32 @invariant ndivbits == 1 @invariant length(divmap) == 8 * sizeof(Mask) @inbounds for i in 1:length(divmap) diff --git a/src/utils/invariants.jl b/src/utils/invariants.jl index a7f64c4f..0cae07c3 100644 --- a/src/utils/invariants.jl +++ b/src/utils/invariants.jl @@ -35,7 +35,7 @@ macro unreachable() end ### -# DANGER!!! DANGER!!! DANGER!!! +# DANGER """ unsafe_unreachable() --> Nothing diff --git a/src/utils/keywords.jl b/src/utils/keywords.jl index 44501807..d66aa49e 100644 --- a/src/utils/keywords.jl +++ b/src/utils/keywords.jl @@ -54,6 +54,7 @@ const _supported_kw_args = ( fglm = ( check = false, monoms = :dense, + statistics = :no, loglevel = _default_loglevel, ), groebner_learn = ( @@ -181,23 +182,6 @@ struct KeywordsHandler{Ord} Possible choices for keyword "statistics" are: `:no`, `:timings`, `:stats`, `:all`""" - # @log level = -2 """ - # In $function_key, using keywords: - # reduced = $reduced, - # ordering = $ordering, - # certify = $certify, - # linalg = $linalg, - # monoms = $monoms, - # seed = $seed, - # loglevel = $loglevel, - # maxpairs = $maxpairs, - # selection = $selection, - # modular = $modular, - # check = $check, - # sweep = $sweep - # homogenize = $homogenize - # statistics = $statistics""" - new{typeof(ordering)}( reduced, ordering, @@ -227,6 +211,7 @@ function logging_setup(keywords::KeywordsHandler) end function statistics_setup(keywords::KeywordsHandler) + log_simdinfo() if keywords.loglevel <= 0 performance_counters_refresh() statistics_refresh() diff --git a/src/utils/simd.jl b/src/utils/simd.jl new file mode 100644 index 00000000..c13dbc50 --- /dev/null +++ b/src/utils/simd.jl @@ -0,0 +1,460 @@ +# This file is a part of Groebner.jl. License is GNU GPL v2. + +### +# This file provides some very fast operations on vectors of integers. + +# NOTE. The use of these functions must be limited. +# +# Debugging LLVM IR is a headache. The LLVM IR textual format is not guaranteed +# to be stable. +# +# At this moment, we are grudgingly writing the LLVM IR, and we hope that the +# bright future where the compiler does this for us is well within our grasp. + +# Functions in this file take their inspiration from +# https://github.com/SciML/FindFirstFunctions.jl + +const BitInteger = Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8} + +function log_simdinfo() + @log level = -1 """ + CPU: $(Symbol(cpu_name())) + ------------------------ + fma_fast: \t$(Bool(fma_fast())) + registers:\t$(Int(register_count())) + simd width:\t<$(Int(pick_vector_width(Int32))) x i32> + """ + nothing +end + +j_to_llvm_t(_::Type{T}) where {T <: BitInteger} = "i$(8*sizeof(T))" +typemax_saturated(_::Type{T}, N) where {T <: BitInteger} = typemax(T) ⊻ (N - 1) + +# Same as `HostCPUFeatures.pick_vector_width`, +# but also makes sure that N \in {8, 16, 32, 64}. +# If this is not possible, returns N = 1. +function cutoff8_pick_vector_width(::Type{T}) where {T} + N = pick_vector_width(T) + if N in (8, 16, 32, 64) + return Int(N) + end + 1 +end + +# Vector functions in this file assume that +# - input vectors are non-negative. +# - input vectors have the same length. + +# Returns false if a[i] < b[i] for ANY index i, and true otherwise. +@inline @generated function _vec_not_any_lt( + a::Vector{T}, + b::Vector{T}, + offset::Int=1 +) where {T <: BitInteger} + N = cutoff8_pick_vector_width(T) + + # Unfortunate case. Default to scalar code. + if N == 1 + return quote + @inbounds for j in 1:length(a) + if a[j] < b[j] + return false + end + end + return true + end + end + + # The case when IntN exists. + @assert N in (8, 16, 32, 64) + B = sizeof(T) + llvm_t = j_to_llvm_t(T) + mask = typemax_saturated(Int, N) + textir = """ + define i8 @entry(i64 %0, i64 %1, i64 %2) #0 { + top: + %a = inttoptr i64 %0 to $llvm_t* + %b = inttoptr i64 %1 to $llvm_t* + %lenm$(N-1) = add nsw i64 %2, -$(N-1) + %dosimditer = icmp ugt i64 %2, $(N-1) + br i1 %dosimditer, label %L9.lr.ph, label %L32 + + L9.lr.ph: + %len$N = and i64 %2, $mask ; divisible by N + br label %L9 + + L9: + %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ] + %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i + %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i + %avi = bitcast $llvm_t* %api to <$N x $llvm_t>* + %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>* + %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B + %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B + %mask = icmp ult <$N x $llvm_t> %ai, %bi + %compressed = bitcast <$N x i1> %mask to i$N + %matchnotfound = icmp eq i$N %compressed, 0 + br i1 %matchnotfound, label %L30, label %common.ret + + common.ret: + %retval = phi i8 [ 0, %L9 ], [ 1, %L32 ], [ 0, %L51 ], [ 1, %L67 ] + ret i8 %retval + + L30: + %vinc = add nuw nsw i64 %i, $N + %continue = icmp slt i64 %vinc, %lenm$(N-1) + br i1 %continue, label %L9, label %L32 + + L32: + %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ] + %done = icmp eq i64 %cumi, %2 + br i1 %done, label %common.ret, label %L51 + + L51: + %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ] + %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si + %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si + %savi = load $llvm_t, $llvm_t* %sapi, align $B + %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B + %match = icmp ult $llvm_t %savi, %sbvi + br i1 %match, label %common.ret, label %L67 + + L67: + %inc = add i64 %si, 1 + %dobreak = icmp eq i64 %inc, %2 + br i1 %dobreak, label %common.ret, label %L51 + } + attributes #0 = { alwaysinline } + """ + quote + GC.@preserve a b begin + Base.llvmcall( + ($textir, "entry"), + Bool, + Tuple{Ptr{T}, Ptr{T}, Int64}, + Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset, + Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset, + length(a) - offset + ) + end + end +end + +# Returns true if a and b are orthogonal, and false otherwise. +@inline @generated function _vec_check_orth( + a::Vector{T}, + b::Vector{T}, + offset::Int=1 +) where {T <: BitInteger} + N = cutoff8_pick_vector_width(T) + + # Unfortunate case. Default to scalar code. + if N == 1 + return quote + @inbounds for j in 1:length(a) + if !iszero(a[j]) && !iszero(b[j]) + return false + end + end + return true + end + end + + # The case when IntN exists. + @assert N in (8, 16, 32, 64) + B = sizeof(T) + llvm_t = j_to_llvm_t(T) + mask = typemax_saturated(Int, N) + textir = """ + define i8 @entry(i64 %0, i64 %1, i64 %2) #0 { + top: + %a = inttoptr i64 %0 to $llvm_t* + %b = inttoptr i64 %1 to $llvm_t* + %lenm$(N-1) = add nsw i64 %2, -$(N-1) + %dosimditer = icmp ugt i64 %2, $(N-1) + br i1 %dosimditer, label %L9.lr.ph, label %L32 + + L9.lr.ph: + %len$N = and i64 %2, $mask ; divisible by N + br label %L9 + + L9: + %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ] + %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i + %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i + %avi = bitcast $llvm_t* %api to <$N x $llvm_t>* + %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>* + %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B + %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B + %mask1 = icmp ne <$N x $llvm_t> %ai, zeroinitializer + %mask2 = icmp ne <$N x $llvm_t> %bi, zeroinitializer + %mask3 = and <$N x i1> %mask1, %mask2 + %compressed = bitcast <$N x i1> %mask3 to i$N + %matchnotfound1 = icmp eq i$N %compressed, 0 + br i1 %matchnotfound1, label %L30, label %common.ret + + common.ret: + %retval = phi i8 [ 0, %L9 ], [ 1, %L32 ], [ 0, %L51 ], [ 1, %L67 ] + ret i8 %retval + + L30: + %vinc = add nuw nsw i64 %i, $N + %continue = icmp slt i64 %vinc, %lenm$(N-1) + br i1 %continue, label %L9, label %L32 + + L32: + %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ] + %done = icmp eq i64 %cumi, %2 + br i1 %done, label %common.ret, label %L51 + + L51: + %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ] + %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si + %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si + %savi = load $llvm_t, $llvm_t* %sapi, align $B + %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B + %mask11 = icmp ne $llvm_t %savi, 0 + %mask12 = icmp ne $llvm_t %sbvi, 0 + %mask13 = and i1 %mask11, %mask12 + %matchnotfound2 = icmp eq i1 %mask13, 0 + br i1 %matchnotfound2, label %L67, label %common.ret + + L67: + %inc = add i64 %si, 1 + %dobreak = icmp eq i64 %inc, %2 + br i1 %dobreak, label %common.ret, label %L51 + } + attributes #0 = { alwaysinline } + """ + quote + GC.@preserve a b begin + Base.llvmcall( + ($textir, "entry"), + Bool, + Tuple{Ptr{T}, Ptr{T}, Int64}, + Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset, + Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset, + length(a) - offset + ) + end + end +end + +# Returns true if a < b lexicographically, and false otherwise. +@inline @generated function _vec_cmp_lex( + a::Vector{T}, + b::Vector{T}, + offset::Int=1 +) where {T <: BitInteger} + N = cutoff8_pick_vector_width(T) + + # Unfortunate case. Default to scalar code. + if N == 1 + return quote + i = 1 + offset + @inbounds while i < length(a) && a[i] == b[i] + i += 1 + end + @inbounds return a[i] < b[i] + end + end + + # The case when IntN exists. + @assert N in (8, 16, 32, 64) + B = sizeof(T) + llvm_t = j_to_llvm_t(T) + mask = typemax_saturated(Int, N) + textir = """ + declare i$N @llvm.cttz.i$N(i$N, i1); + define i8 @entry(i64 %0, i64 %1, i64 %2) #0 { + top: + %a = inttoptr i64 %0 to $llvm_t* + %b = inttoptr i64 %1 to $llvm_t* + %lenm$(N-1) = add nsw i64 %2, -$(N-1) + %dosimditer = icmp ugt i64 %2, $(N-1) + br i1 %dosimditer, label %L9.lr.ph, label %L32 + + L9.lr.ph: + %len$N = and i64 %2, $mask ; divisible by N + br label %L9 + + L9: + %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ] + %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i + %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i + %avi = bitcast $llvm_t* %api to <$N x $llvm_t>* + %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>* + %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B + %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B + %mask = icmp ne <$N x $llvm_t> %ai, %bi + %compressed = bitcast <$N x i1> %mask to i$N + %matchfnotound = icmp eq i$N %compressed, 0 + br i1 %matchfnotound, label %L30, label %L17 + + L17: + %tz$N = call i$N @llvm.cttz.i$N(i$N %compressed, i1 true) + %tz64 = zext i$N %tz$N to i64 + %vis = add nuw i64 %i, %tz64 + %sapi2 = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %vis + %sbpi2 = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %vis + %savi2 = load $llvm_t, $llvm_t* %sapi2, align $B + %sbvi2 = load $llvm_t, $llvm_t* %sbpi2, align $B + %flag1 = icmp ult $llvm_t %savi2, %sbvi2 + br label %common.ret + + common.ret: + %retflag = phi i1 [ %flag1, %L17 ], [ 0, %L32 ], [ %flag2, %L51 ], [ 0, %L67 ] + %retval = zext i1 %retflag to i8 + ret i8 %retval + + L30: + %vinc = add nuw nsw i64 %i, $N + %continue = icmp slt i64 %vinc, %lenm$(N-1) + br i1 %continue, label %L9, label %L32 + + L32: + %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ] + %done = icmp eq i64 %cumi, %2 + br i1 %done, label %common.ret, label %L51 + + L51: + %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ] + %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si + %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si + %savi = load $llvm_t, $llvm_t* %sapi, align $B + %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B + %match = icmp eq $llvm_t %savi, %sbvi + %flag2 = icmp ult $llvm_t %savi, %sbvi + br i1 %match, label %L67, label %common.ret + + L67: + %inc = add i64 %si, 1 + %dobreak = icmp eq i64 %inc, %2 + br i1 %dobreak, label %common.ret, label %L51 + } + attributes #0 = { alwaysinline } + """ + quote + GC.@preserve a b begin + Base.llvmcall( + ($textir, "entry"), + Bool, + Tuple{Ptr{T}, Ptr{T}, Int64}, + Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset, + Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset, + length(a) - offset + ) + end + end +end + +# Returns true if a < b reversed lexicographically, and false otherwise. +@inline @generated function _vec_cmp_revlex( + a::Vector{T}, + b::Vector{T}, + offset::Int=1 +) where {T <: BitInteger} + N = cutoff8_pick_vector_width(T) + + # Unfortunate case. Default to scalar code. + if N == 1 + return quote + i = length(a) + @inbounds while i > 1 + offset && a[i] == b[i] + i -= 1 + end + @inbounds return a[i] > b[i] + end + end + + # The case when IntN exists. + @assert N in (8, 16, 32, 64) + B = sizeof(T) + llvm_t = j_to_llvm_t(T) + mask = typemax_saturated(Int, N) + textir = """ + declare i$N @llvm.ctlz.i$N(i$N, i1); + define i8 @entry(i64 %0, i64 %1, i64 %2) #0 { + top: + %a = inttoptr i64 %0 to $llvm_t* + %b = inttoptr i64 %1 to $llvm_t* + %lenm$N = add nsw i64 %2, -$N + %lenm1 = add nsw i64 %2, -1 + %dosimditer = icmp sge i64 %2, $N + br i1 %dosimditer, label %L9.lr.ph, label %L32 + + L9.lr.ph: + %len$N = and i64 %2, $mask ; divisible by N + %lenmlen$N = sub nsw i64 %lenm1, %len$N + br label %L9 + + L9: + %i = phi i64 [ %lenm$N, %L9.lr.ph ], [ %vdec, %L30 ] + %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i + %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i + %avi = bitcast $llvm_t* %api to <$N x $llvm_t>* + %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>* + %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B + %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B + %mask = icmp ne <$N x $llvm_t> %ai, %bi + %compressed = bitcast <$N x i1> %mask to i$N + %matchfnotound = icmp eq i$N %compressed, 0 + br i1 %matchfnotound, label %L30, label %L17 + + L17: + %tz$N = call i$N @llvm.ctlz.i$N(i$N %compressed, i1 true) + %tz$N.rev = sub i$N $(N-1), %tz$N + %tz64 = zext i$N %tz$N.rev to i64 + %vis = add nsw i64 %i, %tz64 + %sapi2 = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %vis + %sbpi2 = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %vis + %savi2 = load $llvm_t, $llvm_t* %sapi2, align $B + %sbvi2 = load $llvm_t, $llvm_t* %sbpi2, align $B + %flag1 = icmp ugt $llvm_t %savi2, %sbvi2 + br label %common.ret + + common.ret: + %retflag = phi i1 [ %flag1, %L17 ], [ 0, %L32 ], [ %flag2, %L51 ], [ 0, %L67 ] + %retval = zext i1 %retflag to i8 + ret i8 %retval + + L30: + %vdec = sub nsw i64 %i, $N + %continuesimd = icmp sge i64 %vdec, 0 + br i1 %continuesimd, label %L9, label %L32 + + L32: + %cumi = phi i64 [ %lenm1, %top ], [ %lenmlen$N, %L30 ] + %done = icmp eq i64 %cumi, -1 + br i1 %done, label %common.ret, label %L51 + + L51: + %si = phi i64 [ %dec, %L67 ], [ %cumi, %L32 ] + %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si + %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si + %savi = load $llvm_t, $llvm_t* %sapi, align $B + %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B + %match = icmp eq $llvm_t %savi, %sbvi + %flag2 = icmp ugt $llvm_t %savi, %sbvi + br i1 %match, label %L67, label %common.ret + + L67: + %dec = sub nsw i64 %si, 1 + %dobreak = icmp eq i64 %dec, -1 + br i1 %dobreak, label %common.ret, label %L51 + } + attributes #0 = { alwaysinline } + """ + quote + GC.@preserve a b begin + Base.llvmcall( + ($textir, "entry"), + Bool, + Tuple{Ptr{T}, Ptr{T}, Int64}, + Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset, + Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset, + length(a) - offset + ) + end + end +end diff --git a/test/arithmetic/Zp.jl b/test/arithmetic/Zp.jl index fa3122e1..359eba83 100644 --- a/test/arithmetic/Zp.jl +++ b/test/arithmetic/Zp.jl @@ -1,3 +1,4 @@ +import Primes @testset "arithmetic in Zp" begin m = UInt64(2^30 + 3) @@ -16,19 +17,23 @@ @test Groebner.n_spare_bits(a) == 2 @test Groebner.n_safe_consecutive_additions(a) == 8 - A, C = UInt64, UInt32 - modulo = C(2^31 - 1) - for arithmetic in [ - Groebner.ArithmeticZp(A, C, modulo), - Groebner.DelayedArithmeticZp(A, C, modulo), - Groebner.SignedArithmeticZp(signed(A), signed(C), signed(modulo)), - Groebner.SpecializedArithmeticZp(A, C, modulo) - ] - for _ in 1:1000 - T = typeof(Groebner.divisor(arithmetic)) - x = rand(T) - @test Groebner.mod_p(T(x), arithmetic) == Base.mod(x, T(modulo)) - @test Groebner.inv_mod_p(T(x), arithmetic) == Base.invmod(T(x), T(modulo)) + for (A, C) in [(UInt64, UInt32), (UInt32, UInt16), (UInt16, UInt8)] + modulo = Primes.prevprime(typemax(C) >> 2) + for arithmetic in [ + Groebner.ArithmeticZp(A, C, modulo), + Groebner.DelayedArithmeticZp(A, C, modulo), + Groebner.SignedArithmeticZp(signed(A), signed(C), signed(modulo)), + Groebner.SpecializedArithmeticZp(A, C, modulo) + ] + for _ in 1:1000 + T = typeof(Groebner.divisor(arithmetic)) + x = abs(rand(T)) + @test Groebner.mod_p(T(x), arithmetic) == Base.mod(x, T(modulo)) + if gcd(modulo, T(x)) == 1 + @test Groebner.inv_mod_p(T(x), arithmetic) == + Base.invmod(T(x), T(modulo)) + end + end end end end diff --git a/test/monoms/monom_orders.jl b/test/monoms/monom_orders.jl index f7de5f39..86ff1169 100644 --- a/test/monoms/monom_orders.jl +++ b/test/monoms/monom_orders.jl @@ -13,8 +13,35 @@ implementations_to_test = [ ] @testset "monom orders: Lex, DegLex, DegRevLex" begin - for T in (UInt64, UInt32, UInt16) + x = [1, 1, 1, 1, 1, 0, 1, 1, 0, 1] + y = [1, 1, 1, 1, 1, 0, 1, 1, 0, 1] + a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, x) + b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, y) + @test !lex(a, b) && !dl(a, b) && !drl(a, b) + + x = [1, 1, 1, 1, 1, 0, 1, 2, 0, 1] + y = [1, 1, 1, 1, 1, 0, 2, 1, 0, 1] + a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, x) + b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, y) + @test lex(a, b) && dl(a, b) && drl(a, b) + + n = 12 + for i in 2:n + xx = ones(Int32, n) + xx[i] = xx[i] + 1 + yy = ones(Int32, n) + yy[i - 1] = yy[i - 1] + 1 + a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, xx) + b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, yy) + @test lex(a, b) && dl(a, b) && drl(a, b) + end + + for T in (UInt64, UInt32, UInt16, UInt8) for EV in implementations_to_test + if EV{T} <: Groebner.AbstractPackedTuple{T, UInt8} && T == UInt8 + continue + end + a = monom_construct_from_vector(EV{T}, [0]) b = monom_construct_from_vector(EV{T}, [0]) @test !lex(a, b) @@ -74,11 +101,9 @@ implementations_to_test = [ @test dl(a, b) @test drl(a, b) - 5 > Groebner.monom_max_vars(EV{T}) && continue - a = monom_construct_from_vector(EV{T}, ones(UInt, 5)) - b = monom_construct_from_vector(EV{T}, ones(UInt, 5)) - @test !lex(a, b) - @test !dl(a, b) + 3 > Groebner.monom_max_vars(EV{T}) && continue + a = monom_construct_from_vector(EV{T}, [1, 5, 0]) + b = monom_construct_from_vector(EV{T}, [1, 0, 1]) @test !drl(a, b) 25 > Groebner.monom_max_vars(EV{T}) && continue @@ -98,7 +123,7 @@ implementations_to_test = [ # test that different implementations agree for n in 1:10 - k = rand(1:100) + k = rand(1:60) implementations_to_test_local = filter(xxx -> Groebner.monom_max_vars(xxx{T}) >= k, implementations_to_test) @@ -195,7 +220,7 @@ end end function test_orderings(n, v1, v2, ords_to_test) - R, x = QQ[["x$i" for i in 1:n]...] + R, x = AbstractAlgebra.QQ[["x$i" for i in 1:n]...] var_to_index = Dict(x .=> 1:n) for wo in ords_to_test ord = wo.ord @@ -257,7 +282,7 @@ end @testset "monom orders: ProductOrdering" begin pv = Groebner.ExponentVector{T} where {T} - R, x = QQ[["x$i" for i in 1:3]...] + R, x = AbstractAlgebra.QQ[["x$i" for i in 1:3]...] v1 = Groebner.monom_construct_from_vector(pv{UInt64}, [1, 2, 3]) v2 = Groebner.monom_construct_from_vector(pv{UInt64}, [3, 2, 1]) @@ -274,7 +299,7 @@ end test_orderings(3, v1, v2, ords_to_test) - R, x = QQ[["x$i" for i in 1:6]...] + R, x = AbstractAlgebra.QQ[["x$i" for i in 1:6]...] v1 = Groebner.monom_construct_from_vector(pv{UInt64}, [4, 1, 7, 0, 9, 8]) v2 = Groebner.monom_construct_from_vector(pv{UInt64}, [1, 6, 3, 5, 9, 100]) diff --git a/test/reconstruction/ratrec.jl b/test/reconstruction/ratrec.jl index a577dc59..7f038d61 100644 --- a/test/reconstruction/ratrec.jl +++ b/test/reconstruction/ratrec.jl @@ -1,32 +1,17 @@ -using Primes +import Primes @testset "rational reconstruction" begin - # these are just small sanity checks moduli = BigInt[107, 199, 509, 2^31 - 1, nextprime(BigInt(2)^100), nextprime(BigInt(2)^200)] numbers = [QQ(0), QQ(1), QQ(1, 2), QQ(3, 2), QQ(4, 5), QQ(1, 7), QQ(6)] - num, den, buf, buf1, buf2, buf3, u1, u2, u3, v1, v2, v3 = [BigInt(0) for _ in 1:13] for m in moduli bnd = Groebner.ratrec_reconstruction_bound(m) for a in numbers - ac = numerator(a) * invmod(denominator(a), m) - ar1 = Groebner.ratrec!( - num, - den, - bnd, - buf, - buf1, - buf2, - buf3, - u1, - u2, - u3, - v1, - v2, - v3, - ac, - m + ac = mod(numerator(a) * invmod(denominator(a), m), m) + success, (num, den) = Groebner.ratrec_nemo( + Groebner.Nemo.ZZRingElem(ac), + Groebner.Nemo.ZZRingElem(m) ) @test Base.unsafe_rational(num, den) == a end @@ -44,24 +29,12 @@ using Primes bnd = Groebner.ratrec_reconstruction_bound(m) for i in 1:samples a = BigInt(rand(0:(m - 1))) - success = Groebner.ratrec!( - num, - den, - bnd, - buf, - buf1, - buf2, - buf3, - u1, - u2, - u3, - v1, - v2, - v3, - a, - m + ac = mod(numerator(a) * invmod(denominator(a), m), m) + success, (num, den) = Groebner.ratrec_nemo( + Groebner.Nemo.ZZRingElem(ac), + Groebner.Nemo.ZZRingElem(m) ) - if a < sqrt(m) + if a < sqrt(div(m, 2)) @test success end if success