upd fglm benchmarks

sumiya11 · Jan 26, 2024 · 6dd8183 · 6dd8183
1 parent d42f0d8
commit 6dd8183
Show file tree

Hide file tree

Showing 25 changed files with 1,281 additions and 264 deletions.
diff --git a/Project.toml b/Project.toml
@@ -8,6 +8,7 @@ AbstractAlgebra = "c3fe647b-3220-5bb0-a1ea-a7954cac585d"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+HostCPUFeatures = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MultivariatePolynomials = "102ac46a-7ee4-5c85-9060-abc95bfdeaa3"
 Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"

diff --git a/benchmark/CI-scripts/run_benchmarks.jl b/benchmark/CI-scripts/run_benchmarks.jl
@@ -67,6 +67,13 @@ push!(
         result=compute_gb(Groebner.katsuran(10, ordering=:degrevlex, k=GF(2^27 + 29)), 5)
     )
 )
+# Benchmark case: katsura-11 in degrevlex over GF(2^30 + 3), appended to `suite` as a
+# named tuple (`problem_name`, `result`). The trailing `3` is forwarded to `compute_gb`
+# (presumably a repetition/sample count — confirm against its definition).
+push!(
+    suite,
+    (
+        problem_name="groebner, AA, GF(2^30+3), katsura 11",
+        result=compute_gb(Groebner.katsuran(11, ordering=:degrevlex, k=GF(2^30 + 3)), 3)
+    )
+)
 push!(
     suite,
     (

diff --git a/experimental/example-maybe-bug.jl b/experimental/example-maybe-bug.jl
@@ -11,11 +11,11 @@ end
 
 @info "" nthreads()
 @show ENV["JULIA_NUM_THREADS"]
-Groebner.logging_enabled() = false
+Groebner.logging_enabled() = true
 Groebner.invariants_enabled() = false
 Groebner.performance_counters_enabled() = false
 
-s = Groebner.katsuran(9, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3));
+s = Groebner.noonn(8, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3));
 trace, gb = Groebner.groebner_learn(s);
 @btime Groebner.groebner($s);
 @btime Groebner.groebner_apply!($trace, $s);
@@ -24,15 +24,15 @@ trace, gb = Groebner.groebner_learn(s);
 @btime Groebner.groebner_apply!($trace, $((s, s, s, s, s, s, s, s)));
 
 #=
-  113.912 ms (34041 allocations: 43.78 MiB)
+  1.048 s (418167 allocations: 258.02 MiB)
 
-  44.803 ms (18887 allocations: 24.46 MiB)
+  142.140 ms (126985 allocations: 87.63 MiB)
 
-  52.580 ms (20607 allocations: 35.38 MiB)
+  185.635 ms (134610 allocations: 134.37 MiB)
 
-  64.618 ms (23276 allocations: 59.42 MiB)
+  283.075 ms (146988 allocations: 224.29 MiB)
 
-  125.247 ms (28610 allocations: 107.47 MiB)
+  433.356 ms (171889 allocations: 404.00 MiB)
 =#
 @profview Groebner.groebner_apply!(trace, ((s, s, s, s)));
 

diff --git a/experimental/fglmtest.jl b/experimental/fglmtest.jl
@@ -0,0 +1,17 @@
+# Scratch script: for one system over QQ, compute a Groebner basis in lex directly and
+# in degrevlex, then feed the degrevlex basis to `Groebner.fglm` with the
+# (DegRevLex, Lex) ordering pair and log what comes back.
+using AbstractAlgebra, Groebner
+
+# Polynomial ring in x, y, z, t with rational coefficients.
+R, (x, y, z, t) = AbstractAlgebra.polynomial_ring(AbstractAlgebra.QQ, ["x", "y", "z", "t"])
+# Input system: four polynomials in the four ring variables.
+sys = [
+    y^2 * z + 2 * x * y * t - 2 * x - z,
+    -x^3 * z + 4 * x * y^2 * z + 4 * x^2 * y * t + 2 * y^3 * t + 4 * x^2 - 10 * y^2 +
+    4 * x * z - 10 * y * t + 2,
+    2 * y * z * t + x * t^2 - x - 2 * z,
+    -x * z^3 + 4 * y * z^2 * t + 4 * x * z * t^2 + 2 * y * t^3 + 4 * x * z + 4 * z^2 -
+    10 * y * t - 10 * t^2 + 2
+]
+
+# NOTE(review): `gb_lex` is computed but never used afterwards; it looks like the
+# reference that `gb_fglm` is meant to be compared against — confirm and add the check.
+gb_lex = Groebner.groebner(sys, ordering=Groebner.Lex())
+gb_drl = Groebner.groebner(sys, ordering=Groebner.DegRevLex())
+# Presumably converts the degrevlex basis into a lex basis — verify against `fglm` docs.
+gb_fglm = Groebner.fglm(gb_drl, Groebner.DegRevLex(), Groebner.Lex())
+
+# Logs the result under the key `gb_fglm` with an empty message.
+@info "" gb_fglm
diff --git a/experimental/lexcmp.jl b/experimental/lexcmp.jl
@@ -0,0 +1,217 @@
+using BenchmarkTools
+
+using HostCPUFeatures
+using HostCPUFeatures:
+    register_size,
+    pick_vector_width,
+    pick_vector_width_shift,
+    simd_integer_register_size,
+    fma_fast,
+    has_feature,
+    register_count,
+    cpu_name,
+    register_size
+
+#########
+
+# Return two independently sampled `UInt8` vectors of length `n`. Entries are drawn
+# from the pool (0, 0, 0, 1, 2, 3), so half of the pool is zero.
+function _setup1(n)
+    pool = UInt8.([0, 0, 0, 1, 2, 3])
+    first_vec = rand(pool, n)
+    second_vec = rand(pool, n)
+    return first_vec, second_vec
+end
+# Return two `UInt8` vectors of length `2n`: the first `n` entries are zero and the
+# last `n` are sampled from the pool (0, 0, 0, 1, 2, 3).
+function _setup2(n)
+    pool = UInt8.([0, 0, 0, 1, 2, 3])
+    lhs = [zeros(UInt8, n); rand(pool, n)]
+    rhs = [zeros(UInt8, n); rand(pool, n)]
+    return lhs, rhs
+end
+# Build a triple `(scratch, x, y)` of `Groebner.ExponentVector{T}` monomials.
+# `x` is constructed from `n` zeros followed by three random exponents from the pool
+# (0, 0, 0, 1, 2, 3); `y` uses the same three exponents in reverse order. The first
+# returned value is `similar(x)`, i.e. an output buffer of matching shape.
+function _setup3(T, n)
+    tail = rand(T.([0, 0, 0, 1, 2, 3]), 3)
+    MonomT = Groebner.ExponentVector{T}
+    x = Groebner.monom_construct_from_vector(MonomT, [zeros(T, n); tail])
+    y = Groebner.monom_construct_from_vector(MonomT, [zeros(T, n); reverse(tail)])
+    scratch = similar(x)
+    # Both monomials carry the same exponents, only permuted.
+    @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y)
+    return scratch, x, y
+end
+
+#########
+
+# Benchmark sweep over basic monomial operations on `Groebner.ExponentVector{Int8}`
+# inputs produced by `_setup3`. `n` (the number of leading zero exponents handed to
+# `_setup3`) starts at 1 and grows by `step`, which itself grows by 20% (rounded up)
+# per pass; the sweep stops once `n` reaches 500.
+# NOTE(review): this file imports only BenchmarkTools and HostCPUFeatures, so it
+# presumably relies on Groebner already being loaded in the session — confirm.
+begin
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        # Each `print` labels the timing line that the following `@btime` emits.
+        print("Groebner.monom_is_equal\t\t")
+        @btime Groebner.monom_is_equal(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_copy\t\t")
+        @btime Groebner.monom_copy(xx) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_is_divisible\t")
+        # NOTE(review): passes `$n` without `max(1, ...)` unlike the neighbouring
+        # setups; equivalent here because `n` never drops below 1.
+        @btime Groebner.monom_is_divisible(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, $n)
+        end
+        print("Groebner.monom_product!\t\t")
+        @btime Groebner.monom_product!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_lcm!\t\t")
+        @btime Groebner.monom_lcm!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_is_gcd_const\t")
+        # NOTE(review): same remark as above — `$n` without `max(1, ...)`.
+        @btime Groebner.monom_is_gcd_const(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, $n)
+        end
+        print("Groebner.monom_isless:lex\t")
+        @btime Groebner.monom_isless(xx, yy, _ord) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+            # Ordering object built from an all-ones vector of length `length(xx)`.
+            _ord = Groebner._Lex{true}(ones(Int, length(xx)))
+        end
+        print("Groebner.monom_isless:drl\t")
+        @btime Groebner.monom_isless(xx, yy, _ord) setup = begin
+            # Reverse all three monomials so the non-zero exponents come first, then
+            # copy the last slot back into slot 1 (presumably slot 1 stores the total
+            # degree in `ExponentVector` — TODO confirm). The assert below checks that
+            # the two inputs still agree on total degree.
+            cc, xx, yy = map(reverse, _setup3(Int8, max(1, $n)))
+            xx[1] = xx[end]
+            yy[1] = yy[end]
+            @assert Groebner.monom_totaldeg(xx) == Groebner.monom_totaldeg(yy)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(xx)))
+        end
+        # Geometric-ish growth of the problem size.
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
+
+# Build one pair of monomials in two representations and return
+# `(x, y, xpacked, ypacked)`.
+#
+# The exponent vectors have length `n + 2`: two random exponents from the pool
+# (0, 0, 0, 1, 2, 3) followed by `n` zeros for `x`, and the same two exponents swapped
+# for `y`. `x`/`y` are `Groebner.ExponentVector{UInt8}`; `xpacked`/`ypacked` use
+# `Groebner.PackedTuple1/2/3{UInt64, UInt8}`, chosen by the vector length
+# (< 8, < 16, < 24 respectively).
+#
+# Throws `ArgumentError` when `n + 2 >= 24`, because no packed type is selected for
+# that length (previously this surfaced as an opaque error from `nothing{...}`).
+function _setup4(n)
+    nvars = n + 2
+    PackedT = if nvars < 8
+        Groebner.PackedTuple1
+    elseif nvars < 16
+        Groebner.PackedTuple2
+    elseif nvars < 24
+        Groebner.PackedTuple3
+    else
+        throw(
+            ArgumentError(
+                "_setup4: no packed monomial type for n = $n (requires n + 2 < 24)"
+            )
+        )
+    end
+    s = rand(UInt8.([0, 0, 0, 1, 2, 3]), 2)
+    a, b = vcat(zeros(UInt8, n), s), vcat(zeros(UInt8, n), reverse(s))
+    # Put the non-zero exponents at the front of the vectors.
+    a, b = reverse(a), reverse(b)
+    x = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, a)
+    y = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, b)
+    xpacked = Groebner.monom_construct_from_vector(PackedT{UInt64, UInt8}, a)
+    ypacked = Groebner.monom_construct_from_vector(PackedT{UInt64, UInt8}, b)
+    return x, y, xpacked, ypacked
+end
+# Compare `Groebner.monom_isless` under a `_DegRevLex{true}` ordering for the packed
+# and the exponent-vector representation of the same monomials (built by `_setup4`).
+# `n` takes the values 1, 4, 7, ..., 19 (start 1, step 3, stop before 22).
+begin
+    n, step = 1, 3
+    while n < 22
+        @info "n = $n"
+        print("Groebner.monom_isless:drl:packed\t")
+        @btime Groebner.monom_isless(xpacked, ypacked, _ord) setup = begin
+            x, y, xpacked, ypacked = _setup4($n)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(x)))
+            @assert Groebner.monom_totaldeg(xpacked) == Groebner.monom_totaldeg(ypacked)
+            # Sanity check: both representations must unpack to the same vector.
+            # The buffers have `length(x) - 1` entries (presumably `x` carries one
+            # extra slot besides the exponents — TODO confirm).
+            tmp1, tmp2 =
+                Vector{Int8}(undef, length(x) - 1), Vector{Int8}(undef, length(x) - 1)
+            @assert Groebner.monom_to_vector!(tmp1, x) ==
+                    Groebner.monom_to_vector!(tmp2, xpacked)
+            @assert Groebner.monom_to_vector!(tmp1, y) ==
+                    Groebner.monom_to_vector!(tmp2, ypacked)
+        end
+        print("Groebner.monom_isless:drl:expvect\t")
+        @btime Groebner.monom_isless(x, y, _ord) setup = begin
+            x, y, xpacked, ypacked = _setup4($n)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(x)))
+            @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y)
+        end
+        n += step
+    end
+end
+
+# Cross-check and benchmark `vector_are_orth` against `_vec_check_orth` on random
+# `Int16` vectors of growing length (`n` starts at 1; the step grows by 20% per pass;
+# the sweep stops once `n` reaches 500).
+# NOTE(review): `vector_are_orth` and `_vec_check_orth` are not defined or imported in
+# this file; they presumably have to be in scope in the session — confirm.
+begin
+    # One-argument input generator. Without it, `_setup3(n)` below would hit a
+    # MethodError when this block runs before the later block that defines the same
+    # method (only the two-argument `_setup3(T, n)` exists at this point of the file).
+    # Returns two vectors of length `2n`: `n` zeros followed by `n` entries sampled
+    # from the pool (0, 0, 0, 1, 2, 3).
+    _setup3(n) = begin
+        x = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        y = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        x, y
+    end
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        # Randomised agreement check between the two implementations.
+        for _ in 1:1_000
+            x, y = _setup3(n)
+            res1 = vector_are_orth(x, y)
+            res2 = _vec_check_orth(x, y)
+            @assert res1 == res2
+        end
+        @btime vector_are_orth(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        @btime _vec_check_orth(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
+
+# Cross-check and benchmark `vector_any_lt` against `vector_any_lt_simd` on random
+# `Int16` vectors of growing length (`n` starts at 1; the step grows by 20% per pass;
+# the sweep stops once `n` reaches 500).
+# NOTE(review): `vector_any_lt` and `vector_any_lt_simd` are not defined or imported in
+# this file; they presumably have to be in scope in the session — confirm.
+begin
+    # Input generators (re)defined here so the block can be evaluated on its own.
+    # `_setup1`: two length-`n` `UInt8` vectors sampled from (0, 0, 0, 1, 2, 3).
+    _setup1(n) = begin
+        x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        x, y
+    end
+    # `_setup2`: as `_setup1`, but each vector is prefixed with `n` zeros.
+    _setup2(n) = begin
+        x = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+        y = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+        x, y
+    end
+    # `_setup3`: as `_setup2`, with `Int16` elements. This is the one used below.
+    _setup3(n) = begin
+        x = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        y = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        x, y
+    end
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        # Randomised agreement check between the two implementations.
+        for _ in 1:100
+            x, y = _setup3(n)
+            res1 = vector_any_lt(x, y)
+            res2 = vector_any_lt_simd(x, y)
+            @assert res1 == res2
+        end
+        @btime vector_any_lt(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        @btime vector_any_lt_simd(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        # Advance only after measuring, so the size logged above is the size that was
+        # benchmarked, n = 1 is covered, and no pass runs beyond the loop bound.
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
+
+#########
+
+# Cross-check and benchmark `vector_emax_1!` against `vector_emax_2!` on random
+# `UInt8` vectors of growing length (`n` starts at 1; the step grows by 20% per pass;
+# the sweep stops once `n` reaches 500).
+# NOTE(review): `vector_emax_1!` and `vector_emax_2!` are not defined or imported in
+# this file; they presumably have to be in scope in the session — confirm.
+begin
+    # Redefines `_setup1` for this block: besides the two random length-`n` vectors it
+    # also returns `similar(x)` first, to serve as the output buffer.
+    _setup1(n) = begin
+        x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        similar(x), x, y
+    end
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        # Randomised agreement check; each call gets its own copy of the buffer.
+        for _ in 1:100
+            c, x, y = _setup1(n)
+            res1 = vector_emax_1!(copy(c), x, y)
+            res2 = vector_emax_2!(copy(c), x, y)
+            @assert res1 == res2
+        end
+        @btime vector_emax_1!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup1($n)
+        end
+        @btime vector_emax_2!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup1($n)
+        end
+        # Advance only after measuring, so the size logged above is the size that was
+        # benchmarked, n = 1 is covered, and no pass runs beyond the loop bound.
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
diff --git a/experimental/native_vec4-assume.txt b/experimental/native_vec4-assume.txt
@@ -0,0 +1,93 @@
+# Compiler-generated x86-64 listing (Intel syntax) for `julia_mod_p_61774`; the
+# instructions are kept verbatim, only these notes were added.
+# As used below: rcx = pointer the 4 x 64-bit result is stored to (also returned in
+# rax), rdx = pointer to four 64-bit input lanes, r8 = pointer to a parameter block
+# read at byte offsets 0 (four qwords), 64..88 (four qwords), 96 (four signed bytes)
+# and 100 (four unsigned bytes).
+# NOTE(review): the overall shape looks like a lane-wise remainder computed as
+# x - q*p with a final range correction — confirm against the Julia source of mod_p.
+	.text
+	.file	"mod_p"
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3                               # -- Begin function julia_mod_p_61774
+.LCPI0_0:
+	.quad	-9223372036854775808            # 0x8000000000000000
+	.text
+	.globl	julia_mod_p_61774
+	.p2align	4, 0x90
+	.type	julia_mod_p_61774,@function
+julia_mod_p_61774:                      # @julia_mod_p_61774
+	.cfi_startproc
+# %bb.0:                                # %top
+	push	rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset rbp, -16
+	mov	rbp, rsp
+	.cfi_def_cfa_register rbp
+# rsi and rdi are used as scratch below; they are saved here and restored before ret.
+	push	rsi
+	push	rdi
+	.cfi_offset rdi, -32
+	.cfi_offset rsi, -24
+# rdx and rcx get overwritten below (one-operand imul writes rdx:rax, rcx is reused),
+# so keep copies: r9 = input pointer, r10 = output pointer.
+	mov	r9, rdx
+	mov	r10, rcx
+# ymm2 = four bytes at [r8+96] sign-extended to 64 bits; ymm1 = four bytes at
+# [r8+100] zero-extended (used further down as per-lane shift counts).
+	vpmovsxbq	ymm2, dword ptr [r8 + 96]
+	vpmovzxbq	ymm1, dword ptr [r8 + 100] # ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+# Four scalar signed 64x64 multiplies, qword [r8+64+8i] * qword [r9+8i]; only the
+# high halves (rdx after each imul) are kept: rdi, rcx, rsi, rdx for lanes 0..3.
+	mov	r11, qword ptr [r8 + 80]
+	mov	rcx, qword ptr [r8 + 72]
+	mov	rax, qword ptr [r8 + 64]
+	imul	qword ptr [rdx]
+	mov	rdi, rdx
+	mov	rax, rcx
+	imul	qword ptr [r9 + 8]
+	mov	rcx, rdx
+	mov	rax, r11
+	imul	qword ptr [r9 + 16]
+	mov	rsi, rdx
+# ymm0 = the four input lanes.
+	vmovdqu	ymm0, ymmword ptr [r9]
+	mov	rax, qword ptr [r8 + 88]
+	imul	qword ptr [r9 + 24]
+# Gather the four high halves into ymm3 (lanes 0..3 = rdi, rcx, rsi, rdx).
+	vmovq	xmm3, rdx
+	vmovq	xmm4, rsi
+	vpunpcklqdq	xmm3, xmm4, xmm3        # xmm3 = xmm4[0],xmm3[0]
+	vmovq	xmm4, rcx
+	vmovq	xmm5, rdi
+	vpunpcklqdq	xmm4, xmm5, xmm4        # xmm4 = xmm5[0],xmm4[0]
+	vinserti128	ymm3, ymm4, xmm3, 1
+# ymm2 = low 64 bits of (ymm0 * ymm2) per lane, assembled from 32-bit vpmuludq
+# partial products, plus the high halves collected in ymm3.
+	vpsrlq	ymm4, ymm0, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm5, ymm2, 32
+	vpmuludq	ymm5, ymm0, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm0, ymm2
+	vpaddq	ymm2, ymm2, ymm3
+	vpaddq	ymm2, ymm4, ymm2
+# ymm3 = 0, used for the sign test near the end.
+	vpxor	xmm3, xmm3, xmm3
+# Per-lane right shift of ymm2 by the counts in ymm1. The xor/sub with
+# (0x8000000000000000 >> count) sign-extends the logical-shift result, i.e. the net
+# effect is an arithmetic right shift.
+	vpsrlvq	ymm4, ymm2, ymm1
+	movabs	rax, offset .LCPI0_0
+	vpbroadcastq	ymm5, qword ptr [rax]
+	vpsrlvq	ymm1, ymm5, ymm1
+	vpxor	ymm4, ymm4, ymm1
+	vpsubq	ymm1, ymm4, ymm1
+# Add the sign bit of the pre-shift value (ymm2 >> 63) to the shifted value.
+	vpsrlq	ymm2, ymm2, 63
+	vpaddq	ymm1, ymm1, ymm2
+# ymm2 = four qwords at [r8]; ymm1 = low 64 bits of (ymm1 * ymm2) per lane, again
+# built from 32-bit partial products.
+	vmovdqu	ymm2, ymmword ptr [r8]
+	vpsrlq	ymm4, ymm2, 32
+	vpmuludq	ymm4, ymm1, ymm4
+	vpsrlq	ymm5, ymm1, 32
+	vpmuludq	ymm5, ymm5, ymm2
+	vpaddq	ymm4, ymm4, ymm5
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm1, ymm1, ymm2
+	vpaddq	ymm1, ymm1, ymm4
+# ymm0 = input lanes minus that product.
+	vpsubq	ymm0, ymm0, ymm1
+# Lanes where ymm0 < 0 (signed): add ymm2.
+	vpcmpgtq	ymm1, ymm3, ymm0
+	vpand	ymm1, ymm1, ymm2
+	vpaddq	ymm0, ymm1, ymm0
+# Lanes where ymm0 > ymm2 (signed, strict): subtract ymm2.
+	vpcmpgtq	ymm1, ymm0, ymm2
+	vpand	ymm1, ymm1, ymm2
+	vpsubq	ymm0, ymm0, ymm1
+# Store the four result lanes through the saved output pointer and return it.
+	vmovdqu	ymmword ptr [r10], ymm0
+	mov	rax, r10
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end0:
+	.size	julia_mod_p_61774, .Lfunc_end0-julia_mod_p_61774
+	.cfi_endproc
+                                        # -- End function
+	.section	".note.GNU-stack","",@progbits