diff --git a/experimental/add_mull.jl b/experimental/add_mull.jl new file mode 100644 index 00000000..a3b9a71b --- /dev/null +++ b/experimental/add_mull.jl @@ -0,0 +1,115 @@ +using InteractiveUtils, BenchmarkTools + +function add_mul!(vres::Vector{Int64}, a::Int32, v::Vector{Int32}) + @inbounds for i in eachindex(vres) + vres[i] += Int64(a) * Int64(v[i]) + end +end + +function add_mul_f!(vres, a, v) + @fastmath @inbounds @simd for i in eachindex(vres) + vres[i] += a * v[i] + end +end + +n = 2^10 +@benchmark add_mul!(v1, c, v2) setup = begin + v1 = rand(Int, n) + c = rand(Int32) + v2 = rand(Int32, n) +end + +@benchmark add_mul_f!(v1, c, v2) setup = begin + v1 = rand(Float64, n) + c = rand(Float64) + v2 = rand(Float64, n) +end + +@code_native debuginfo = :none add_mul_f!(Float64[], Float32(1), Float32[]) + +@code_native debuginfo = :none add_mul!(Int64[], Int32(1), Int32[]) +#= + .text + .file "add_mul" + .globl julia_add_mul_13693 # -- Begin function julia_add_mul_13693 + .p2align 4, 0x90 + .type julia_add_mul_13693,@function +julia_add_mul_13693: # @julia_add_mul_13693 + .cfi_startproc +# %bb.0: # %top + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movq 8(%rcx), %r10 + testq %r10, %r10 + je .LBB0_9 +# %bb.1: # %L18.preheader + movq (%rcx), %rcx + movslq %edx, %r9 + movq (%r8), %r11 + movl $1, %eax + cmpq $16, %r10 + jb .LBB0_7 +# %bb.2: # %vector.memcheck + leaq (%r11,%r10,4), %rdx + cmpq %rdx, %rcx + jae .LBB0_4 +# %bb.3: # %vector.memcheck + leaq (%rcx,%r10,8), %rdx + cmpq %rdx, %r11 + jb .LBB0_7 +.LBB0_4: # %vector.ph + movq %r10, %r8 + andq $-16, %r8 + leaq 1(%r8), %rax + vmovq %r9, %xmm0 + vpbroadcastq %xmm0, %ymm0 + xorl %edx, %edx + .p2align 4, 0x90 +.LBB0_5: # %vector.body + # =>This Inner Loop Header: Depth=1 + vpmovzxdq (%r11,%rdx,4), %ymm1 # ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero + vpmovzxdq 16(%r11,%rdx,4), %ymm2 # ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero + vpmovzxdq 32(%r11,%rdx,4), %ymm3 # ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero + vpmovzxdq 48(%r11,%rdx,4), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero + vpmuldq %ymm1, %ymm0, %ymm1 + vpmuldq %ymm2, %ymm0, %ymm2 + vpmuldq %ymm3, %ymm0, %ymm3 + vpmuldq %ymm4, %ymm0, %ymm4 + vpaddq (%rcx,%rdx,8), %ymm1, %ymm1 + vpaddq 32(%rcx,%rdx,8), %ymm2, %ymm2 + vpaddq 64(%rcx,%rdx,8), %ymm3, %ymm3 + vpaddq 96(%rcx,%rdx,8), %ymm4, %ymm4 + vmovdqu %ymm1, (%rcx,%rdx,8) + vmovdqu %ymm2, 32(%rcx,%rdx,8) + vmovdqu %ymm3, 64(%rcx,%rdx,8) + vmovdqu %ymm4, 96(%rcx,%rdx,8) + addq $16, %rdx + cmpq %rdx, %r8 + jne .LBB0_5 +# %bb.6: # %middle.block + cmpq %r8, %r10 + je .LBB0_9 +.LBB0_7: # %scalar.ph + decq %rax + .p2align 4, 0x90 +.LBB0_8: # %L18 + # =>This Inner Loop Header: Depth=1 + movslq (%r11,%rax,4), %rdx + imulq %r9, %rdx + addq %rdx, (%rcx,%rax,8) + incq %rax + cmpq %rax, %r10 + jne .LBB0_8 +.LBB0_9: # %L38 + popq %rbp + vzeroupper + retq +.Lfunc_end0: + .size julia_add_mul_13693, .Lfunc_end0-julia_add_mul_13693 + .cfi_endproc + # -- End function + .section ".note.GNU-stack","",@progbits +=# diff --git a/experimental/benchmarks/standalone_f4.jl b/experimental/benchmarks/standalone_f4.jl index ecd09639..fe210150 100644 --- a/experimental/benchmarks/standalone_f4.jl +++ b/experimental/benchmarks/standalone_f4.jl @@ -3,7 +3,7 @@ if !isdefined(Main, :Groebner) using Groebner end -import AbstractAlgebra +import AbstractAlgebra, Primes using BenchmarkTools using Logging @@ -42,9 +42,12 @@ function run_f4_ff_degrevlex_benchmarks(ground) end end -s = Groebner.katsuran(11, ordering=:degrevlex, ground=AbstractAlgebra.GF(2^31 - 1)) +p1 = 2^31 - 1 +p2 = 2^29 + 11 +p3 = 2^27 + 29 +s = Groebner.katsuran(11, ordering=:degrevlex, ground=AbstractAlgebra.GF(p3)) -@time Groebner.groebner(s); +@profview gb1 = Groebner.groebner(s); println() ground = AbstractAlgebra.GF(1048583) diff --git a/experimental/example-maybe-bug.jl b/experimental/example-maybe-bug.jl index 77a46db7..ab5de157 100644 --- a/experimental/example-maybe-bug.jl +++ b/experimental/example-maybe-bug.jl @@ -4,27 +4,42 @@ R, (x1, x2, x3) = PolynomialRing(GF(2^27 + 29), ["x1", "x2", "x3"], ordering=:de R, (x1, x2, x3) = PolynomialRing(GF(2^31 - 1), ["x1", "x2", "x3"], ordering=:degrevlex) s = [x1 * x2^2 + 2, x1 * x3 + 3, x2 * x3 + 4 * x1 - 5] -gb = Groebner.groebner(s, loglevel=-4) +gb = Groebner.groebner(s, loglevel=-5) gb -Groebner.logging_enabled() = false -k = Groebner.katsuran(9, ground=GF(2^27 + 29), ordering=:degrevlex) +p1 = 2^31 - 1 +p2 = 2^29 + 11 +p3 = 2^27 + 29 +p4 = 2^25 + 35 +s = Groebner.eco11(ordering=:degrevlex, ground=AbstractAlgebra.GF(p4)) -@profview for _ in 1:10 - gb = Groebner.groebner(k) -end +@time gb1 = Groebner.groebner(s); -Groebner.isgroebner(gb) +@time trace, gb2 = Groebner.groebner_learn(s); -@time gb2 = Groebner.groebner(k); -gb == gb2 +@btime suc, gb3 = Groebner.groebner_apply!($trace, $s); +println() +ground = AbstractAlgebra.GF(1048583) +run_f4_ff_degrevlex_benchmarks(ground) # for k = 7 # 188.876 ms (213205 allocations: 42.44 MiB) #! format: off -R,(t1,t2,t3,a,b,c) = PolynomialRing(QQ, ["t1","t2","t3","a", "b", "c"]) +R,(t1,t2,t3,a,b,c) = PolynomialRing(GF(2^27+29), ["t1","t2","t3","a", "b", "c"], ordering=:degrevlex) sys = [1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2-1065102000*a^2*t1-1566200000*a^2*t2+359610000*a^2*t3-4000000*a*b*t2-1574352000*a*b*t3+4000000*a*c*t1+273640000*a*c*t3-1065102000*b^2*t1+8152000*b^2*t2+355610000*b^2*t3-1574352000*b*c*t1-273640000*b*c*t2-791462000*c^2*t1-1566200000*c^2*t2+355610000*c^2*t3+740236705137*a^2-279943961360*a*b+47071636200*a*c+1574352000*a*t1-273640000*a*t2+126292488913*b^2+837307375312*b*c+4000000*b*t1-273640000*b*t3+612513941897*c^2+4000000*c*t2-1574352000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2-624135247952*a-50784764200*b-283060057360*c-791462000*t1+8152000*t2+359610000*t3+165673, 1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2-1889130000*a^2*t1-139016000*a^2*t2+357608000*a^2*t3+550492000*a*b*t3+1500376000*a*c*t3-1889130000*b^2*t1-689508000*b^2*t2+357608000*b^2*t3+550492000*b*c*t1-1500376000*b*c*t2-388754000*c^2*t1-139016000*c^2*t2+357608000*c^2*t3+740396599024*a^2+98430171568*a*b+268273230304*a*c-550492000*a*t1-1500376000*a*t2+854420557476*b^2-2714848476*b*c-1500376000*b*t3-114024022072*c^2+550492000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2+624263610988*a-268273230304*b+98430171568*c-388754000*t1-689508000*t2+357608000*t3-63620, 4000000*a^2*t1^2+4000000*a^2*t2^2+4000000*a^2*t3^2+4000000*b^2*t1^2+4000000*b^2*t2^2+4000000*b^2*t3^2+4000000*c^2*t1^2+4000000*c^2*t2^2+4000000*c^2*t3^2-3295636000*a^2*t1+6825304000*a^2*t2+1438448000*a^2*t3-16000000*a*b*t2+4096192000*a*b*t3+16000000*a*c*t1+4906624000*a*c*t3-3295636000*b^2*t1+2729112000*b^2*t2+1422448000*b^2*t3+4096192000*b*c*t1-4906624000*b*c*t2+1610988000*c^2*t1+6825304000*c^2*t2+1422448000*c^2*t3+2962666483625*a^2+722869290752*a*b+875649162944*a*c-4096192000*a*t1-4906624000*a*t2+513760438633*b^2-3361285532000*b*c+16000000*b*t1-4906624000*b*t3+2443184693353*c^2+16000000*c*t2+4096192000*c*t3+4000000*t1^2+4000000*t2^2+4000000*t3^2-2498705324448*a-879018458944*b+741978122752*c+1610988000*t1+2729112000*t2+1438448000*t3+440361,4000000*a^2*t1^2+4000000*a^2*t2^2+4000000*a^2*t3^2+4000000*b^2*t1^2+4000000*b^2*t2^2+4000000*b^2*t3^2+4000000*c^2*t1^2+4000000*c^2*t2^2+4000000*c^2*t3^2+3295636000*a^2*t1+6824896000*a^2*t2+1430432000*a^2*t3+4094592000*a*b*t3-4906624000*a*c*t3+3295636000*b^2*t1+2730304000*b^2*t2+1430432000*b^2*t3+4094592000*b*c*t1+4906624000*b*c*t2-1610988000*c^2*t1+6824896000*c^2*t2+1430432000*c^2*t3+2961910911797*a^2+732129427968*a*b-877323997696*a*c-4094592000*a*t1+4906624000*a*t2+516620569397*b^2+3361357491776*b*c+4906624000*b*t3+2445290017525*c^2+4094592000*c*t3+4000000*t1^2+4000000*t2^2+4000000*t3^2+2499114213824*a+877323997696*b+732129427968*c-1610988000*t1+2730304000*t2+1430432000*t3-324875, 1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2+1889602000*a^2*t1-138926000*a^2*t2+359604000*a^2*t3-4000000*a*b*t2+550036000*a*b*t3+4000000*a*c*t1-1500228000*a*c*t3+1889602000*b^2*t1-688962000*b^2*t2+355604000*b^2*t3+550036000*b*c*t1+1500228000*b*c*t2+389374000*c^2*t1-138926000*c^2*t2+355604000*c^2*t3+740903906549*a^2+99175424872*a*b-265964790856*a*c-550036000*a*t1+1500228000*a*t2+854030749541*b^2+2874521168*b*c+4000000*b*t1+1500228000*b*t3-114557203083*c^2+4000000*c*t2+550036000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2-623884900400*a+270522742856*b+97519648872*c+389374000*t1-688962000*t2+359604000*t3+55909, 250000*a^2*t1^2+250000*a^2*t2^2+250000*a^2*t3^2+250000*b^2*t1^2+250000*b^2*t2^2+250000*b^2*t3^2+250000*c^2*t1^2+250000*c^2*t2^2+250000*c^2*t3^2+266341000*a^2*t1-391502000*a^2*t2+89402000*a^2*t3-393620000*a*b*t3-68228000*a*c*t3+266341000*b^2*t1+2118000*b^2*t2+89402000*b^2*t3-393620000*b*c*t1+68228000*b*c*t2+198113000*c^2*t1-391502000*c^2*t2+89402000*c^2*t3+184958257568*a^2-70380830480*a*b-12199439312*a*c+393620000*a*t1+68228000*a*t2+31688927488*b^2-209385275032*b*c+68228000*b*t3+153269490056*c^2-393620000*c*t3+250000*t1^2+250000*t2^2+250000*t3^2+156251491928*a+12199439312*b-70380830480*c+198113000*t1+2118000*t2+89402000*t3+159976] + +@time trace, gb = Groebner.groebner_learn( + sys, + ordering=Groebner.DegRevLex(), +); + +gb = Groebner.groebner_apply!( + trace, + sys, + loglevel=-3 +); + #! format: on Groebner.performance_counters_enabled() = true diff --git a/experimental/todo.jl b/experimental/todo.jl index 1bb18b38..69746e79 100644 --- a/experimental/todo.jl +++ b/experimental/todo.jl @@ -1,12 +1,16 @@ using SIMD, BenchmarkTools -function add_mul!(v1::Vector{T}, c, v2) where {T} - @inbounds for i in 1:length(v1) - v1[i] += T(c) * T(v2[i]) +function add_mul!(vres::Vector{Int64}, a::Int32, v::Vector{Int32}) + @inbounds for i in eachindex(vres) + vres[i] += Int64(a) * Int64(v[i]) end - nothing end +function add_mul_v2!(vres::Vector{Int64}, a::Int64, v::Vector{Int32}) + @inbounds for i in eachindex(vres) + vres[i] += a * Int64(v[i]) + end +end @code_native debuginfo = :none add_mul([1, 2], UInt32(1), UInt32[3, 4]) function reduce_dense_row_by_sparse_row_no_remainder!( @@ -105,7 +109,7 @@ end Val(2) ) -@code_native debuginfo = :none add_mul!(UInt64[1, 2, 3, 4], UInt32(8), UInt32[3, 1, 0, 10]) +@code_native debuginfo = :none add_mul_v2!(Int64[1, 2, 3, 4], Int64(8), Int32[3, 1, 0, 10]) @code_native debuginfo = :none reduce_dense_row_by_sparse_row_no_remainder_2!( UInt64[1, 2, 3, 4], diff --git a/src/arithmetic/Zp.jl b/src/arithmetic/Zp.jl index 4a7ed61d..7262cc9c 100644 --- a/src/arithmetic/Zp.jl +++ b/src/arithmetic/Zp.jl @@ -19,7 +19,7 @@ end divisor(arithm::BuiltinArithmeticZp) = arithm.magic.divisor function n_reserved_bits(arithm::BuiltinArithmeticZp{T}) where {T <: Unsigned} - res = leading_zeros(divisor(arithm)) - (8 >> 1) * sizeof(T) + res = 1 + leading_zeros(divisor(arithm)) - (8 >> 1) * sizeof(T) @invariant res >= 0 res end diff --git a/src/f4/linalg.jl b/src/f4/linalg.jl index aee13272..b51c9264 100644 --- a/src/f4/linalg.jl +++ b/src/f4/linalg.jl @@ -1653,7 +1653,6 @@ function reduce_dense_row_by_pivots_sparse!( j = 0 nskip = skip(arithmetic) - @log level = -4 "" n_reserved_bits(arithmetic) nskip divisor(arithmetic) @inbounds for i in start_column:end_column # if the element is zero - no reduction is needed @@ -1698,6 +1697,7 @@ function reduce_dense_row_by_pivots_sparse!( if iszero(j % nskip) dense_row_remainder!(row, arithmetic, i, end_column) end + row[i] = zero(row[i]) # @invariant iszero(row[i]) end @@ -1708,7 +1708,7 @@ function reduce_dense_row_by_pivots_sparse!( end # TODO: perhaps, not needed - dense_row_remainder!(row, arithmetic, Int(start_column), Int(end_column)) + # dense_row_remainder!(row, arithmetic, Int(start_column), Int(end_column)) # form the resulting row in sparse format resize!(new_column_indices, nonzeros)