diff --git a/experimental/add_mull.jl b/experimental/add_mull.jl
new file mode 100644
index 00000000..a3b9a71b
--- /dev/null
+++ b/experimental/add_mull.jl
@@ -0,0 +1,115 @@
+using InteractiveUtils, BenchmarkTools
+
+function add_mul!(vres::Vector{Int64}, a::Int32, v::Vector{Int32})
+    @inbounds for i in eachindex(vres)
+        vres[i] += Int64(a) * Int64(v[i])
+    end
+end
+
+function add_mul_f!(vres, a, v)
+    @fastmath @inbounds @simd for i in eachindex(vres)
+        vres[i] += a * v[i]
+    end
+end
+
+n = 2^10
+@benchmark add_mul!(v1, c, v2) setup = begin
+    v1 = rand(Int, n)
+    c = rand(Int32)
+    v2 = rand(Int32, n)
+end
+
+@benchmark add_mul_f!(v1, c, v2) setup = begin
+    v1 = rand(Float64, n)
+    c = rand(Float64)
+    v2 = rand(Float64, n)
+end
+
+@code_native debuginfo = :none add_mul_f!(Float64[], Float32(1), Float32[])
+
+@code_native debuginfo = :none add_mul!(Int64[], Int32(1), Int32[])
+#=
+        .text
+        .file   "add_mul"
+        .globl  julia_add_mul_13693             # -- Begin function julia_add_mul_13693
+        .p2align        4, 0x90
+        .type   julia_add_mul_13693,@function
+julia_add_mul_13693:                    # @julia_add_mul_13693
+        .cfi_startproc
+# %bb.0:                                # %top
+        pushq   %rbp
+        .cfi_def_cfa_offset 16
+        .cfi_offset %rbp, -16
+        movq    %rsp, %rbp
+        .cfi_def_cfa_register %rbp
+        movq    8(%rcx), %r10
+        testq   %r10, %r10
+        je      .LBB0_9
+# %bb.1:                                # %L18.preheader
+        movq    (%rcx), %rcx
+        movslq  %edx, %r9
+        movq    (%r8), %r11
+        movl    $1, %eax
+        cmpq    $16, %r10
+        jb      .LBB0_7
+# %bb.2:                                # %vector.memcheck
+        leaq    (%r11,%r10,4), %rdx
+        cmpq    %rdx, %rcx
+        jae     .LBB0_4
+# %bb.3:                                # %vector.memcheck
+        leaq    (%rcx,%r10,8), %rdx
+        cmpq    %rdx, %r11
+        jb      .LBB0_7
+.LBB0_4:                                # %vector.ph
+        movq    %r10, %r8
+        andq    $-16, %r8
+        leaq    1(%r8), %rax
+        vmovq   %r9, %xmm0
+        vpbroadcastq    %xmm0, %ymm0
+        xorl    %edx, %edx
+        .p2align        4, 0x90
+.LBB0_5:                                # %vector.body
+                                        # =>This Inner Loop Header: Depth=1
+        vpmovzxdq       (%r11,%rdx,4), %ymm1    # ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+        vpmovzxdq       16(%r11,%rdx,4), %ymm2  # ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+        vpmovzxdq       32(%r11,%rdx,4), %ymm3  # ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+        vpmovzxdq       48(%r11,%rdx,4), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+        vpmuldq %ymm1, %ymm0, %ymm1
+        vpmuldq %ymm2, %ymm0, %ymm2
+        vpmuldq %ymm3, %ymm0, %ymm3
+        vpmuldq %ymm4, %ymm0, %ymm4
+        vpaddq  (%rcx,%rdx,8), %ymm1, %ymm1
+        vpaddq  32(%rcx,%rdx,8), %ymm2, %ymm2
+        vpaddq  64(%rcx,%rdx,8), %ymm3, %ymm3
+        vpaddq  96(%rcx,%rdx,8), %ymm4, %ymm4
+        vmovdqu %ymm1, (%rcx,%rdx,8)
+        vmovdqu %ymm2, 32(%rcx,%rdx,8)
+        vmovdqu %ymm3, 64(%rcx,%rdx,8)
+        vmovdqu %ymm4, 96(%rcx,%rdx,8)
+        addq    $16, %rdx
+        cmpq    %rdx, %r8
+        jne     .LBB0_5
+# %bb.6:                                # %middle.block
+        cmpq    %r8, %r10
+        je      .LBB0_9
+.LBB0_7:                                # %scalar.ph
+        decq    %rax
+        .p2align        4, 0x90
+.LBB0_8:                                # %L18
+                                        # =>This Inner Loop Header: Depth=1
+        movslq  (%r11,%rax,4), %rdx
+        imulq   %r9, %rdx
+        addq    %rdx, (%rcx,%rax,8)
+        incq    %rax
+        cmpq    %rax, %r10
+        jne     .LBB0_8
+.LBB0_9:                                # %L38
+        popq    %rbp
+        vzeroupper
+        retq
+.Lfunc_end0:
+        .size   julia_add_mul_13693, .Lfunc_end0-julia_add_mul_13693     
+        .cfi_endproc
+                                        # -- End function
+        .section        ".note.GNU-stack","",@progbits
+=#
diff --git a/experimental/benchmarks/standalone_f4.jl b/experimental/benchmarks/standalone_f4.jl
index ecd09639..fe210150 100644
--- a/experimental/benchmarks/standalone_f4.jl
+++ b/experimental/benchmarks/standalone_f4.jl
@@ -3,7 +3,7 @@ if !isdefined(Main, :Groebner)
     using Groebner
 end
 
-import AbstractAlgebra
+import AbstractAlgebra, Primes
 using BenchmarkTools
 using Logging
 
@@ -42,9 +42,12 @@ function run_f4_ff_degrevlex_benchmarks(ground)
     end
 end
 
-s = Groebner.katsuran(11, ordering=:degrevlex, ground=AbstractAlgebra.GF(2^31 - 1))
+p1 = 2^31 - 1
+p2 = 2^29 + 11
+p3 = 2^27 + 29
+s = Groebner.katsuran(11, ordering=:degrevlex, ground=AbstractAlgebra.GF(p3))
 
-@time Groebner.groebner(s);
+@profview gb1 = Groebner.groebner(s);
 
 println()
 ground = AbstractAlgebra.GF(1048583)
diff --git a/experimental/example-maybe-bug.jl b/experimental/example-maybe-bug.jl
index 77a46db7..ab5de157 100644
--- a/experimental/example-maybe-bug.jl
+++ b/experimental/example-maybe-bug.jl
@@ -4,27 +4,42 @@ R, (x1, x2, x3) = PolynomialRing(GF(2^27 + 29), ["x1", "x2", "x3"], ordering=:de
 R, (x1, x2, x3) = PolynomialRing(GF(2^31 - 1), ["x1", "x2", "x3"], ordering=:degrevlex)
 
 s = [x1 * x2^2 + 2, x1 * x3 + 3, x2 * x3 + 4 * x1 - 5]
-gb = Groebner.groebner(s, loglevel=-4)
+gb = Groebner.groebner(s, loglevel=-5)
 gb
 
-Groebner.logging_enabled() = false
-k = Groebner.katsuran(9, ground=GF(2^27 + 29), ordering=:degrevlex)
+p1 = 2^31 - 1
+p2 = 2^29 + 11
+p3 = 2^27 + 29
+p4 = 2^25 + 35
+s = Groebner.eco11(ordering=:degrevlex, ground=AbstractAlgebra.GF(p4))
 
-@profview for _ in 1:10
-    gb = Groebner.groebner(k)
-end
+@time gb1 = Groebner.groebner(s);
 
-Groebner.isgroebner(gb)
+@time trace, gb2 = Groebner.groebner_learn(s);
 
-@time gb2 = Groebner.groebner(k);
-gb == gb2
+@btime suc, gb3 = Groebner.groebner_apply!($trace, $s);
 
+println()
+ground = AbstractAlgebra.GF(1048583)
+run_f4_ff_degrevlex_benchmarks(ground)
 # for k = 7
 # 188.876 ms (213205 allocations: 42.44 MiB)
 
 #! format: off
-R,(t1,t2,t3,a,b,c) = PolynomialRing(QQ, ["t1","t2","t3","a", "b", "c"])
+R,(t1,t2,t3,a,b,c) = PolynomialRing(GF(2^27+29), ["t1","t2","t3","a", "b", "c"], ordering=:degrevlex)
 sys = [1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2-1065102000*a^2*t1-1566200000*a^2*t2+359610000*a^2*t3-4000000*a*b*t2-1574352000*a*b*t3+4000000*a*c*t1+273640000*a*c*t3-1065102000*b^2*t1+8152000*b^2*t2+355610000*b^2*t3-1574352000*b*c*t1-273640000*b*c*t2-791462000*c^2*t1-1566200000*c^2*t2+355610000*c^2*t3+740236705137*a^2-279943961360*a*b+47071636200*a*c+1574352000*a*t1-273640000*a*t2+126292488913*b^2+837307375312*b*c+4000000*b*t1-273640000*b*t3+612513941897*c^2+4000000*c*t2-1574352000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2-624135247952*a-50784764200*b-283060057360*c-791462000*t1+8152000*t2+359610000*t3+165673, 1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2-1889130000*a^2*t1-139016000*a^2*t2+357608000*a^2*t3+550492000*a*b*t3+1500376000*a*c*t3-1889130000*b^2*t1-689508000*b^2*t2+357608000*b^2*t3+550492000*b*c*t1-1500376000*b*c*t2-388754000*c^2*t1-139016000*c^2*t2+357608000*c^2*t3+740396599024*a^2+98430171568*a*b+268273230304*a*c-550492000*a*t1-1500376000*a*t2+854420557476*b^2-2714848476*b*c-1500376000*b*t3-114024022072*c^2+550492000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2+624263610988*a-268273230304*b+98430171568*c-388754000*t1-689508000*t2+357608000*t3-63620, 4000000*a^2*t1^2+4000000*a^2*t2^2+4000000*a^2*t3^2+4000000*b^2*t1^2+4000000*b^2*t2^2+4000000*b^2*t3^2+4000000*c^2*t1^2+4000000*c^2*t2^2+4000000*c^2*t3^2-3295636000*a^2*t1+6825304000*a^2*t2+1438448000*a^2*t3-16000000*a*b*t2+4096192000*a*b*t3+16000000*a*c*t1+4906624000*a*c*t3-3295636000*b^2*t1+2729112000*b^2*t2+1422448000*b^2*t3+4096192000*b*c*t1-4906624000*b*c*t2+1610988000*c^2*t1+6825304000*c^2*t2+1422448000*c^2*t3+2962666483625*a^2+722869290752*a*b+875649162944*a*c-4096192000*a*t1-4906624000*a*t2+513760438633*b^2-3361285532000*b*c+16000000*b*t1-4906624000*b*t3+2443184693353*c^2+16000000*c*t2+4096192000*c*t3+4000000*t1^2+4000000*t2^2+4000000*t3^2-2498705324448*a-879018458944*b+741978122752*c+1610988000*t1+2729112000*t2+1438448000*t3+440361,4000000*a^2*t1^2+4000000*a^2*t2^2+4000000*a^2*t3^2+4000000*b^2*t1^2+4000000*b^2*t2^2+4000000*b^2*t3^2+4000000*c^2*t1^2+4000000*c^2*t2^2+4000000*c^2*t3^2+3295636000*a^2*t1+6824896000*a^2*t2+1430432000*a^2*t3+4094592000*a*b*t3-4906624000*a*c*t3+3295636000*b^2*t1+2730304000*b^2*t2+1430432000*b^2*t3+4094592000*b*c*t1+4906624000*b*c*t2-1610988000*c^2*t1+6824896000*c^2*t2+1430432000*c^2*t3+2961910911797*a^2+732129427968*a*b-877323997696*a*c-4094592000*a*t1+4906624000*a*t2+516620569397*b^2+3361357491776*b*c+4906624000*b*t3+2445290017525*c^2+4094592000*c*t3+4000000*t1^2+4000000*t2^2+4000000*t3^2+2499114213824*a+877323997696*b+732129427968*c-1610988000*t1+2730304000*t2+1430432000*t3-324875, 1000000*a^2*t1^2+1000000*a^2*t2^2+1000000*a^2*t3^2+1000000*b^2*t1^2+1000000*b^2*t2^2+1000000*b^2*t3^2+1000000*c^2*t1^2+1000000*c^2*t2^2+1000000*c^2*t3^2+1889602000*a^2*t1-138926000*a^2*t2+359604000*a^2*t3-4000000*a*b*t2+550036000*a*b*t3+4000000*a*c*t1-1500228000*a*c*t3+1889602000*b^2*t1-688962000*b^2*t2+355604000*b^2*t3+550036000*b*c*t1+1500228000*b*c*t2+389374000*c^2*t1-138926000*c^2*t2+355604000*c^2*t3+740903906549*a^2+99175424872*a*b-265964790856*a*c-550036000*a*t1+1500228000*a*t2+854030749541*b^2+2874521168*b*c+4000000*b*t1+1500228000*b*t3-114557203083*c^2+4000000*c*t2+550036000*c*t3+1000000*t1^2+1000000*t2^2+1000000*t3^2-623884900400*a+270522742856*b+97519648872*c+389374000*t1-688962000*t2+359604000*t3+55909, 250000*a^2*t1^2+250000*a^2*t2^2+250000*a^2*t3^2+250000*b^2*t1^2+250000*b^2*t2^2+250000*b^2*t3^2+250000*c^2*t1^2+250000*c^2*t2^2+250000*c^2*t3^2+266341000*a^2*t1-391502000*a^2*t2+89402000*a^2*t3-393620000*a*b*t3-68228000*a*c*t3+266341000*b^2*t1+2118000*b^2*t2+89402000*b^2*t3-393620000*b*c*t1+68228000*b*c*t2+198113000*c^2*t1-391502000*c^2*t2+89402000*c^2*t3+184958257568*a^2-70380830480*a*b-12199439312*a*c+393620000*a*t1+68228000*a*t2+31688927488*b^2-209385275032*b*c+68228000*b*t3+153269490056*c^2-393620000*c*t3+250000*t1^2+250000*t2^2+250000*t3^2+156251491928*a+12199439312*b-70380830480*c+198113000*t1+2118000*t2+89402000*t3+159976]
+
+@time trace, gb = Groebner.groebner_learn(
+    sys,
+    ordering=Groebner.DegRevLex(),
+);
+
+gb = Groebner.groebner_apply!(
+    trace,
+    sys,
+    loglevel=-3
+);
+
 #! format: on
 
 Groebner.performance_counters_enabled() = true
diff --git a/experimental/todo.jl b/experimental/todo.jl
index 1bb18b38..69746e79 100644
--- a/experimental/todo.jl
+++ b/experimental/todo.jl
@@ -1,12 +1,16 @@
 using SIMD, BenchmarkTools
 
-function add_mul!(v1::Vector{T}, c, v2) where {T}
-    @inbounds for i in 1:length(v1)
-        v1[i] += T(c) * T(v2[i])
+function add_mul!(vres::Vector{Int64}, a::Int32, v::Vector{Int32})
+    @inbounds for i in eachindex(vres)
+        vres[i] += Int64(a) * Int64(v[i])
     end
-    nothing
 end
 
+function add_mul_v2!(vres::Vector{Int64}, a::Int64, v::Vector{Int32})
+    @inbounds for i in eachindex(vres)
+        vres[i] += a * Int64(v[i])
+    end
+end
 @code_native debuginfo = :none add_mul([1, 2], UInt32(1), UInt32[3, 4])
 
 function reduce_dense_row_by_sparse_row_no_remainder!(
@@ -105,7 +109,7 @@ end
             Val(2)
         )
 
-@code_native debuginfo = :none add_mul!(UInt64[1, 2, 3, 4], UInt32(8), UInt32[3, 1, 0, 10])
+@code_native debuginfo = :none add_mul_v2!(Int64[1, 2, 3, 4], Int64(8), Int32[3, 1, 0, 10])
 
 @code_native debuginfo = :none reduce_dense_row_by_sparse_row_no_remainder_2!(
     UInt64[1, 2, 3, 4],
diff --git a/src/arithmetic/Zp.jl b/src/arithmetic/Zp.jl
index 4a7ed61d..7262cc9c 100644
--- a/src/arithmetic/Zp.jl
+++ b/src/arithmetic/Zp.jl
@@ -19,7 +19,7 @@ end
 divisor(arithm::BuiltinArithmeticZp) = arithm.magic.divisor
 
 function n_reserved_bits(arithm::BuiltinArithmeticZp{T}) where {T <: Unsigned}
-    res = leading_zeros(divisor(arithm)) - (8 >> 1) * sizeof(T)
+    res = 1 + leading_zeros(divisor(arithm)) - (8 >> 1) * sizeof(T)
     @invariant res >= 0
     res
 end
diff --git a/src/f4/linalg.jl b/src/f4/linalg.jl
index aee13272..b51c9264 100644
--- a/src/f4/linalg.jl
+++ b/src/f4/linalg.jl
@@ -1653,7 +1653,6 @@ function reduce_dense_row_by_pivots_sparse!(
 
     j = 0
     nskip = skip(arithmetic)
-    @log level = -4 "" n_reserved_bits(arithmetic) nskip divisor(arithmetic)
 
     @inbounds for i in start_column:end_column
         # if the element is zero - no reduction is needed
@@ -1698,6 +1697,7 @@ function reduce_dense_row_by_pivots_sparse!(
         if iszero(j % nskip)
             dense_row_remainder!(row, arithmetic, i, end_column)
         end
+        row[i] = zero(row[i])
 
         # @invariant iszero(row[i])
     end
@@ -1708,7 +1708,7 @@ function reduce_dense_row_by_pivots_sparse!(
     end
 
     # TODO: perhaps, not needed
-    dense_row_remainder!(row, arithmetic, Int(start_column), Int(end_column))
+    # dense_row_remainder!(row, arithmetic, Int(start_column), Int(end_column))
 
     # form the resulting row in sparse format
     resize!(new_column_indices, nonzeros)