From 6dd8183a8603c5eedd09007553058f1d36ee7ae7 Mon Sep 17 00:00:00 2001
From: Alexander Demin <asdemin_2@edu.hse.ru>
Date: Sat, 27 Jan 2024 00:00:06 +0300
Subject: [PATCH] upd fglm benchmarks

---
 Project.toml                           |   1 +
 benchmark/CI-scripts/run_benchmarks.jl |   7 +
 experimental/example-maybe-bug.jl      |  14 +-
 experimental/fglmtest.jl               |  17 +
 experimental/lexcmp.jl                 | 217 ++++++++++++
 experimental/native_vec4-assume.txt    |  93 +++++
 experimental/native_vec4.txt           |  93 +++++
 experimental/test_fglm.jl              |  22 ++
 src/Groebner.jl                        |  20 +-
 src/arithmetic/Zp.jl                   |  47 +--
 src/f4/f4.jl                           |   1 +
 src/f4/hashtable.jl                    |   2 +-
 src/f4/linalg/backend.jl               |  34 +-
 src/fglm/fglm.jl                       |  54 +++
 src/fglm/fglm_internal.jl              | 111 +++++-
 src/fglm/linear.jl                     |  92 +++--
 src/input-output/input-output.jl       |  17 +-
 src/interface.jl                       |  27 ++
 src/monomials/exponentvector.jl        |  72 ++--
 src/utils/invariants.jl                |   2 +-
 src/utils/keywords.jl                  |  19 +-
 src/utils/simd.jl                      | 460 +++++++++++++++++++++++++
 test/arithmetic/Zp.jl                  |  31 +-
 test/monoms/monom_orders.jl            |  45 ++-
 test/reconstruction/ratrec.jl          |  47 +--
 25 files changed, 1281 insertions(+), 264 deletions(-)
 create mode 100644 experimental/fglmtest.jl
 create mode 100644 experimental/lexcmp.jl
 create mode 100644 experimental/native_vec4-assume.txt
 create mode 100644 experimental/native_vec4.txt
 create mode 100644 experimental/test_fglm.jl
 create mode 100644 src/utils/simd.jl

diff --git a/Project.toml b/Project.toml
index c0335cec..10beedaa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ AbstractAlgebra = "c3fe647b-3220-5bb0-a1ea-a7954cac585d"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+HostCPUFeatures = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MultivariatePolynomials = "102ac46a-7ee4-5c85-9060-abc95bfdeaa3"
 Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"
diff --git a/benchmark/CI-scripts/run_benchmarks.jl b/benchmark/CI-scripts/run_benchmarks.jl
index dac5ed00..97c96187 100644
--- a/benchmark/CI-scripts/run_benchmarks.jl
+++ b/benchmark/CI-scripts/run_benchmarks.jl
@@ -67,6 +67,13 @@ push!(
         result=compute_gb(Groebner.katsuran(10, ordering=:degrevlex, k=GF(2^27 + 29)), 5)
     )
 )
+push!(
+    suite,
+    (
+        problem_name="groebner, AA, GF(2^30+3), katsura 11",
+        result=compute_gb(Groebner.katsuran(11, ordering=:degrevlex, k=GF(2^30 + 3)), 3)
+    )
+)
 push!(
     suite,
     (
diff --git a/experimental/example-maybe-bug.jl b/experimental/example-maybe-bug.jl
index 73de135e..a7074665 100644
--- a/experimental/example-maybe-bug.jl
+++ b/experimental/example-maybe-bug.jl
@@ -11,11 +11,11 @@ end
 
 @info "" nthreads()
 @show ENV["JULIA_NUM_THREADS"]
-Groebner.logging_enabled() = false
+Groebner.logging_enabled() = true
 Groebner.invariants_enabled() = false
 Groebner.performance_counters_enabled() = false
 
-s = Groebner.katsuran(9, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3));
+s = Groebner.noonn(8, ordering=:degrevlex, k=AbstractAlgebra.GF(2^30 + 3));
 trace, gb = Groebner.groebner_learn(s);
 @btime Groebner.groebner($s);
 @btime Groebner.groebner_apply!($trace, $s);
@@ -24,15 +24,15 @@ trace, gb = Groebner.groebner_learn(s);
 @btime Groebner.groebner_apply!($trace, $((s, s, s, s, s, s, s, s)));
 
 #=
-  113.912 ms (34041 allocations: 43.78 MiB)
+  1.048 s (418167 allocations: 258.02 MiB)
 
-  44.803 ms (18887 allocations: 24.46 MiB)
+  142.140 ms (126985 allocations: 87.63 MiB)
 
-  52.580 ms (20607 allocations: 35.38 MiB)
+  185.635 ms (134610 allocations: 134.37 MiB)
 
-  64.618 ms (23276 allocations: 59.42 MiB)
+  283.075 ms (146988 allocations: 224.29 MiB)
 
-  125.247 ms (28610 allocations: 107.47 MiB)
+  433.356 ms (171889 allocations: 404.00 MiB)
 =#
 @profview Groebner.groebner_apply!(trace, ((s, s, s, s)));
 
diff --git a/experimental/fglmtest.jl b/experimental/fglmtest.jl
new file mode 100644
index 00000000..66a2e8b1
--- /dev/null
+++ b/experimental/fglmtest.jl
@@ -0,0 +1,17 @@
+using AbstractAlgebra, Groebner
+
+R, (x, y, z, t) = AbstractAlgebra.polynomial_ring(AbstractAlgebra.QQ, ["x", "y", "z", "t"])
+sys = [
+    y^2 * z + 2 * x * y * t - 2 * x - z,
+    -x^3 * z + 4 * x * y^2 * z + 4 * x^2 * y * t + 2 * y^3 * t + 4 * x^2 - 10 * y^2 +
+    4 * x * z - 10 * y * t + 2,
+    2 * y * z * t + x * t^2 - x - 2 * z,
+    -x * z^3 + 4 * y * z^2 * t + 4 * x * z * t^2 + 2 * y * t^3 + 4 * x * z + 4 * z^2 -
+    10 * y * t - 10 * t^2 + 2
+]
+
+gb_lex = Groebner.groebner(sys, ordering=Groebner.Lex())
+gb_drl = Groebner.groebner(sys, ordering=Groebner.DegRevLex())
+gb_fglm = Groebner.fglm(gb_drl, Groebner.DegRevLex(), Groebner.Lex())
+
+@info "" gb_fglm
diff --git a/experimental/lexcmp.jl b/experimental/lexcmp.jl
new file mode 100644
index 00000000..7058b77b
--- /dev/null
+++ b/experimental/lexcmp.jl
@@ -0,0 +1,217 @@
+using BenchmarkTools
+
+using HostCPUFeatures
+using HostCPUFeatures:
+    register_size,
+    pick_vector_width,
+    pick_vector_width_shift,
+    simd_integer_register_size,
+    fma_fast,
+    has_feature,
+    register_count,
+    cpu_name,
+    register_size
+
+#########
+
+_setup1(n) = begin
+    x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+    y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+    x, y
+end
+_setup2(n) = begin
+    x = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+    y = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+    x, y
+end
+_setup3(T, n) = begin
+    s = rand(T.([0, 0, 0, 1, 2, 3]), 3)
+    x = Groebner.monom_construct_from_vector(
+        Groebner.ExponentVector{T},
+        vcat(zeros(T, n), s)
+    )
+    y = Groebner.monom_construct_from_vector(
+        Groebner.ExponentVector{T},
+        vcat(zeros(T, n), reverse(s))
+    )
+    z = similar(x)
+    @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y)
+    z, x, y
+end
+
+#########
+
+begin
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        print("Groebner.monom_is_equal\t\t")
+        @btime Groebner.monom_is_equal(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_copy\t\t")
+        @btime Groebner.monom_copy(xx) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_is_divisible\t")
+        @btime Groebner.monom_is_divisible(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, $n)
+        end
+        print("Groebner.monom_product!\t\t")
+        @btime Groebner.monom_product!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_lcm!\t\t")
+        @btime Groebner.monom_lcm!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+        end
+        print("Groebner.monom_is_gcd_const\t")
+        @btime Groebner.monom_is_gcd_const(xx, yy) setup = begin
+            cc, xx, yy = _setup3(Int8, $n)
+        end
+        print("Groebner.monom_isless:lex\t")
+        @btime Groebner.monom_isless(xx, yy, _ord) setup = begin
+            cc, xx, yy = _setup3(Int8, max(1, $n))
+            _ord = Groebner._Lex{true}(ones(Int, length(xx)))
+        end
+        print("Groebner.monom_isless:drl\t")
+        @btime Groebner.monom_isless(xx, yy, _ord) setup = begin
+            cc, xx, yy = map(reverse, _setup3(Int8, max(1, $n)))
+            xx[1] = xx[end]
+            yy[1] = yy[end]
+            @assert Groebner.monom_totaldeg(xx) == Groebner.monom_totaldeg(yy)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(xx)))
+        end
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
+
+_setup4(n) = begin
+    s = rand(UInt8.([0, 0, 0, 1, 2, 3]), 2)
+    a, b = vcat(zeros(UInt8, n), s), vcat(zeros(UInt8, n), reverse(s))
+    a, b = reverse(a), reverse(b)
+    x = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, a)
+    y = Groebner.monom_construct_from_vector(Groebner.ExponentVector{UInt8}, b)
+    vT(n) =
+        if n + 2 < 8
+            Groebner.PackedTuple1
+        elseif n + 2 < 16
+            Groebner.PackedTuple2
+        elseif n + 2 < 24
+            Groebner.PackedTuple3
+        end
+    xpacked = Groebner.monom_construct_from_vector(vT(n){UInt64, UInt8}, a)
+    ypacked = Groebner.monom_construct_from_vector(vT(n){UInt64, UInt8}, b)
+    x, y, xpacked, ypacked
+end
+begin
+    n, step = 1, 3
+    while n < 22
+        @info "n = $n"
+        print("Groebner.monom_isless:drl:packed\t")
+        @btime Groebner.monom_isless(xpacked, ypacked, _ord) setup = begin
+            x, y, xpacked, ypacked = _setup4($n)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(x)))
+            @assert Groebner.monom_totaldeg(xpacked) == Groebner.monom_totaldeg(ypacked)
+            tmp1, tmp2 =
+                Vector{Int8}(undef, length(x) - 1), Vector{Int8}(undef, length(x) - 1)
+            @assert Groebner.monom_to_vector!(tmp1, x) ==
+                    Groebner.monom_to_vector!(tmp2, xpacked)
+            @assert Groebner.monom_to_vector!(tmp1, y) ==
+                    Groebner.monom_to_vector!(tmp2, ypacked)
+        end
+        print("Groebner.monom_isless:drl:expvect\t")
+        @btime Groebner.monom_isless(x, y, _ord) setup = begin
+            x, y, xpacked, ypacked = _setup4($n)
+            _ord = Groebner._DegRevLex{true}(ones(Int, length(x)))
+            @assert Groebner.monom_totaldeg(x) == Groebner.monom_totaldeg(y)
+        end
+        n += step
+    end
+end
+
+begin
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        for _ in 1:1_000
+            x, y = _setup3(n)
+            res1 = vector_are_orth(x, y)
+            res2 = _vec_check_orth(x, y)
+            @assert res1 == res2
+        end
+        @btime vector_are_orth(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        @btime _vec_check_orth(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        n += step
+        step = ceil(Int, step * 1.2)
+    end
+end
+
+begin
+    _setup1(n) = begin
+        x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        x, y
+    end
+    _setup2(n) = begin
+        x = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+        y = vcat(zeros(UInt8, n), rand(UInt8.([0, 0, 0, 1, 2, 3]), n))
+        x, y
+    end
+    _setup3(n) = begin
+        x = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        y = vcat(zeros(Int16, n), rand(Int16.([0, 0, 0, 1, 2, 3]), n))
+        x, y
+    end
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        n += step
+        step = ceil(Int, step * 1.2)
+        for _ in 1:100
+            x, y = _setup3(n)
+            res1 = vector_any_lt(x, y)
+            res2 = vector_any_lt_simd(x, y)
+            @assert res1 == res2
+        end
+        @btime vector_any_lt(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+        @btime vector_any_lt_simd(xx, yy) setup = begin
+            xx, yy = _setup3($n)
+        end
+    end
+end
+
+#########
+
+begin
+    _setup1(n) = begin
+        x = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        y = rand(UInt8.([0, 0, 0, 1, 2, 3]), n)
+        similar(x), x, y
+    end
+    n, step = 1, 5
+    while n < 500
+        @info "n = $n"
+        n += step
+        step = ceil(Int, step * 1.2)
+        for _ in 1:100
+            c, x, y = _setup1(n)
+            res1 = vector_emax_1!(copy(c), x, y)
+            res2 = vector_emax_2!(copy(c), x, y)
+            @assert res1 == res2
+        end
+        @btime vector_emax_1!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup1($n)
+        end
+        @btime vector_emax_2!(cc, xx, yy) setup = begin
+            cc, xx, yy = _setup1($n)
+        end
+    end
+end
diff --git a/experimental/native_vec4-assume.txt b/experimental/native_vec4-assume.txt
new file mode 100644
index 00000000..47865572
--- /dev/null
+++ b/experimental/native_vec4-assume.txt
@@ -0,0 +1,93 @@
+	.text
+	.file	"mod_p"
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3                               # -- Begin function julia_mod_p_61774
+.LCPI0_0:
+	.quad	-9223372036854775808            # 0x8000000000000000
+	.text
+	.globl	julia_mod_p_61774
+	.p2align	4, 0x90
+	.type	julia_mod_p_61774,@function
+julia_mod_p_61774:                      # @julia_mod_p_61774
+	.cfi_startproc
+# %bb.0:                                # %top
+	push	rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset rbp, -16
+	mov	rbp, rsp
+	.cfi_def_cfa_register rbp
+	push	rsi
+	push	rdi
+	.cfi_offset rdi, -32
+	.cfi_offset rsi, -24
+	mov	r9, rdx
+	mov	r10, rcx
+	vpmovsxbq	ymm2, dword ptr [r8 + 96]
+	vpmovzxbq	ymm1, dword ptr [r8 + 100] # ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+	mov	r11, qword ptr [r8 + 80]
+	mov	rcx, qword ptr [r8 + 72]
+	mov	rax, qword ptr [r8 + 64]
+	imul	qword ptr [rdx]
+	mov	rdi, rdx
+	mov	rax, rcx
+	imul	qword ptr [r9 + 8]
+	mov	rcx, rdx
+	mov	rax, r11
+	imul	qword ptr [r9 + 16]
+	mov	rsi, rdx
+	vmovdqu	ymm0, ymmword ptr [r9]
+	mov	rax, qword ptr [r8 + 88]
+	imul	qword ptr [r9 + 24]
+	vmovq	xmm3, rdx
+	vmovq	xmm4, rsi
+	vpunpcklqdq	xmm3, xmm4, xmm3        # xmm3 = xmm4[0],xmm3[0]
+	vmovq	xmm4, rcx
+	vmovq	xmm5, rdi
+	vpunpcklqdq	xmm4, xmm5, xmm4        # xmm4 = xmm5[0],xmm4[0]
+	vinserti128	ymm3, ymm4, xmm3, 1
+	vpsrlq	ymm4, ymm0, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm5, ymm2, 32
+	vpmuludq	ymm5, ymm0, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm0, ymm2
+	vpaddq	ymm2, ymm2, ymm3
+	vpaddq	ymm2, ymm4, ymm2
+	vpxor	xmm3, xmm3, xmm3
+	vpsrlvq	ymm4, ymm2, ymm1
+	movabs	rax, offset .LCPI0_0
+	vpbroadcastq	ymm5, qword ptr [rax]
+	vpsrlvq	ymm1, ymm5, ymm1
+	vpxor	ymm4, ymm4, ymm1
+	vpsubq	ymm1, ymm4, ymm1
+	vpsrlq	ymm2, ymm2, 63
+	vpaddq	ymm1, ymm1, ymm2
+	vmovdqu	ymm2, ymmword ptr [r8]
+	vpsrlq	ymm4, ymm2, 32
+	vpmuludq	ymm4, ymm1, ymm4
+	vpsrlq	ymm5, ymm1, 32
+	vpmuludq	ymm5, ymm5, ymm2
+	vpaddq	ymm4, ymm4, ymm5
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm1, ymm1, ymm2
+	vpaddq	ymm1, ymm1, ymm4
+	vpsubq	ymm0, ymm0, ymm1
+	vpcmpgtq	ymm1, ymm3, ymm0
+	vpand	ymm1, ymm1, ymm2
+	vpaddq	ymm0, ymm1, ymm0
+	vpcmpgtq	ymm1, ymm0, ymm2
+	vpand	ymm1, ymm1, ymm2
+	vpsubq	ymm0, ymm0, ymm1
+	vmovdqu	ymmword ptr [r10], ymm0
+	mov	rax, r10
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end0:
+	.size	julia_mod_p_61774, .Lfunc_end0-julia_mod_p_61774
+	.cfi_endproc
+                                        # -- End function
+	.section	".note.GNU-stack","",@progbits
diff --git a/experimental/native_vec4.txt b/experimental/native_vec4.txt
new file mode 100644
index 00000000..9ca239cb
--- /dev/null
+++ b/experimental/native_vec4.txt
@@ -0,0 +1,93 @@
+	.text
+	.file	"mod_p"
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3                               # -- Begin function julia_mod_p_67625
+.LCPI0_0:
+	.quad	-9223372036854775808            # 0x8000000000000000
+	.text
+	.globl	julia_mod_p_67625
+	.p2align	4, 0x90
+	.type	julia_mod_p_67625,@function
+julia_mod_p_67625:                      # @julia_mod_p_67625
+	.cfi_startproc
+# %bb.0:                                # %top
+	push	rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset rbp, -16
+	mov	rbp, rsp
+	.cfi_def_cfa_register rbp
+	push	rsi
+	push	rdi
+	.cfi_offset rdi, -32
+	.cfi_offset rsi, -24
+	mov	r9, rdx
+	mov	r10, rcx
+	vpmovsxbq	ymm2, dword ptr [r8 + 96]
+	vpmovzxbq	ymm1, dword ptr [r8 + 100] # ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+	mov	r11, qword ptr [r8 + 80]
+	mov	rcx, qword ptr [r8 + 72]
+	mov	rax, qword ptr [r8 + 64]
+	imul	qword ptr [rdx]
+	mov	rdi, rdx
+	mov	rax, rcx
+	imul	qword ptr [r9 + 8]
+	mov	rcx, rdx
+	mov	rax, r11
+	imul	qword ptr [r9 + 16]
+	mov	rsi, rdx
+	vmovdqu	ymm0, ymmword ptr [r9]
+	mov	rax, qword ptr [r8 + 88]
+	imul	qword ptr [r9 + 24]
+	vmovq	xmm3, rdx
+	vmovq	xmm4, rsi
+	vpunpcklqdq	xmm3, xmm4, xmm3        # xmm3 = xmm4[0],xmm3[0]
+	vmovq	xmm4, rcx
+	vmovq	xmm5, rdi
+	vpunpcklqdq	xmm4, xmm5, xmm4        # xmm4 = xmm5[0],xmm4[0]
+	vinserti128	ymm3, ymm4, xmm3, 1
+	vpsrlq	ymm4, ymm0, 32
+	vpmuludq	ymm4, ymm4, ymm2
+	vpsrlq	ymm5, ymm2, 32
+	vpmuludq	ymm5, ymm0, ymm5
+	vpaddq	ymm4, ymm5, ymm4
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm2, ymm0, ymm2
+	vpaddq	ymm2, ymm2, ymm3
+	vpaddq	ymm2, ymm4, ymm2
+	vpxor	xmm3, xmm3, xmm3
+	vpsrlvq	ymm4, ymm2, ymm1
+	movabs	rax, offset .LCPI0_0
+	vpbroadcastq	ymm5, qword ptr [rax]
+	vpsrlvq	ymm1, ymm5, ymm1
+	vpxor	ymm4, ymm4, ymm1
+	vpsubq	ymm1, ymm4, ymm1
+	vpsrlq	ymm2, ymm2, 63
+	vpaddq	ymm1, ymm1, ymm2
+	vmovdqu	ymm2, ymmword ptr [r8]
+	vpsrlq	ymm4, ymm2, 32
+	vpmuludq	ymm4, ymm1, ymm4
+	vpsrlq	ymm5, ymm1, 32
+	vpmuludq	ymm5, ymm5, ymm2
+	vpaddq	ymm4, ymm4, ymm5
+	vpsllq	ymm4, ymm4, 32
+	vpmuludq	ymm1, ymm1, ymm2
+	vpaddq	ymm1, ymm1, ymm4
+	vpsubq	ymm0, ymm0, ymm1
+	vpcmpgtq	ymm1, ymm3, ymm0
+	vpand	ymm1, ymm1, ymm2
+	vpaddq	ymm0, ymm1, ymm0
+	vpcmpgtq	ymm1, ymm0, ymm2
+	vpand	ymm1, ymm1, ymm2
+	vpsubq	ymm0, ymm0, ymm1
+	vmovdqu	ymmword ptr [r10], ymm0
+	mov	rax, r10
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	vzeroupper
+	ret
+.Lfunc_end0:
+	.size	julia_mod_p_67625, .Lfunc_end0-julia_mod_p_67625
+	.cfi_endproc
+                                        # -- End function
+	.section	".note.GNU-stack","",@progbits
diff --git a/experimental/test_fglm.jl b/experimental/test_fglm.jl
new file mode 100644
index 00000000..bd8b80e1
--- /dev/null
+++ b/experimental/test_fglm.jl
@@ -0,0 +1,22 @@
+if !isdefined(Main, :Groebner)
+    using Groebner
+end
+using AbstractAlgebra
+
+sys = Groebner.noonn(8, k=GF(2^26 + 15))
+
+R, (x, y) = polynomial_ring(GF(2^26 + 15), ["x", "y"])
+sys = [x * y + 1, y^200 + 1]
+
+# gb_lex = Groebner.groebner(sys, ordering=Groebner.Lex())
+gb_drl = Groebner.groebner(sys, ordering=Groebner.DegRevLex());
+
+minpoly_last = Groebner.fglm_residuals_in_batch(
+    gb_drl,
+    Groebner.DegRevLex(),
+    Groebner.Lex(),
+    statistics=:timings,
+    loglevel=0
+)
+
+gb_fglm = Groebner.fglm(gb_drl, Groebner.DegRevLex(), Groebner.Lex(), statistics=:timings);
diff --git a/src/Groebner.jl b/src/Groebner.jl
index 19e6ded1..6481c725 100644
--- a/src/Groebner.jl
+++ b/src/Groebner.jl
@@ -1,7 +1,7 @@
-module Groebner
-# Groebner is a package for computing Gröbner bases. This is the main file.
-# Groebner is distributed under GNU GPL v2 starting from Groebner v0.6.0.
+# This file is a part of Groebner.jl. License is GNU GPL v2.
 
+# Groebner.jl is a package for computing Gröbner bases. This is the main file.
+module Groebner
 # Groebner works over integers modulo a prime and over the rationals. At its
 # heart, Groebner implements F4, multi-modular techniques, and tracing.
 
@@ -33,11 +33,11 @@ invariants_enabled() = false
     logging_enabled() -> Bool
 
 Specifies if logging is enabled. If `false`, then all logging in Groebner is
-disabled, and entails no runtime overhead.
+disabled, and entails **(almost)** no runtime overhead.
 
 See also `@log` in `src/utils/logging.jl`.
 """
-logging_enabled() = false
+logging_enabled() = true
 
 """
     performance_counters_enabled() -> Bool
@@ -45,9 +45,9 @@ logging_enabled() = false
 If performance-tracking macro `@timeit` should be enabled in Groebner. 
 
 When this is `false`, all performance counters in Groebner are disabled and
-entail no runtime overhead.
+entail **(almost)** no runtime overhead.
 """
-performance_counters_enabled() = false
+performance_counters_enabled() = true
 
 ###
 # Imports
@@ -70,6 +70,9 @@ import Combinatorics
 
 using ExprTools
 
+import HostCPUFeatures:
+    cpu_name, register_count, register_size, has_feature, pick_vector_width, fma_fast
+
 using Logging
 
 import MultivariatePolynomials
@@ -124,7 +127,8 @@ include("utils/invariants.jl")
 include("utils/timeit.jl")
 # Provides the macro `@stat` for collecting statistics
 include("utils/statistics.jl")
-
+# For fast and specific vector arithmetic
+include("utils/simd.jl")
 # include("utils/versioninfo.jl")
 
 # Minimalistic plotting with Unicode
diff --git a/src/arithmetic/Zp.jl b/src/arithmetic/Zp.jl
index 8882cc50..cb3f90c6 100644
--- a/src/arithmetic/Zp.jl
+++ b/src/arithmetic/Zp.jl
@@ -7,8 +7,8 @@
 # The arithmetic is parametrized by the type of the accumulator and the type of
 # the coefficient. Generally, AccumType should be picked so that the result of
 #   a + b*c
-# is representable exactly in AccumType for feasible  a,b,c of type CoeffType. 
-# One example is AccumType = UInt64 and CoeffType = UInt32 with prime moduli
+# is representable exactly in AccumType for feasible a,b,c of type CoeffType. 
+# One example is AccumType = UInt64 and CoeffType = UInt32 with prime moduli.
 abstract type AbstractArithmetic{AccumType, CoeffType} end
 
 # All implementations of arithmetic in Zp are a subtype of this.
@@ -48,7 +48,7 @@ end
 
 divisor(arithm::ArithmeticZp) = arithm.magic.divisor
 
-mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = a % arithm.magic
+@inline mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = a % arithm.magic
 
 inv_mod_p(a::T, arithm::ArithmeticZp{T}) where {T} = invmod(a, divisor(arithm))
 
@@ -86,7 +86,7 @@ struct SpecializedArithmeticZp{AccumType, CoeffType, Add} <:
         uinv::UnsignedMultiplicativeInverse{AccumType}
     ) where {AccumType <: CoeffZp, CoeffType <: CoeffZp}
         # Further in the code we need the guarantee that the shift is < 64
-        @invariant uinv.shift < 64
+        @invariant uinv.shift < 8 * sizeof(AccumType)
         new{AccumType, CoeffType, uinv.add}(uinv.multiplier, uinv.shift, uinv.divisor)
     end
 end
@@ -94,11 +94,11 @@ end
 divisor(arithm::SpecializedArithmeticZp) = arithm.divisor
 
 # Returns the higher half of the product a*b
-function _mul_high(a::T, b::T) where {T <: Union{Signed, Unsigned}}
+@inline function _mul_high(a::T, b::T) where {T <: Union{Signed, Unsigned}}
     ((widen(a) * b) >>> (sizeof(a) * 8)) % T
 end
 
-function _mul_high(a::UInt128, b::UInt128)
+@inline function _mul_high(a::UInt128, b::UInt128)
     shift = sizeof(a) * 4
     mask = typemax(UInt128) >> shift
     a1, a2 = a >>> shift, a & mask
@@ -111,18 +111,19 @@ end
 # NOTE: the compiler sometimes fails to simd this operation for T ≥ UInt32.
 #
 # a modulo p (addition specialization)
-function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, true}) where {T, C}
+@inline function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, true}) where {T, C}
     x = _mul_high(a, mod.multiplier)
     x = convert(T, convert(T, (convert(T, a - x) >>> UInt8(1))) + x)
-    # Bit shifts in Julia check that the shift is less than 64. 
-    # The assumption allows us to bypass this check.
-    unsafe_assume(mod.shift < 64)
+    # Bit shifts in Julia check that the shift value is less than the bitsize of
+    # the argument. 
+    # The assumption here allows us to bypass this check.
+    unsafe_assume(mod.shift < 8 * sizeof(T))
     a - (x >>> mod.shift) * mod.divisor
 end
 # a modulo p (no addition specialization)
-function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, false}) where {T, C}
+@inline function mod_p(a::T, mod::SpecializedArithmeticZp{T, C, false}) where {T, C}
     x = _mul_high(a, mod.multiplier)
-    unsafe_assume(mod.shift < 64)
+    unsafe_assume(mod.shift < 8 * sizeof(T))
     a - (x >>> mod.shift) * mod.divisor
 end
 
@@ -159,7 +160,7 @@ struct DelayedArithmeticZp{AccumType, CoeffType, Add} <:
         ::Type{CoeffType},
         uinv::UnsignedMultiplicativeInverse{AccumType}
     ) where {AccumType <: CoeffZp, CoeffType <: CoeffZp}
-        @invariant uinv.shift < 64
+        @invariant uinv.shift < 8 * sizeof(AccumType)
         new{AccumType, CoeffType, uinv.add}(uinv.multiplier, uinv.shift, uinv.divisor)
     end
 end
@@ -177,16 +178,16 @@ function n_safe_consecutive_additions(arithm::DelayedArithmeticZp{T}) where {T <
 end
 
 # a modulo p (addition specialization)
-function mod_p(a::A, mod::DelayedArithmeticZp{A, T, true}) where {A, T}
+@inline function mod_p(a::A, mod::DelayedArithmeticZp{A, T, true}) where {A, T}
     x = _mul_high(a, mod.multiplier)
     x = convert(A, convert(A, (convert(A, a - x) >>> UInt8(1))) + x)
-    unsafe_assume(mod.shift < 64)
+    unsafe_assume(mod.shift < 8 * sizeof(T))
     a - (x >>> mod.shift) * mod.divisor
 end
 # a modulo p (no addition specialization)
-function mod_p(a::A, mod::DelayedArithmeticZp{A, T, false}) where {A, T}
+@inline function mod_p(a::A, mod::DelayedArithmeticZp{A, T, false}) where {A, T}
     x = _mul_high(a, mod.multiplier)
-    unsafe_assume(mod.shift < 64)
+    unsafe_assume(mod.shift < 8 * sizeof(T))
     a - (x >>> mod.shift) * mod.divisor
 end
 
@@ -275,17 +276,17 @@ struct SignedArithmeticZp{AccumType, CoeffType} <:
         @invariant Primes.isprime(p)
         pa = convert(AccumType, p)
         magic = Base.MultiplicativeInverses.SignedMultiplicativeInverse(pa)
-        @invariant magic.shift < 64
+        @invariant magic.shift < 8 * sizeof(AccumType)
         new{AccumType, CoeffType}(pa, pa * pa, magic.multiplier, magic.addmul, magic.shift)
     end
 end
 
 divisor(arithm::SignedArithmeticZp) = arithm.p
 
-function mod_p(a::T, mod::SignedArithmeticZp{T}) where {T}
+@inline function mod_p(a::T, mod::SignedArithmeticZp{T}) where {T}
     x = _mul_high(a, mod.multiplier)
     x += a * mod.addmul
-    unsafe_assume(mod.shift < 64)
+    unsafe_assume(mod.shift < 8 * sizeof(T))
     d = (signbit(x) + (x >> mod.shift)) % T
     res = a - d * mod.p
     ifelse(res >= zero(T), res, res + mod.p)
@@ -317,7 +318,7 @@ struct SignedCompositeArithmeticZp{AccumType, CoeffType, T, N} <:
         multipliers = map(a -> a.multiplier, arithms)
         addmuls = map(a -> a.addmul, arithms)
         shifts = map(a -> a.shift, arithms)
-        @invariant all(x -> x < 64, shifts)
+        @invariant all(x -> x < 8 * sizeof(AT), shifts)
         p2s = CompositeInt{N, AT}(ps) * CompositeInt{N, AT}(ps)
         new{CompositeInt{N, AT}, CompositeInt{N, CT}, AT, N}(
             CompositeInt{N, AT}(ps),
@@ -331,13 +332,13 @@ end
 
 divisor(arithm::SignedCompositeArithmeticZp) = arithm.ps
 
-function mod_p(
+@inline function mod_p(
     a::CompositeInt{N, T},
     arithm::SignedCompositeArithmeticZp{CompositeInt{N, T}, U, W, N}
 ) where {N, T, U, W}
     x = _mul_high.(a.data, arithm.multipliers.data)
     x = x .+ a.data .* arithm.addmuls.data
-    # unsafe_assume(all(x -> x < 64, arithm.shifts.data))
+    unsafe_assume(all(arithm.shifts.data .< (8 * sizeof(T))))
     d = (signbit.(x) .+ (x .>> arithm.shifts.data)) .% T
     res = a.data .- d .* arithm.ps.data
     res = ifelse.(res .>= T(0), res, res .+ arithm.ps.data)
diff --git a/src/f4/f4.jl b/src/f4/f4.jl
index 76ff3fe5..0ea94093 100644
--- a/src/f4/f4.jl
+++ b/src/f4/f4.jl
@@ -99,6 +99,7 @@ end
     # make sure the pairset has enough space
     pairset_resize_if_needed!(pairset, npairs)
     pairset_size = length(pairset.pairs)
+    
 
     # update pairset:
     # for each new element in basis..
diff --git a/src/f4/hashtable.jl b/src/f4/hashtable.jl
index ef67c495..7a2a54d0 100644
--- a/src/f4/hashtable.jl
+++ b/src/f4/hashtable.jl
@@ -128,7 +128,7 @@ function hashtable_initialize(
     dmbits = 8 * sizeof(DivisionMask)
     compress_divmask = nvars > dmbits
     @assert dmbits == 32
-    @log level = -5 "Using division masks: $use_divmask of $dmbits bits. Compressed: $compress_divmask"
+    @log level = -5 "Using division masks: $use_divmask. Using $dmbits bits. Compressed: $compress_divmask"
     ndivbits = div(dmbits, nvars)
     # division mask stores at least 1 bit
     # per each of first ndivvars variables
diff --git a/src/f4/linalg/backend.jl b/src/f4/linalg/backend.jl
index a96271cf..78cdc4c4 100644
--- a/src/f4/linalg/backend.jl
+++ b/src/f4/linalg/backend.jl
@@ -819,14 +819,14 @@ function linalg_normalize_row!(
     arithmetic::AbstractArithmeticZp{A, T},
     first_nnz_index::Int=1
 ) where {A <: Union{CoeffZp, CompositeCoeffZp}, T <: Union{CoeffZp, CompositeCoeffZp}}
-    @invariant @inbounds !iszero(row[first_nnz_index])
+    @invariant !iszero(row[first_nnz_index])
 
     lead = row[first_nnz_index]
     isone(lead) && return row
 
     @inbounds pinv = inv_mod_p(A(lead), arithmetic) % T
-    @inbounds row[1] = one(T)
-    @inbounds for i in 2:length(row)
+    @inbounds row[first_nnz_index] = one(T)
+    @inbounds for i in (first_nnz_index + 1):length(row)
         row[i] = mod_p(A(row[i]) * A(pinv), arithmetic) % T
     end
 
@@ -839,7 +839,7 @@ function linalg_normalize_row!(
     arithmetic::AbstractArithmeticQQ{T},
     first_nnz_index::Int=1
 ) where {T <: CoeffQQ}
-    @invariant @inbounds !iszero(row[first_nnz_index])
+    @invariant !iszero(row[first_nnz_index])
 
     lead = row[first_nnz_index]
     isone(lead) && return row
@@ -865,8 +865,6 @@ function linalg_vector_addmul_sparsedense_mod_p!(
     @invariant length(indices) == length(coeffs)
     @invariant !isempty(indices)
 
-    # unsafe_assume(!isempty(indices))
-
     @inbounds mul = divisor(arithmetic) - row[indices[1]]
     @inbounds for j in 1:length(indices)
         idx = indices[j]
@@ -890,8 +888,6 @@ function linalg_vector_addmul_sparsedense!(
     @invariant length(indices) == length(coeffs)
     @invariant !isempty(indices)
 
-    # unsafe_assume(!isempty(indices))
-
     @inbounds mul = divisor(arithmetic) - row[indices[1]]
     @inbounds for j in 1:length(indices)
         idx = indices[j]
@@ -915,8 +911,6 @@ function linalg_vector_addmul_sparsedense!(
     @invariant length(indices) == length(coeffs)
     @invariant !isempty(indices)
 
-    # unsafe_assume(!isempty(indices))
-
     # NOTE: mul is guaranteed to be < typemax(T)
     p2 = arithmetic.p2
     @inbounds mul = row[indices[1]] % T
@@ -943,8 +937,6 @@ function linalg_vector_addmul_sparsedense!(
     @invariant length(indices) == length(coeffs)
     @invariant !isempty(indices)
 
-    # unsafe_assume(!isempty(indices))
-
     # NOTE: mul is guaranteed to be < typemax(T)
     p2 = arithmetic.p2s
     @inbounds mul = row[indices[1]].data .% T
@@ -1108,21 +1100,3 @@ function linalg_extract_sparse_row!(
 
     j - 1
 end
-
-# TODO: not really important atm
-# function extract_sparse_row_2!(vals::Vector{UInt}, dense::Vector{UInt})
-#     j = 1
-#     z = Vec{4, UInt}((0, 0, 0, 0))
-#     @inbounds for i in 1:8:length(dense)
-#         v1 = SIMD.vload(Vec{4, UInt}, dense, i)
-#         v2 = SIMD.vload(Vec{4, UInt}, dense, i + 4)
-#         mask1 = !iszero(v1)
-#         mask2 = !iszero(v2)
-#         j1 = sum(mask1)
-#         j2 = sum(mask2)
-#         vstorec(v1, vals, j, mask1)
-#         vstorec(v2, vals, j + j1, mask2)
-#         j += j1 + j2
-#     end
-#     nothing
-# end
diff --git a/src/fglm/fglm.jl b/src/fglm/fglm.jl
index 100aff82..a8fb4c5b 100644
--- a/src/fglm/fglm.jl
+++ b/src/fglm/fglm.jl
@@ -48,3 +48,57 @@ function _fglm1(
     new_basis, _, new_ht = fglm_main!(ring, basis, ht, ordering_to, params)
     basis_export_data(new_basis, new_ht)
 end
+
+###
+###
+
+function _fglm_residuals_in_batch(polynomials, ordering_from, ordering_to, kws)
+    ordering_from == ordering_to && return polynomials
+
+    representation =
+        io_select_polynomial_representation(polynomials, kws, hint=:large_exponents)
+    ring, var_to_index, monoms, coeffs =
+        io_convert_to_internal(representation, polynomials, kws)
+    params =
+        AlgorithmParameters(ring, representation, kws, orderings=(ring.ord, ordering_from))
+    if isempty(monoms)
+        @log level = -2 "Input consisting of zero polynomials."
+        throw(DomainError("Input consisting of zero polynomials to Groebner.fglm."))
+        return io_convert_to_output(ring, polynomials, monoms, coeffs, params)
+    end
+    if kws.check
+        @log level = -2 "Checking if input is a Grobner basis"
+        # TODO this is, perhaps, broken!
+        if !isgroebner(polynomials, ordering=ordering_from, certify=false)
+            throw(DomainError("Input is not a Groebner basis."))
+        end
+    end
+
+    ring, _ = io_set_monomial_ordering!(ring, var_to_index, monoms, coeffs, params)
+    params = AlgorithmParameters(
+        ring,
+        representation,
+        kws,
+        orderings=(ordering_from, ordering_to)
+    )
+
+    _ordering_to = io_convert_to_internal_monomial_ordering(var_to_index, ordering_to)
+    new_monoms, new_coeffs =
+        _fglm_residuals_in_batch_1(ring, monoms, coeffs, _ordering_to, params)
+
+    res = io_convert_to_output(ring, polynomials, new_monoms, new_coeffs, params)
+
+    res
+end
+
+@timeit function _fglm_residuals_in_batch_1(
+    ring::PolyRing,
+    monoms::Vector{Vector{M}},
+    coeffs::Vector{Vector{C}},
+    ordering_to,
+    params::AlgorithmParameters
+) where {M <: Monom, C <: Coeff}
+    basis, _, ht = f4_initialize_structs(ring, monoms, coeffs, params)
+    new_basis, _, new_ht = _fglm_residuals_in_batch!(ring, basis, ht, ordering_to, params)
+    basis_export_data(new_basis, new_ht)
+end
diff --git a/src/fglm/fglm_internal.jl b/src/fglm/fglm_internal.jl
index fcefc4d9..b7827eae 100644
--- a/src/fglm/fglm_internal.jl
+++ b/src/fglm/fglm_internal.jl
@@ -12,9 +12,10 @@ end
 Base.isempty(m::MonomialEnumerator) = m.load == 0
 
 function enumerator_initialize(ht::MonomialHashtable{M}, ord) where {M}
+    hashtable_resize_if_needed!(ht, 1 << 6)
     zz = monom_construct_const_monom(M, ht.nvars)
     vidx = hashtable_insert!(ht, zz)
-    monoms = Vector{MonomId}(undef, 2^3)
+    monoms = Vector{MonomId}(undef, 1 << 3)
     monoms[1] = vidx
     load = 1
     MonomialEnumerator{typeof(ord)}(monoms, load, Dict{MonomId, Int}(vidx => 1), ord)
@@ -26,10 +27,9 @@ function enumerator_produce_next_monomials!(
     monom::MonomId
 ) where {M}
     while m.load + ht.nvars >= length(m.monoms)
-        resize!(m.monoms, length(m.monoms) * 2)
+        resize!(m.monoms, max(ht.nvars * 2, length(m.monoms) * 2))
     end
-    # TODO: maybe an error, uncomment this line later
-    # hashtable_resize_if_needed!(ht, ht.nvars)
+    hashtable_resize_if_needed!(ht, ht.nvars)
 
     emonom = ht.monoms[monom]
     @inbounds for i in 1:(ht.nvars)
@@ -51,6 +51,7 @@ function enumerator_produce_next_monomials!(
 end
 
 function enumerator_next_monomial!(m::MonomialEnumerator)
+    @assert m.load > 0
     monom = m.monoms[m.load]
     m.load -= 1
     monom
@@ -82,9 +83,7 @@ function staircase_divides_monom(monom, staircase, ht)
     false
 end
 
-const _seen = Dict()
-
-function fglm_main!(
+@timeit function fglm_main!(
     ring::PolyRing,
     basis::Basis{C},
     ht::MonomialHashtable{M},
@@ -96,7 +95,6 @@ function fglm_main!(
         from ordering: $(ring.ord)
         to ordering:   $ord"""
 
-    empty!(_seen)
     monom_enumerator = enumerator_initialize(ht, ord)
     new_basis = basis_initialize(ring, basis.nfilled, C)
     matrix = wide_matrix_initialize(basis)
@@ -115,20 +113,16 @@ function fglm_main!(
             continue
         end
 
-        if haskey(_seen, ht.monoms[monom])
-            throw("This monomial has been processed before !")
-        end
-        _seen[ht.monoms[monom]] = 1
         # Compute the normal form of the monomial w.r.t. by constructing and
         # echelonizing the F4 matrix.
-        to_be_reduced = basis_initialize(ring, [[monom]], [C[1]])
+        to_be_reduced = basis_initialize(ring, [[monom]], [[C(1)]])
         to_be_reduced.nfilled = 1
         # ! This call takes most the time
         f4_normalform!(ring, basis, to_be_reduced, ht, params.arithmetic)
 
         # Search for a linear relation between all the computed normal forms
         relation_exists, relation =
-            find_linear_relation!(ring, matrix, monom, to_be_reduced, ht, params.arithmetic)
+            find_linear_relation!(matrix, monom, to_be_reduced, params.arithmetic)
 
         if relation_exists
             # Add the monomial to the staircase, and add the pre-image of the
@@ -145,3 +139,92 @@ function fglm_main!(
     linbasis = extract_linear_basis(ring, matrix)
     new_basis, linbasis, ht
 end
+
+const _batchsize = Ref{Int}(100)
+
+function _fglm_residuals_in_batch!(
+    ring::PolyRing,
+    basis::Basis{C},
+    ht::MonomialHashtable{M},
+    ord::AbstractInternalOrdering,
+    params::AlgorithmParameters
+) where {M <: Monom, C <: Coeff}
+    @log level = -2 """
+        Applying FGLM to convert a basis
+        from ordering: $(ring.ord)
+        to ordering:   $ord"""
+
+    new_basis = basis_initialize(ring, basis.nfilled, C)
+    matrix = wide_matrix_initialize(basis)
+    monom_staircase = Vector{MonomId}()
+
+    I = 0
+
+    # For "each" monomial, from the smallest to the largest
+    while true
+        # Get the next monomial
+        mons, coefs = Vector{Vector{MonomId}}(), Vector{Vector{C}}()
+
+        filled = 0
+
+        for i in 1:_batchsize[]
+            ev = zeros(UInt32, ht.nvars)
+            ev[end] = I
+            I += 1
+            # print("$I, ")
+            monom_ev = monom_construct_from_vector(M, ev)
+            monom = hashtable_insert!(ht, monom_ev)
+
+            # If the monomial is divisible by any of the elements of the existing
+            # staircase, then throw the monomial away
+            if staircase_divides_monom(monom, monom_staircase, ht)
+                @log level = -4 "Monomial discarded, id = $monom"
+                continue
+            end
+
+            push!(mons, [monom])
+            push!(coefs, [C(1)])
+            filled += 1
+        end
+        # println()
+        iszero(filled) && continue
+        mons_copy = deepcopy(mons)
+        to_be_reduced = basis_initialize(ring, mons, coefs)
+        to_be_reduced.nfilled = filled
+
+        # ! This call takes most the time
+        f4_normalform!(ring, basis, to_be_reduced, ht, params.arithmetic)
+
+        # Search for a linear relation between all the computed normal forms
+
+        for i in 1:filled
+            _to_be_reduced =
+                basis_initialize(ring, [to_be_reduced.monoms[i]], [to_be_reduced.coeffs[i]])
+            _to_be_reduced.nfilled = 1
+            relation_exists, relation = find_linear_relation!(
+                matrix,
+                mons_copy[i][1],
+                _to_be_reduced,
+                params.arithmetic
+            )
+
+            # println("i=$i -- monom=$(ht.monoms[mons_copy[i][1]]) -- $relation_exists")
+            # println([to_be_reduced.monoms[i]], [to_be_reduced.coeffs[i]])
+
+            if relation_exists
+                # Add the monomial to the staircase, and add the pre-image of the
+                # relation to the new basis
+                push!(monom_staircase, mons_copy[i][1])
+                basis_add_generator!(new_basis, matrix, relation, ht, ord)
+                @goto End
+            end
+        end
+    end
+
+    @label End
+
+    basis_standardize!(ring, new_basis, ht, ord, params.arithmetic)
+
+    linbasis = extract_linear_basis(ring, matrix)
+    new_basis, linbasis, ht
+end
diff --git a/src/fglm/linear.jl b/src/fglm/linear.jl
index 32f7141b..eca2d398 100644
--- a/src/fglm/linear.jl
+++ b/src/fglm/linear.jl
@@ -1,18 +1,16 @@
 # This file is a part of Groebner.jl. License is GNU GPL v2.
 
+# Left part -- linear combinations of residuals (normal forms)
+# Right part -- linear combinations of monomials (preimages)
 mutable struct WideMacaulayMatrix{C}
-    # pivot -> row
     leftpivs::Vector{Vector{Int}}
     leftcoeffs::Vector{Vector{C}}
 
-    # idx -> row
     rightrows::Vector{Vector{Int}}
     rightcoeffs::Vector{Vector{C}}
 
-    # pivot -> idx
     pivot2idx::Vector{Int}
 
-    #
     lefthash2col::Dict{Int, Int}
     leftcolumn_to_monom::Vector{Int}
 
@@ -70,7 +68,7 @@ function wide_matrix_initialize(basis::Basis{C}) where {C <: Coeff}
     )
 end
 
-function convert_to_wide_dense_row(matrix, monom, vector::Basis{C}, ht) where {C <: Coeff}
+function convert_to_wide_dense_row(matrix, monom, vector::Basis{C}) where {C <: Coeff}
     if matrix.nrcols >= length(matrix.rightcolumn_to_monom)
         resize!(matrix.rightcolumn_to_monom, 2 * length(matrix.rightcolumn_to_monom))
     end
@@ -113,9 +111,6 @@ function reduce_by_pivot_simultaneous!(
     rightcfs,
     arithmetic
 ) where {T <: CoeffZp}
-
-    # mul = -densecoeffs[i]
-    # actually.. not bad!
     mul = mod_p(divisor(arithmetic) - leftrow[leftexps[1]], arithmetic)
 
     @inbounds for j in 1:length(leftexps)
@@ -131,8 +126,6 @@ function reduce_by_pivot_simultaneous!(
     mul
 end
 
-#
-# Finite field magic specialization
 function normalize_wide_row_sparse!(
     leftcfs::Vector{T},
     rightcfs,
@@ -149,8 +142,6 @@ function normalize_wide_row_sparse!(
     end
 end
 
-#
-# Finite field magic specialization
 function normalize_wide_row_sparse!(
     leftcfs::Vector{T},
     rightcfs,
@@ -158,7 +149,6 @@ function normalize_wide_row_sparse!(
 ) where {T <: CoeffQQ}
     pinv = inv(leftcfs[1])
     @inbounds for i in 2:length(leftcfs)
-        # row[i] *= pinv
         leftcfs[i] = leftcfs[i] * pinv
     end
     @inbounds leftcfs[1] = one(leftcfs[1])
@@ -180,10 +170,6 @@ function reduce_by_pivot_simultaneous!(
     rightcfs,
     arithmetic
 ) where {T <: CoeffQQ}
-
-    # mul = -densecoeffs[i]
-    # actually.. not bad!
-
     mul = -leftrow[leftexps[1]]
 
     @inbounds for j in 1:length(leftexps)
@@ -217,11 +203,6 @@ function reduce_wide_dense_row_by_known_pivots_sparse!(
     # new pivot index
     np = -1
 
-    # if debug()
-    #     # @warn "in reduce" matrix.nlcols matrix.nrcols matrix.leftpivs
-    #     # @warn "hmm" leftrow
-    # end
-
     for i in 1:(matrix.nlcols)
 
         # if row element zero - no reduction
@@ -246,7 +227,7 @@ function reduce_wide_dense_row_by_known_pivots_sparse!(
         rightexps = rightrows[pivot2idx[i]]
         rightcfs  = matrix.rightcoeffs[pivot2idx[i]]
 
-        mul = reduce_by_pivot_simultaneous!(
+        reduce_by_pivot_simultaneous!(
             leftrow,
             leftexps,
             leftcfs,
@@ -275,7 +256,7 @@ function extract_sparse_row(row::Vector{C}, np, k) where {C}
     # where k - number of structural nonzeros in new reduced row, k > 0
     j = 1
     @inbounds for i in np:length(row) # from new pivot
-        @inbounds if row[i] != 0
+        if row[i] != 0
             newrow[j] = i
             newcfs[j] = row[i]
             j += 1
@@ -285,35 +266,19 @@ function extract_sparse_row(row::Vector{C}, np, k) where {C}
     newrow, newcfs, j - 1
 end
 
-function find_linear_relation!(
-    ring,
+const _sparisty_factors = []
+
+@timeit function find_linear_relation!(
     matrix::WideMacaulayMatrix,
     monom::MonomId,
     vector::Basis{C},
-    ht,
     arithmetic
 ) where {C <: Coeff}
-    leftrow, rightrow = convert_to_wide_dense_row(matrix, monom, vector, ht)
-
-    # if debug()
-    #     # @warn "start"
-    #     println(monom)
-    #     println(vector.monoms, " ", vector.coeffs)
-    #     println(leftrow)
-    #     println(rightrow)
-    #     println(matrix)
-    # end
+    leftrow, rightrow = convert_to_wide_dense_row(matrix, monom, vector)
 
     reduced, np, k =
         reduce_wide_dense_row_by_known_pivots_sparse!(matrix, leftrow, rightrow, arithmetic)
 
-    # if debug()
-    #     # @warn "reduced"
-    #     println(reduced, " ", np, " ", k)
-    #     println(leftrow)
-    #     println(rightrow)
-    # end
-
     if reduced
         # pass
     else
@@ -322,12 +287,6 @@ function find_linear_relation!(
 
         normalize_wide_row_sparse!(lcoeffs, rcoeffs, arithmetic)
 
-        # if debug()
-        #     # @warn "extracted"
-        #     println(lexps, " ", lcoeffs)
-        #     println(rexps, " ", rcoeffs)
-        # end
-
         while np >= matrix.nlsize
             matrix.nlsize *= 2
             resize!(matrix.leftpivs, matrix.nlsize)
@@ -349,6 +308,39 @@ function find_linear_relation!(
         matrix.rightcoeffs[matrix.nrrows] = rcoeffs
     end
 
+    nnz_left = 0
+    nnz_left_rows = 0
+    for i in 1:(matrix.nlsize)
+        if isassigned(matrix.leftpivs, i)
+            nnz_left += length(matrix.leftpivs[i])
+            nnz_left_rows += 1
+        end
+    end
+    nnz_right = 0
+    nnz_right_rows = 0
+    for i in 1:(matrix.nrsize)
+        if isassigned(matrix.rightrows, i)
+            nnz_right += length(matrix.rightrows[i])
+            nnz_right_rows += 1
+        end
+    end
+    dimsleft = (nnz_left_rows, matrix.nlcols)
+    dimsright = (nnz_right_rows, matrix.nrcols)
+    spleft = nnz_left / prod(dimsleft)
+    spright = nnz_right / prod(dimsright)
+
+    push!(
+        _sparisty_factors,
+        (
+            nnz_left=nnz_left,
+            dimsleft=dimsleft,
+            spleft=spleft,
+            nnz_right=nnz_right,
+            dimsright=dimsright,
+            spright=spright
+        )
+    )
+
     return reduced, rightrow
 end
 
diff --git a/src/input-output/input-output.jl b/src/input-output/input-output.jl
index 3216e7ff..cdd1c44b 100644
--- a/src/input-output/input-output.jl
+++ b/src/input-output/input-output.jl
@@ -222,7 +222,15 @@ function io_get_tight_unsigned_int_type(x::T) where {T <: Integer}
     end
 end
 
-function io_select_coefftype(char, npolys, nvars, ordering, kws, hint)
+function io_select_coefftype(
+    char,
+    npolys,
+    nvars,
+    ordering,
+    kws,
+    hint;
+    using_wide_type_for_coeffs=false
+)
     @log level = -1 """
     Selecting coefficient representation.
     Given hint hint=$hint. Keyword argument arithmetic=$(kws.arithmetic)"""
@@ -239,7 +247,6 @@ function io_select_coefftype(char, npolys, nvars, ordering, kws, hint)
         )
     end
 
-    using_wide_type_for_coeffs = true
     tight_signed_type = io_get_tight_signed_int_type(char)
 
     # If the requested arithmetic requires a signed representation
@@ -256,11 +263,13 @@ function io_select_coefftype(char, npolys, nvars, ordering, kws, hint)
     end
 
     tight_unsigned_type = io_get_tight_unsigned_int_type(char)
-    if !using_wide_type_for_coeffs
+    tight_unsigned_type = if !using_wide_type_for_coeffs
         tight_unsigned_type
     else
         widen(tight_unsigned_type)
-    end, using_wide_type_for_coeffs
+    end
+
+    tight_unsigned_type, using_wide_type_for_coeffs
 end
 
 ###
diff --git a/src/interface.jl b/src/interface.jl
index 41487a07..7ac1cd54 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -497,3 +497,30 @@ function fglm(
 
     result
 end
+
+"""
+    fglm(basis, ordering_from, ordering_to; options...)
+
+Converts a Groebner basis from one monomial ordering to another.
+"""
+function fglm_residuals_in_batch(
+    basis::AbstractVector,
+    ordering_from::AbstractMonomialOrdering,
+    ordering_to::AbstractMonomialOrdering;
+    options...
+)
+    Base.require_one_based_indexing(basis)
+
+    keywords = KeywordsHandler(:fglm, options)
+
+    logging_setup(keywords)
+    statistics_setup(keywords)
+
+    result =
+        _fglm_residuals_in_batch(basis, ordering_from, ordering_to, keywords)::typeof(basis)
+
+    performance_counters_print(keywords)
+    statistics_print(keywords)
+
+    result
+end
diff --git a/src/monomials/exponentvector.jl b/src/monomials/exponentvector.jl
index ef226b91..2876710e 100644
--- a/src/monomials/exponentvector.jl
+++ b/src/monomials/exponentvector.jl
@@ -85,35 +85,27 @@ function monom_is_supported_ordering(::Type{ExponentVector{T}}, ::O) where {T, O
     true
 end
 
-# DegRevLex monomial comparison
+# DegRevLex monomial comparison 
 function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_DegRevLex{true})
     @invariant length(ea) == length(eb)
     @invariant length(ea) > 1
-    if @inbounds monom_totaldeg(ea) < monom_totaldeg(eb)
+    if monom_totaldeg(ea) < monom_totaldeg(eb)
         return true
-    elseif @inbounds monom_totaldeg(ea) != monom_totaldeg(eb)
+    elseif monom_totaldeg(ea) != monom_totaldeg(eb)
         return false
     end
-    i = length(ea)
-    @inbounds while i > 2 && ea[i] == eb[i]
-        i -= 1
-    end
-    @inbounds if ea[i] <= eb[i]
-        return false
-    else
-        return true
-    end
+    _vec_cmp_revlex(ea, eb)
 end
 
-# DegRevLex monomial comparison, but on a range of variables
+# DegRevLex monomial comparison (shuffled variables)
 function monom_isless(
     ea::ExponentVector{T},
     eb::ExponentVector{T},
     ord::_DegRevLex{false}
 ) where {T}
-    indices = variable_indices(ord)
     @invariant length(ea) == length(eb)
     @invariant length(ea) > 1
+    indices = variable_indices(ord)
     eatotaldeg = zero(T)
     ebtotaldeg = zero(T)
     @inbounds for i in indices
@@ -136,30 +128,27 @@ function monom_isless(
     end
 end
 
-# DegLex exponent vector comparison
+# DegLex monomial comparison
 function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_DegLex{true})
     @invariant length(ea) == length(eb)
     @invariant length(ea) > 1
-    if @inbounds monom_totaldeg(ea) < monom_totaldeg(eb)
+    if monom_totaldeg(ea) < monom_totaldeg(eb)
         return true
-    elseif @inbounds monom_totaldeg(ea) != monom_totaldeg(eb)
+    elseif monom_totaldeg(ea) != monom_totaldeg(eb)
         return false
     end
-    i = 2
-    @inbounds while i < length(ea) && ea[i] == eb[i]
-        i += 1
-    end
-    @inbounds return ea[i] < eb[i] ? true : false
+    _vec_cmp_lex(ea, eb)
 end
 
+# DegLex monomial comparison (shuffled variables)
 function monom_isless(
     ea::ExponentVector{T},
     eb::ExponentVector{T},
     ord::_DegLex{false}
 ) where {T}
-    indices = variable_indices(ord)
     @invariant length(ea) == length(eb)
     @invariant length(ea) > 1
+    indices = variable_indices(ord)
     eatotaldeg = zero(T)
     ebtotaldeg = zero(T)
     @inbounds for i in indices
@@ -182,14 +171,10 @@ end
 function monom_isless(ea::ExponentVector, eb::ExponentVector, ::_Lex{true})
     @invariant length(ea) == length(eb)
     @invariant length(ea) > 1
-    i = 2
-    @inbounds while i < length(ea) && ea[i] == eb[i]
-        i += 1
-    end
-    @inbounds return ea[i] < eb[i] ? true : false
+    _vec_cmp_lex(ea, eb)
 end
 
-# Lex monomial comparison
+# Lex monomial comparison (shuffled variables)
 function monom_isless(ea::ExponentVector, eb::ExponentVector, ord::_Lex{false})
     indices = variable_indices(ord)
     @invariant length(ea) == length(eb)
@@ -264,11 +249,12 @@ function monom_lcm!(
     eb::ExponentVector{T}
 ) where {T}
     @invariant length(ec) == length(ea) == length(eb)
-    @inbounds ec[1] = zero(T)
+    s = zero(T)
     @inbounds for i in 2:length(ec)
         ec[i] = max(ea[i], eb[i])
-        ec[1] += ec[i]
+        s += ec[i]
     end
+    ec[1] = s
     _monom_overflow_check(ec)
     ec
 end
@@ -276,12 +262,7 @@ end
 # Checks if the gcd of monomials is constant.
 function monom_is_gcd_const(ea::ExponentVector{T}, eb::ExponentVector{T}) where {T}
     @invariant length(ea) == length(eb)
-    @inbounds for i in 2:length(ea)
-        if !iszero(ea[i]) && !iszero(eb[i])
-            return false
-        end
-    end
-    true
+    _vec_check_orth(ea, eb)
 end
 
 # Returns the product of monomials. Also writes the result to ec.
@@ -315,12 +296,7 @@ end
 # Checks monomial divisibility
 function monom_is_divisible(ea::ExponentVector{T}, eb::ExponentVector{T}) where {T}
     @invariant length(ea) == length(eb)
-    @inbounds for j in 1:length(ea)
-        if ea[j] < eb[j]
-            return false
-        end
-    end
-    true
+    _vec_not_any_lt(ea, eb)
 end
 
 # Checks monomial divisibility, AND performs division
@@ -330,12 +306,8 @@ function monom_is_divisible!(
     eb::ExponentVector{T}
 ) where {T}
     @invariant length(ec) == length(ea) == length(eb)
-    @inbounds for j in 1:length(ec)
-        if ea[j] < eb[j]
-            return false, ec
-        end
-        ec[j] = ea[j] - eb[j]
-    end
+    !monom_is_divisible(ea, eb) && return false, ec
+    monom_division!(ec, ea, eb)
     true, ec
 end
 
@@ -360,6 +332,7 @@ function monom_create_divmask(
     res = zero(Mask)
     o = one(Mask)
     if !compressed
+        # if the number of variables is <= 32
         @inbounds for i in 1:ndivvars
             for _ in 1:ndivbits
                 if e[i + 1] >= divmap[ctr]
@@ -369,6 +342,7 @@ function monom_create_divmask(
             end
         end
     else
+        # if the number of variables is > 32
         @invariant ndivbits == 1
         @invariant length(divmap) == 8 * sizeof(Mask)
         @inbounds for i in 1:length(divmap)
diff --git a/src/utils/invariants.jl b/src/utils/invariants.jl
index a7f64c4f..0cae07c3 100644
--- a/src/utils/invariants.jl
+++ b/src/utils/invariants.jl
@@ -35,7 +35,7 @@ macro unreachable()
 end
 
 ###
-# DANGER!!! DANGER!!! DANGER!!!
+# DANGER
 
 """
     unsafe_unreachable() --> Nothing
diff --git a/src/utils/keywords.jl b/src/utils/keywords.jl
index 44501807..d66aa49e 100644
--- a/src/utils/keywords.jl
+++ b/src/utils/keywords.jl
@@ -54,6 +54,7 @@ const _supported_kw_args = (
     fglm = (
         check       = false,
         monoms      = :dense,
+        statistics  = :no,
         loglevel    = _default_loglevel,
     ),
     groebner_learn = (
@@ -181,23 +182,6 @@ struct KeywordsHandler{Ord}
         Possible choices for keyword "statistics" are:
         `:no`, `:timings`, `:stats`, `:all`"""
 
-        # @log level = -2 """
-        #   In $function_key, using keywords: 
-        #   reduced    = $reduced, 
-        #   ordering   = $ordering, 
-        #   certify    = $certify, 
-        #   linalg     = $linalg, 
-        #   monoms     = $monoms, 
-        #   seed       = $seed, 
-        #   loglevel   = $loglevel, 
-        #   maxpairs   = $maxpairs,
-        #   selection  = $selection,
-        #   modular    = $modular,
-        #   check      = $check,
-        #   sweep      = $sweep
-        #   homogenize = $homogenize
-        #   statistics = $statistics"""
-
         new{typeof(ordering)}(
             reduced,
             ordering,
@@ -227,6 +211,7 @@ function logging_setup(keywords::KeywordsHandler)
 end
 
 function statistics_setup(keywords::KeywordsHandler)
+    log_simdinfo()
     if keywords.loglevel <= 0
         performance_counters_refresh()
         statistics_refresh()
diff --git a/src/utils/simd.jl b/src/utils/simd.jl
new file mode 100644
index 00000000..c13dbc50
--- /dev/null
+++ b/src/utils/simd.jl
@@ -0,0 +1,460 @@
+# This file is a part of Groebner.jl. License is GNU GPL v2.
+
+###
+# This file provides some very fast operations on vectors of integers.
+
+# NOTE. The use of these functions must be limited.
+#
+# Debugging LLVM IR is a headache. The LLVM IR textual format is not guaranteed
+# to be stable. 
+#
+# At this moment, we are grudgingly writing the LLVM IR, and we hope that the
+# bright future where the compiler does this for us is well within our grasp.
+
+# Functions in this file take their inspiration from
+# https://github.com/SciML/FindFirstFunctions.jl
+
+const BitInteger = Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8}
+
+function log_simdinfo()
+    @log level = -1 """
+      CPU: $(Symbol(cpu_name()))
+      ------------------------
+      fma_fast: \t$(Bool(fma_fast()))
+      registers:\t$(Int(register_count()))
+      simd width:\t<$(Int(pick_vector_width(Int32))) x i32>
+      """
+    nothing
+end
+
+j_to_llvm_t(_::Type{T}) where {T <: BitInteger} = "i$(8*sizeof(T))"
+typemax_saturated(_::Type{T}, N) where {T <: BitInteger} = typemax(T) ⊻ (N - 1)
+
+# Same as `HostCPUFeatures.pick_vector_width`, 
+# but also makes sure that N \in {8, 16, 32, 64}. 
+# If this is not possible, returns N = 1.
+function cutoff8_pick_vector_width(::Type{T}) where {T}
+    N = pick_vector_width(T)
+    if N in (8, 16, 32, 64)
+        return Int(N)
+    end
+    1
+end
+
+# Vector functions in this file assume that
+# - input vectors are non-negative.
+# - input vectors have the same length.
+
+# Returns false if a[i] < b[i] for ANY index i, and true otherwise.
+@inline @generated function _vec_not_any_lt(
+    a::Vector{T},
+    b::Vector{T},
+    offset::Int=1
+) where {T <: BitInteger}
+    N = cutoff8_pick_vector_width(T)
+
+    # Unfortunate case. Default to scalar code.
+    if N == 1
+        return quote
+            @inbounds for j in 1:length(a)
+                if a[j] < b[j]
+                    return false
+                end
+            end
+            return true
+        end
+    end
+
+    # The case when IntN exists.
+    @assert N in (8, 16, 32, 64)
+    B = sizeof(T)
+    llvm_t = j_to_llvm_t(T)
+    mask = typemax_saturated(Int, N)
+    textir = """
+    define i8 @entry(i64 %0, i64 %1, i64 %2) #0 {
+    top:
+        %a = inttoptr i64 %0 to $llvm_t*
+        %b = inttoptr i64 %1 to $llvm_t*
+        %lenm$(N-1) = add nsw i64 %2, -$(N-1)
+        %dosimditer = icmp ugt i64 %2, $(N-1)
+        br i1 %dosimditer, label %L9.lr.ph, label %L32
+    
+    L9.lr.ph:
+        %len$N = and i64 %2, $mask  ; divisible by N
+        br label %L9
+    
+    L9:
+        %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ]
+        %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i
+        %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i
+        %avi = bitcast $llvm_t* %api to <$N x $llvm_t>*
+        %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>*
+        %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B
+        %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B
+        %mask = icmp ult <$N x $llvm_t> %ai, %bi
+        %compressed = bitcast <$N x i1> %mask to i$N
+        %matchnotfound = icmp eq i$N %compressed, 0
+        br i1 %matchnotfound, label %L30, label %common.ret
+    
+    common.ret:
+        %retval = phi i8 [ 0, %L9 ], [ 1, %L32 ], [ 0, %L51 ], [ 1, %L67 ]
+        ret i8 %retval
+    
+    L30:
+        %vinc = add nuw nsw i64 %i, $N
+        %continue = icmp slt i64 %vinc, %lenm$(N-1)
+        br i1 %continue, label %L9, label %L32
+    
+    L32:
+        %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ]
+        %done = icmp eq i64 %cumi, %2
+        br i1 %done, label %common.ret, label %L51
+    
+    L51:
+        %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ]
+        %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si
+        %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si
+        %savi = load $llvm_t, $llvm_t* %sapi, align $B
+        %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B
+        %match = icmp ult $llvm_t %savi, %sbvi
+        br i1 %match, label %common.ret, label %L67
+    
+    L67:
+        %inc = add i64 %si, 1
+        %dobreak = icmp eq i64 %inc, %2
+        br i1 %dobreak, label %common.ret, label %L51
+    }
+    attributes #0 = { alwaysinline }
+    """
+    quote
+        GC.@preserve a b begin
+            Base.llvmcall(
+                ($textir, "entry"),
+                Bool,
+                Tuple{Ptr{T}, Ptr{T}, Int64},
+                Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset,
+                Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset,
+                length(a) - offset
+            )
+        end
+    end
+end
+
+# Returns true if a and b are orthogonal, and false otherwise.
+@inline @generated function _vec_check_orth(
+    a::Vector{T},
+    b::Vector{T},
+    offset::Int=1
+) where {T <: BitInteger}
+    N = cutoff8_pick_vector_width(T)
+
+    # Unfortunate case. Default to scalar code.
+    if N == 1
+        return quote
+            @inbounds for j in 1:length(a)
+                if !iszero(a[j]) && !iszero(b[j])
+                    return false
+                end
+            end
+            return true
+        end
+    end
+
+    # The case when IntN exists.
+    @assert N in (8, 16, 32, 64)
+    B = sizeof(T)
+    llvm_t = j_to_llvm_t(T)
+    mask = typemax_saturated(Int, N)
+    textir = """
+    define i8 @entry(i64 %0, i64 %1, i64 %2) #0 {
+    top:
+        %a = inttoptr i64 %0 to $llvm_t*
+        %b = inttoptr i64 %1 to $llvm_t*
+        %lenm$(N-1) = add nsw i64 %2, -$(N-1)
+        %dosimditer = icmp ugt i64 %2, $(N-1)
+        br i1 %dosimditer, label %L9.lr.ph, label %L32
+    
+    L9.lr.ph:
+        %len$N = and i64 %2, $mask  ; divisible by N
+        br label %L9
+    
+    L9:
+        %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ]
+        %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i
+        %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i
+        %avi = bitcast $llvm_t* %api to <$N x $llvm_t>*
+        %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>*
+        %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B
+        %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B
+        %mask1 = icmp ne <$N x $llvm_t> %ai, zeroinitializer
+        %mask2 = icmp ne <$N x $llvm_t> %bi, zeroinitializer
+        %mask3 = and <$N x i1> %mask1, %mask2
+        %compressed = bitcast <$N x i1> %mask3 to i$N
+        %matchnotfound1 = icmp eq i$N %compressed, 0
+        br i1 %matchnotfound1, label %L30, label %common.ret
+    
+    common.ret:
+        %retval = phi i8 [ 0, %L9 ], [ 1, %L32 ], [ 0, %L51 ], [ 1, %L67 ]
+        ret i8 %retval
+    
+    L30:
+        %vinc = add nuw nsw i64 %i, $N
+        %continue = icmp slt i64 %vinc, %lenm$(N-1)
+        br i1 %continue, label %L9, label %L32
+    
+    L32:
+        %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ]
+        %done = icmp eq i64 %cumi, %2
+        br i1 %done, label %common.ret, label %L51
+    
+    L51:
+        %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ]
+        %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si
+        %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si
+        %savi = load $llvm_t, $llvm_t* %sapi, align $B
+        %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B
+        %mask11 = icmp ne $llvm_t %savi, 0
+        %mask12 = icmp ne $llvm_t %sbvi, 0
+        %mask13 = and i1 %mask11, %mask12
+        %matchnotfound2 = icmp eq i1 %mask13, 0
+        br i1 %matchnotfound2, label %L67, label %common.ret
+    
+    L67:
+        %inc = add i64 %si, 1
+        %dobreak = icmp eq i64 %inc, %2
+        br i1 %dobreak, label %common.ret, label %L51
+    }
+    attributes #0 = { alwaysinline }
+    """
+    quote
+        GC.@preserve a b begin
+            Base.llvmcall(
+                ($textir, "entry"),
+                Bool,
+                Tuple{Ptr{T}, Ptr{T}, Int64},
+                Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset,
+                Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset,
+                length(a) - offset
+            )
+        end
+    end
+end
+
+# Returns true if a < b lexicographically, and false otherwise.
+@inline @generated function _vec_cmp_lex(
+    a::Vector{T},
+    b::Vector{T},
+    offset::Int=1
+) where {T <: BitInteger}
+    N = cutoff8_pick_vector_width(T)
+
+    # Unfortunate case. Default to scalar code.
+    if N == 1
+        return quote
+            i = 1 + offset
+            @inbounds while i < length(a) && a[i] == b[i]
+                i += 1
+            end
+            @inbounds return a[i] < b[i]
+        end
+    end
+
+    # The case when IntN exists.
+    @assert N in (8, 16, 32, 64)
+    B = sizeof(T)
+    llvm_t = j_to_llvm_t(T)
+    mask = typemax_saturated(Int, N)
+    textir = """
+    declare i$N @llvm.cttz.i$N(i$N, i1);
+    define i8 @entry(i64 %0, i64 %1, i64 %2) #0 {
+    top:
+        %a = inttoptr i64 %0 to $llvm_t*
+        %b = inttoptr i64 %1 to $llvm_t*
+        %lenm$(N-1) = add nsw i64 %2, -$(N-1)
+        %dosimditer = icmp ugt i64 %2, $(N-1)
+        br i1 %dosimditer, label %L9.lr.ph, label %L32
+
+    L9.lr.ph:
+        %len$N = and i64 %2, $mask  ; divisible by N
+        br label %L9
+    
+    L9:
+        %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ]
+        %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i
+        %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i
+        %avi = bitcast $llvm_t* %api to <$N x $llvm_t>*
+        %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>*
+        %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B
+        %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B
+        %mask = icmp ne <$N x $llvm_t> %ai, %bi
+        %compressed = bitcast <$N x i1> %mask to i$N
+        %matchfnotound = icmp eq i$N %compressed, 0
+        br i1 %matchfnotound, label %L30, label %L17
+
+    L17:
+        %tz$N = call i$N @llvm.cttz.i$N(i$N %compressed, i1 true)
+        %tz64 = zext i$N %tz$N to i64
+        %vis = add nuw i64 %i, %tz64
+        %sapi2 = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %vis
+        %sbpi2 = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %vis
+        %savi2 = load $llvm_t, $llvm_t* %sapi2, align $B
+        %sbvi2 = load $llvm_t, $llvm_t* %sbpi2, align $B
+        %flag1 = icmp ult $llvm_t %savi2, %sbvi2
+        br label %common.ret
+
+    common.ret:
+        %retflag = phi i1 [ %flag1, %L17 ], [ 0, %L32 ], [ %flag2, %L51 ], [ 0, %L67 ]
+        %retval = zext i1 %retflag to i8
+        ret i8 %retval
+
+    L30:
+        %vinc = add nuw nsw i64 %i, $N
+        %continue = icmp slt i64 %vinc, %lenm$(N-1)
+        br i1 %continue, label %L9, label %L32
+
+    L32:
+        %cumi = phi i64 [ 0, %top ], [ %len$N, %L30 ]
+        %done = icmp eq i64 %cumi, %2
+        br i1 %done, label %common.ret, label %L51
+
+    L51:
+        %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ]
+        %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si
+        %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si
+        %savi = load $llvm_t, $llvm_t* %sapi, align $B
+        %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B
+        %match = icmp eq $llvm_t %savi, %sbvi
+        %flag2 = icmp ult $llvm_t %savi, %sbvi
+        br i1 %match, label %L67, label %common.ret
+
+    L67:
+        %inc = add i64 %si, 1
+        %dobreak = icmp eq i64 %inc, %2
+        br i1 %dobreak, label %common.ret, label %L51
+    }
+    attributes #0 = { alwaysinline }
+    """
+    quote
+        GC.@preserve a b begin
+            Base.llvmcall(
+                ($textir, "entry"),
+                Bool,
+                Tuple{Ptr{T}, Ptr{T}, Int64},
+                Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset,
+                Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset,
+                length(a) - offset
+            )
+        end
+    end
+end
+
+# Returns true if a < b reversed lexicographically, and false otherwise.
+@inline @generated function _vec_cmp_revlex(
+    a::Vector{T},
+    b::Vector{T},
+    offset::Int=1
+) where {T <: BitInteger}
+    N = cutoff8_pick_vector_width(T)
+
+    # Unfortunate case. Default to scalar code.
+    if N == 1
+        return quote
+            i = length(a)
+            @inbounds while i > 1 + offset && a[i] == b[i]
+                i -= 1
+            end
+            @inbounds return a[i] > b[i]
+        end
+    end
+
+    # The case when IntN exists.
+    @assert N in (8, 16, 32, 64)
+    B = sizeof(T)
+    llvm_t = j_to_llvm_t(T)
+    mask = typemax_saturated(Int, N)
+    textir = """
+    declare i$N @llvm.ctlz.i$N(i$N, i1);
+    define i8 @entry(i64 %0, i64 %1, i64 %2) #0 {
+    top:
+        %a = inttoptr i64 %0 to $llvm_t*
+        %b = inttoptr i64 %1 to $llvm_t*
+        %lenm$N = add nsw i64 %2, -$N
+        %lenm1 = add nsw i64 %2, -1
+        %dosimditer = icmp sge i64 %2, $N
+        br i1 %dosimditer, label %L9.lr.ph, label %L32
+
+    L9.lr.ph:
+        %len$N = and i64 %2, $mask              ; divisible by N
+        %lenmlen$N = sub nsw i64 %lenm1, %len$N
+        br label %L9
+    
+    L9:
+        %i = phi i64 [ %lenm$N, %L9.lr.ph ], [ %vdec, %L30 ]
+        %api = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %i
+        %bpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %i
+        %avi = bitcast $llvm_t* %api to <$N x $llvm_t>*
+        %bvi = bitcast $llvm_t* %bpi to <$N x $llvm_t>*
+        %ai = load <$N x $llvm_t>, <$N x $llvm_t>* %avi, align $B
+        %bi = load <$N x $llvm_t>, <$N x $llvm_t>* %bvi, align $B
+        %mask = icmp ne <$N x $llvm_t> %ai, %bi
+        %compressed = bitcast <$N x i1> %mask to i$N
+        %matchfnotound = icmp eq i$N %compressed, 0
+        br i1 %matchfnotound, label %L30, label %L17
+
+    L17:
+        %tz$N = call i$N @llvm.ctlz.i$N(i$N %compressed, i1 true)
+        %tz$N.rev = sub i$N $(N-1), %tz$N
+        %tz64 = zext i$N %tz$N.rev to i64
+        %vis = add nsw i64 %i, %tz64
+        %sapi2 = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %vis
+        %sbpi2 = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %vis
+        %savi2 = load $llvm_t, $llvm_t* %sapi2, align $B
+        %sbvi2 = load $llvm_t, $llvm_t* %sbpi2, align $B
+        %flag1 = icmp ugt $llvm_t %savi2, %sbvi2
+        br label %common.ret
+
+    common.ret:
+        %retflag = phi i1 [ %flag1, %L17 ], [ 0, %L32 ], [ %flag2, %L51 ], [ 0, %L67 ]
+        %retval = zext i1 %retflag to i8
+        ret i8 %retval
+
+    L30:
+        %vdec = sub nsw i64 %i, $N
+        %continuesimd = icmp sge i64 %vdec, 0
+        br i1 %continuesimd, label %L9, label %L32
+
+    L32:
+        %cumi = phi i64 [ %lenm1, %top ], [ %lenmlen$N, %L30 ]
+        %done = icmp eq i64 %cumi, -1
+        br i1 %done, label %common.ret, label %L51
+
+    L51:
+        %si = phi i64 [ %dec, %L67 ], [ %cumi, %L32 ]
+        %sapi = getelementptr inbounds $llvm_t, $llvm_t* %a, i64 %si
+        %sbpi = getelementptr inbounds $llvm_t, $llvm_t* %b, i64 %si
+        %savi = load $llvm_t, $llvm_t* %sapi, align $B
+        %sbvi = load $llvm_t, $llvm_t* %sbpi, align $B
+        %match = icmp eq $llvm_t %savi, %sbvi
+        %flag2 = icmp ugt $llvm_t %savi, %sbvi
+        br i1 %match, label %L67, label %common.ret
+
+    L67:
+        %dec = sub nsw i64 %si, 1
+        %dobreak = icmp eq i64 %dec, -1
+        br i1 %dobreak, label %common.ret, label %L51
+    }
+    attributes #0 = { alwaysinline }
+    """
+    quote
+        GC.@preserve a b begin
+            Base.llvmcall(
+                ($textir, "entry"),
+                Bool,
+                Tuple{Ptr{T}, Ptr{T}, Int64},
+                Base.unsafe_convert(Ptr{T}, a) + sizeof(T) * offset,
+                Base.unsafe_convert(Ptr{T}, b) + sizeof(T) * offset,
+                length(a) - offset
+            )
+        end
+    end
+end
diff --git a/test/arithmetic/Zp.jl b/test/arithmetic/Zp.jl
index fa3122e1..359eba83 100644
--- a/test/arithmetic/Zp.jl
+++ b/test/arithmetic/Zp.jl
@@ -1,3 +1,4 @@
+import Primes
 
 @testset "arithmetic in Zp" begin
     m = UInt64(2^30 + 3)
@@ -16,19 +17,23 @@
     @test Groebner.n_spare_bits(a) == 2
     @test Groebner.n_safe_consecutive_additions(a) == 8
 
-    A, C = UInt64, UInt32
-    modulo = C(2^31 - 1)
-    for arithmetic in [
-        Groebner.ArithmeticZp(A, C, modulo),
-        Groebner.DelayedArithmeticZp(A, C, modulo),
-        Groebner.SignedArithmeticZp(signed(A), signed(C), signed(modulo)),
-        Groebner.SpecializedArithmeticZp(A, C, modulo)
-    ]
-        for _ in 1:1000
-            T = typeof(Groebner.divisor(arithmetic))
-            x = rand(T)
-            @test Groebner.mod_p(T(x), arithmetic) == Base.mod(x, T(modulo))
-            @test Groebner.inv_mod_p(T(x), arithmetic) == Base.invmod(T(x), T(modulo))
+    for (A, C) in [(UInt64, UInt32), (UInt32, UInt16), (UInt16, UInt8)]
+        modulo = Primes.prevprime(typemax(C) >> 2)
+        for arithmetic in [
+            Groebner.ArithmeticZp(A, C, modulo),
+            Groebner.DelayedArithmeticZp(A, C, modulo),
+            Groebner.SignedArithmeticZp(signed(A), signed(C), signed(modulo)),
+            Groebner.SpecializedArithmeticZp(A, C, modulo)
+        ]
+            for _ in 1:1000
+                T = typeof(Groebner.divisor(arithmetic))
+                x = abs(rand(T))
+                @test Groebner.mod_p(T(x), arithmetic) == Base.mod(x, T(modulo))
+                if gcd(modulo, T(x)) == 1
+                    @test Groebner.inv_mod_p(T(x), arithmetic) ==
+                          Base.invmod(T(x), T(modulo))
+                end
+            end
         end
     end
 end
diff --git a/test/monoms/monom_orders.jl b/test/monoms/monom_orders.jl
index f7de5f39..86ff1169 100644
--- a/test/monoms/monom_orders.jl
+++ b/test/monoms/monom_orders.jl
@@ -13,8 +13,35 @@ implementations_to_test = [
 ]
 
 @testset "monom orders: Lex, DegLex, DegRevLex" begin
-    for T in (UInt64, UInt32, UInt16)
+    x = [1, 1, 1, 1, 1, 0, 1, 1, 0, 1]
+    y = [1, 1, 1, 1, 1, 0, 1, 1, 0, 1]
+    a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, x)
+    b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, y)
+    @test !lex(a, b) && !dl(a, b) && !drl(a, b)
+
+    x = [1, 1, 1, 1, 1, 0, 1, 2, 0, 1]
+    y = [1, 1, 1, 1, 1, 0, 2, 1, 0, 1]
+    a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, x)
+    b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, y)
+    @test lex(a, b) && dl(a, b) && drl(a, b)
+
+    n = 12
+    for i in 2:n
+        xx = ones(Int32, n)
+        xx[i] = xx[i] + 1
+        yy = ones(Int32, n)
+        yy[i - 1] = yy[i - 1] + 1
+        a = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, xx)
+        b = monom_construct_from_vector(Groebner.ExponentVector{UInt32}, yy)
+        @test lex(a, b) && dl(a, b) && drl(a, b)
+    end
+
+    for T in (UInt64, UInt32, UInt16, UInt8)
         for EV in implementations_to_test
+            if EV{T} <: Groebner.AbstractPackedTuple{T, UInt8} && T == UInt8
+                continue
+            end
+
             a = monom_construct_from_vector(EV{T}, [0])
             b = monom_construct_from_vector(EV{T}, [0])
             @test !lex(a, b)
@@ -74,11 +101,9 @@ implementations_to_test = [
             @test dl(a, b)
             @test drl(a, b)
 
-            5 > Groebner.monom_max_vars(EV{T}) && continue
-            a = monom_construct_from_vector(EV{T}, ones(UInt, 5))
-            b = monom_construct_from_vector(EV{T}, ones(UInt, 5))
-            @test !lex(a, b)
-            @test !dl(a, b)
+            3 > Groebner.monom_max_vars(EV{T}) && continue
+            a = monom_construct_from_vector(EV{T}, [1, 5, 0])
+            b = monom_construct_from_vector(EV{T}, [1, 0, 1])
             @test !drl(a, b)
 
             25 > Groebner.monom_max_vars(EV{T}) && continue
@@ -98,7 +123,7 @@ implementations_to_test = [
 
         # test that different implementations agree
         for n in 1:10
-            k = rand(1:100)
+            k = rand(1:60)
 
             implementations_to_test_local =
                 filter(xxx -> Groebner.monom_max_vars(xxx{T}) >= k, implementations_to_test)
@@ -195,7 +220,7 @@ end
 end
 
 function test_orderings(n, v1, v2, ords_to_test)
-    R, x = QQ[["x$i" for i in 1:n]...]
+    R, x = AbstractAlgebra.QQ[["x$i" for i in 1:n]...]
     var_to_index = Dict(x .=> 1:n)
     for wo in ords_to_test
         ord = wo.ord
@@ -257,7 +282,7 @@ end
 @testset "monom orders: ProductOrdering" begin
     pv = Groebner.ExponentVector{T} where {T}
 
-    R, x = QQ[["x$i" for i in 1:3]...]
+    R, x = AbstractAlgebra.QQ[["x$i" for i in 1:3]...]
     v1 = Groebner.monom_construct_from_vector(pv{UInt64}, [1, 2, 3])
     v2 = Groebner.monom_construct_from_vector(pv{UInt64}, [3, 2, 1])
 
@@ -274,7 +299,7 @@ end
 
     test_orderings(3, v1, v2, ords_to_test)
 
-    R, x = QQ[["x$i" for i in 1:6]...]
+    R, x = AbstractAlgebra.QQ[["x$i" for i in 1:6]...]
     v1 = Groebner.monom_construct_from_vector(pv{UInt64}, [4, 1, 7, 0, 9, 8])
     v2 = Groebner.monom_construct_from_vector(pv{UInt64}, [1, 6, 3, 5, 9, 100])
 
diff --git a/test/reconstruction/ratrec.jl b/test/reconstruction/ratrec.jl
index a577dc59..7f038d61 100644
--- a/test/reconstruction/ratrec.jl
+++ b/test/reconstruction/ratrec.jl
@@ -1,32 +1,17 @@
-using Primes
+import Primes
 
 @testset "rational reconstruction" begin
-    # these are just small sanity checks 
     moduli =
         BigInt[107, 199, 509, 2^31 - 1, nextprime(BigInt(2)^100), nextprime(BigInt(2)^200)]
     numbers = [QQ(0), QQ(1), QQ(1, 2), QQ(3, 2), QQ(4, 5), QQ(1, 7), QQ(6)]
 
-    num, den, buf, buf1, buf2, buf3, u1, u2, u3, v1, v2, v3 = [BigInt(0) for _ in 1:13]
     for m in moduli
         bnd = Groebner.ratrec_reconstruction_bound(m)
         for a in numbers
-            ac = numerator(a) * invmod(denominator(a), m)
-            ar1 = Groebner.ratrec!(
-                num,
-                den,
-                bnd,
-                buf,
-                buf1,
-                buf2,
-                buf3,
-                u1,
-                u2,
-                u3,
-                v1,
-                v2,
-                v3,
-                ac,
-                m
+            ac = mod(numerator(a) * invmod(denominator(a), m), m)
+            success, (num, den) = Groebner.ratrec_nemo(
+                Groebner.Nemo.ZZRingElem(ac),
+                Groebner.Nemo.ZZRingElem(m)
             )
             @test Base.unsafe_rational(num, den) == a
         end
@@ -44,24 +29,12 @@ using Primes
         bnd = Groebner.ratrec_reconstruction_bound(m)
         for i in 1:samples
             a = BigInt(rand(0:(m - 1)))
-            success = Groebner.ratrec!(
-                num,
-                den,
-                bnd,
-                buf,
-                buf1,
-                buf2,
-                buf3,
-                u1,
-                u2,
-                u3,
-                v1,
-                v2,
-                v3,
-                a,
-                m
+            ac = mod(numerator(a) * invmod(denominator(a), m), m)
+            success, (num, den) = Groebner.ratrec_nemo(
+                Groebner.Nemo.ZZRingElem(ac),
+                Groebner.Nemo.ZZRingElem(m)
             )
-            if a < sqrt(m)
+            if a < sqrt(div(m, 2))
                 @test success
             end
             if success