Add support for lazy and low-storage operations

CliMA · Aug 16, 2024 · 252917a · 252917a
1 parent 91a30eb
commit 252917a
Show file tree

Hide file tree

Showing 11 changed files with 820 additions and 158 deletions.
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"
 
 [compat]
 julia = "1.10"
+StaticArrays = "1"
+
+[weakdeps]
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extensions]
+UnrolledUtilitiesStaticArraysExt = "StaticArrays"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -73,3 +73,9 @@ unroll when your use case is similar to a row in the first category.
 The table is also printed out by this package's unit tests, so these
 measurements can be compared across different operating systems by checking the
 [CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml).
+
+## Interface
+```@autodocs
+Modules = [UnrolledUtilities]
+Order   = [:function, :type]
+```
diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,10 @@
+module UnrolledUtilitiesStaticArraysExt
+
+import UnrolledUtilities
+import StaticArrays: SVector
+
+# Extend the UnrolledUtilities interface functions with methods for `SVector`:
+# the length is read from the type parameter `N`, and `SVector` is reported as
+# both the target output type and the output constructor.
+function UnrolledUtilities.length_from_type(::Type{<:SVector{N}}) where {N}
+    return N
+end
+
+function UnrolledUtilities.target_output_type(::SVector)
+    return SVector
+end
+
+function UnrolledUtilities.output_constructor(::Type{SVector})
+    return SVector
+end
+
+end
diff --git a/src/BitSequence.jl b/src/BitSequence.jl
@@ -0,0 +1,148 @@
+"""
+    BitSequence{N, [U]}(f)
+    BitSequence{N, [U]}([bit])
+
+A statically-sized counterpart of `BitVector` that stores `N` bits in a tuple
+of `Unsigned` chunks of type `U`. It can be built either from a function `f(n)`
+that returns the `n`-th bit, or from a constant `bit` that fills the sequence.
+When they are not specified, `U` is `UInt8` and `bit` is `false`.
+
+Specialized methods are available for `unrolled_map`, `unrolled_accumulate`,
+`unrolled_take`, and `unrolled_drop`; the ones for `unrolled_map` and
+`unrolled_accumulate` are only used when the results consist of `Bool`s. Every
+other unrolled function that has to construct a non-empty iterator converts
+`BitSequence`s into `Tuple`s.
+"""
+struct BitSequence{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: StaticSequence{N}
+    ints::I
+end
+
+# Fill in the tuple type parameter from the given chunks.
+function BitSequence{N, U}(
+    ints::I,
+) where {N, U <: Unsigned, I <: NTuple{<:Any, U}}
+    return BitSequence{N, U, I}(ints)
+end
+
+# Fall back to `UInt8` chunks when no chunk type is specified.
+function BitSequence{N}(args...) where {N}
+    return BitSequence{N, UInt8}(args...)
+end
+
+# Construct a sequence in which every chunk has either all of its bits set or
+# none of them, depending on the value of `bit`.
+function BitSequence{N, U}(bit::Bool = false) where {N, U}
+    chunk_count = cld(N, 8 * sizeof(U))
+    fill_chunk = if bit
+        ~zero(U)
+    else
+        zero(U)
+    end
+    return BitSequence{N, U}(ntuple(_ -> fill_chunk, Val(chunk_count)))
+end
+
+# Construct a sequence whose `n`-th bit is `f(n)`. Each chunk is assembled by
+# OR-ing together the bits at offsets `0` through `8 * sizeof(U) - 1`, so `f` is
+# evaluated at every bit position of every chunk, including any positions past
+# `N` in the last chunk.
+function BitSequence{N, U}(f) where {N, U}
+    bits_per_chunk = 8 * sizeof(U)
+    chunk_count = cld(N, bits_per_chunk)
+    bit_offsets = LazySequence{bits_per_chunk}(0)
+    ints = ntuple(Val(chunk_count)) do chunk_index
+        base_index = bits_per_chunk * (chunk_index - 1) + 1
+        unrolled_reduce(bit_offsets; init = zero(U)) do chunk, offset
+            new_bit = U(f(base_index + offset)::Bool)
+            chunk | new_bit << offset
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+# A `BitSequence` with chunk type `U` targets `BitSequence` outputs that have
+# the same chunk type and an unspecified length.
+function target_output_type(::BitSequence{<:Any, U}) where {U}
+    return BitSequence{<:Any, U}
+end
+
+# Any other output type wins over a `BitSequence` type, except `LazySequence`,
+# which yields to the `BitSequence` type.
+output_promote_rule(::Type{<:BitSequence}, ::Type{O}) where {O} = O
+output_promote_rule(::Type{<:BitSequence}, ::Type{Tuple}) = Tuple
+function output_promote_rule(
+    ::Type{B},
+    ::Type{LazySequence},
+) where {B <: BitSequence}
+    return B
+end
+
+# The element type restriction for `BitSequence` outputs is `Bool`.
+function eltype_restriction(::Type{<:BitSequence})
+    return Bool
+end
+
+# The empty output is a zero-length `BitSequence` with the same chunk type.
+function empty_output(::Type{BitSequence{<:Any, U}}) where {U}
+    return BitSequence{0, U}()
+end
+
+# Map `f` over `itrs` lazily, and then pack the resulting `Bool`s into a
+# `BitSequence` by indexing into the lazy map at each bit position.
+@inline function unrolled_map_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    f,
+    itrs...,
+) where {U}
+    mapped = lazy_map(f, itrs...)
+    bit_at = Base.Fix1(getindex, mapped)
+    return BitSequence{inferred_length(mapped), U}(bit_at)
+end
+
+# Accumulate `op` over `itr` and pack `transform` of each running value into
+# chunks of type `U`. The running value is passed from one chunk to the next
+# through the second entry of the outer accumulator. Every chunk is filled at
+# all of its bit positions, so `itr` is also indexed past `N` within the last
+# chunk.
+@inline function unrolled_accumulate_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    op,
+    itr,
+    init,
+    transform,
+) where {U}
+    N = inferred_length(itr)
+    if N == 0 && init isa NoInit
+        error("unrolled_accumulate requires an init value for empty iterators")
+    end
+    bits_per_chunk = 8 * sizeof(U)
+    chunk_count = cld(N, bits_per_chunk)
+    ints = unrolled_accumulate_into_tuple(
+        LazySequence{chunk_count}();
+        init = (nothing, init),
+        transform = first,
+    ) do (_, carried_value), chunk_index
+        base_index = bits_per_chunk * (chunk_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{bits_per_chunk}(0);
+            init = (zero(U), carried_value),
+        ) do (chunk, prev_value), offset
+            index = base_index + offset
+            item = itr[index]
+            # Without an init value, the first item seeds the accumulation.
+            new_value = if index == 1 && prev_value isa NoInit
+                item
+            else
+                op(prev_value, item)
+            end
+            new_bit = U(transform(new_value)::Bool)
+            (chunk | new_bit << offset, new_value)
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+# Keep the first `N` bits of `itr` by keeping only the leading chunks that are
+# needed to store them. The bits of the last kept chunk are left as they are.
+@inline function unrolled_take(
+    itr::BitSequence{<:Any, U},
+    ::Val{N},
+) where {N, U}
+    chunk_count = cld(N, 8 * sizeof(U))
+    return BitSequence{N, U}(unrolled_take(itr.ints, Val(chunk_count)))
+end
+
+# Drop the first `N` bits of `itr` and return the remaining `N_old - N` bits.
+# Chunks that lie entirely within the dropped prefix are discarded, and the
+# remaining chunks are shifted down by the leftover `bit_offset`, with the low
+# bits of each following chunk filling the vacated high bits.
+@inline function unrolled_drop(
+    itr::BitSequence{N_old, U},
+    ::Val{N},
+) where {N_old, N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N_old - N, n_bits_per_int)
+    # Only chunks that are completely covered by the dropped prefix can be
+    # discarded up front. Deriving this count from the output length instead
+    # makes `bit_offset` negative whenever the drop reduces the number of
+    # chunks without being chunk-aligned (e.g., dropping 3 of 10 bits with
+    # `UInt8` chunks), which loses bits that belong in the result.
+    n_dropped_ints = fld(N, n_bits_per_int)
+    bit_offset = N - n_bits_per_int * n_dropped_ints
+    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
+    ints = if bit_offset == 0
+        ints_without_offset
+    else
+        cur_ints = ints_without_offset
+        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
+        shifted_ints =
+            unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
+                if isnothing(next_int)
+                    cur_int >> bit_offset
+                else
+                    cur_int >> bit_offset |
+                    next_int << (n_bits_per_int - bit_offset)
+                end
+            end
+        # After the shift, the last chunk may no longer hold any of the
+        # remaining bits, so keep only the chunks that the result needs.
+        unrolled_take(shifted_ints, Val(n_ints))
+    end
+    return BitSequence{N_old - N, U}(ints)
+end
+
+# Convert the 1-based bit index `n` into the 1-based index of the chunk that
+# stores it and the 0-based offset of the bit within that chunk.
+@inline function int_index_and_bit_offset(itr, n)
+    bits_per_chunk = 8 * sizeof(eltype(itr.ints))
+    zero_based_index = n - 1
+    chunk_index = div(zero_based_index, bits_per_chunk) + 1
+    offset = rem(zero_based_index, bits_per_chunk)
+    return (chunk_index, offset)
+end
+
+# Read the `n`-th bit by shifting its chunk down and masking out the lowest
+# bit. This method does not compare `n` against the length of the sequence.
+@inline function Base.getindex(itr::BitSequence, n::Integer)
+    chunk_index, offset = int_index_and_bit_offset(itr, n)
+    chunk = itr.ints[chunk_index]
+    lowest_bit = chunk >> offset & one(chunk)
+    return !iszero(lowest_bit)
+end
+
+# Return a copy of `itr` whose `n`-th bit is `bit`. The bit is first cleared in
+# its chunk and then replaced with the new value; `itr` itself is not modified.
+@inline function Base.setindex(itr::BitSequence, bit::Bool, n::Integer)
+    chunk_index, offset = int_index_and_bit_offset(itr, n)
+    chunk = itr.ints[chunk_index]
+    mask = one(chunk) << offset
+    cleared_chunk = chunk & ~mask
+    updated_chunk = cleared_chunk | typeof(chunk)(bit) << offset
+    new_ints = Base.setindex(itr.ints, updated_chunk, chunk_index)
+    return typeof(itr)(new_ints)
+end
+
+# Every element of a `BitSequence` is a `Bool`.
+@inline function Base.eltype(::BitSequence)
+    return Bool
+end
diff --git a/src/LazyMap.jl b/src/LazyMap.jl
@@ -0,0 +1,43 @@
+"""
+    LazyMap{N}(f, itrs...)
+
+A lazy and statically-sized analogue of a `Base.AbstractBroadcasted` object
+whose values and `output_type` are consistent with `unrolled_map(f, itrs...)`.
+The length `N` must be specified explicitly when calling this constructor; use
+`lazy_map(f, itrs...)` to have it determined from `itrs`.
+
+Efficient methods are provided for `unrolled_take` and `unrolled_drop`. All
+other unrolled functions that need to construct non-empty iterators convert
+`LazyMap`s into their `output_type`s.
+"""
+struct LazyMap{N, F, I} <: StaticSequence{N}
+    f::F
+    itrs::I
+end
+
+# Fill in the type parameters for the function and the tuple of iterators.
+function LazyMap{N}(f, itrs...) where {N}
+    return LazyMap{N, typeof(f), typeof(itrs)}(f, itrs)
+end
+
+# A `LazyMap` targets the same output type as the corresponding eager map.
+function target_output_type(itr::LazyMap)
+    return output_type_of_map(itr.f, itr.itrs...)
+end
+
+# Call `f(itr, arg)` for every `itr` in `itrs`, collecting the results into a
+# `Tuple`.
+@inline function unrolled_fix2(f, arg, itrs)
+    f_with_arg = Base.Fix2(f, arg)
+    return unrolled_map_into_tuple(f_with_arg, itrs)
+end
+
+# Taking from a `LazyMap` takes the same number of items from each of its
+# underlying iterators, without evaluating `f`.
+@inline function unrolled_take(itr::LazyMap, ::Val{N}) where {N}
+    taken_itrs = unrolled_fix2(unrolled_take, Val(N), itr.itrs)
+    return LazyMap{N}(itr.f, taken_itrs...)
+end
+
+# Dropping from a `LazyMap` drops the same number of items from each of its
+# underlying iterators, without evaluating `f`.
+@inline function unrolled_drop(itr::LazyMap{N_old}, ::Val{N}) where {N_old, N}
+    remaining_itrs = unrolled_fix2(unrolled_drop, Val(N), itr.itrs)
+    return LazyMap{N_old - N}(itr.f, remaining_itrs...)
+end
+
+# The `n`-th value is `f` applied to the `n`-th item of each iterator.
+@inline function Base.getindex(itr::LazyMap, n::Integer)
+    items = unrolled_fix2(getindex, n, itr.itrs)
+    return itr.f(items...)
+end
+
+# The element type is the result type of `f` for the iterators' element types.
+@inline function Base.eltype(itr::LazyMap)
+    item_types = unrolled_map_into_tuple(eltype, itr.itrs)
+    return result_type(itr.f, item_types...)
+end
+
+################################################################################
+
+# The single-iterator method is defined separately from the variadic one in
+# order to avoid an infinite recursion through minimum_length.
+@inline function lazy_map(f, itr)
+    return LazyMap{inferred_length(itr)}(f, itr)
+end
+@inline function lazy_map(f, itrs...)
+    return LazyMap{minimum_length(itrs...)}(f, itrs...)
+end
+
+# Lazily combine the items of `itrs` into tuples.
+@inline function lazy_zip(itrs...)
+    return lazy_map(tuple, itrs...)
+end
+
+# Lazily zip `itrs` together with a `LazySequence` of the same minimum length.
+@inline function lazy_enumerate(itrs...)
+    indices = LazySequence{minimum_length(itrs...)}()
+    return lazy_zip(indices, itrs...)
+end
diff --git a/src/LazySequence.jl b/src/LazySequence.jl
@@ -0,0 +1,34 @@
+"""
+    LazySequence{N}(f)
+    LazySequence{N}([start])
+
+A lazy counterpart of `ntuple(f, Val(N))`, or a lazy and statically-sized
+counterpart of the range `start:(start - 1 + N)`. When `start` is not
+specified, it is set to 1.
+
+Specialized methods are available for `unrolled_take` and `unrolled_drop`.
+Every other unrolled function that has to construct a non-empty iterator
+converts `LazySequence`s into `Tuple`s.
+"""
+struct LazySequence{N, F} <: StaticSequence{N}
+    f::F
+end
+
+# Without an argument, the `n`-th item of the sequence is `n` itself.
+function LazySequence{N}(f = identity) where {N}
+    return LazySequence{N, typeof(f)}(f)
+end
+
+# With a numeric `start`, the `n`-th item of the sequence is `start - 1 + n`.
+function LazySequence{N}(start::Number) where {N}
+    offset = start - one(start)
+    return LazySequence{N}(Base.Fix1(+, offset))
+end
+
+# A `LazySequence` targets `LazySequence` outputs.
+function target_output_type(::LazySequence)
+    return LazySequence
+end
+
+# Any other output type takes precedence over `LazySequence`.
+function output_promote_rule(::Type{LazySequence}, ::Type{O}) where {O}
+    return O
+end
+function output_promote_rule(::Type{LazySequence}, ::Type{Tuple})
+    return Tuple
+end
+
+# The empty output is a zero-length `LazySequence`.
+function empty_output(::Type{LazySequence})
+    return LazySequence{0}()
+end
+
+# Taking the first `N` items only changes the length; `f` is reused as it is.
+@inline function unrolled_take(itr::LazySequence, ::Val{N}) where {N}
+    return LazySequence{N}(itr.f)
+end
+
+# Dropping the first `N` items shifts every index passed to `f` up by `N`.
+@inline function unrolled_drop(
+    itr::LazySequence{N_old},
+    ::Val{N},
+) where {N_old, N}
+    return LazySequence{N_old - N}() do n
+        itr.f(n + N)
+    end
+end
+
+# The `n`-th item is computed on demand by calling `f(n)`.
+@inline function Base.getindex(itr::LazySequence, n::Integer)
+    return itr.f(n)
+end
+
+# The element type is the result type of `f` for an `Int` argument.
+@inline function Base.eltype(itr::LazySequence)
+    return result_type(itr.f, Int)
+end