Add support for lazy and low-storage operations

CliMA · Aug 20, 2024 · f8a4f2e · f8a4f2e
1 parent 91a30eb
commit f8a4f2e
Show file tree

Hide file tree

Showing 11 changed files with 917 additions and 191 deletions.
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"
 
 [compat]
 julia = "1.10"
+StaticArrays = "1"
+
+[weakdeps]
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extensions]
+UnrolledUtilitiesStaticArraysExt = "StaticArrays"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,16 +1,22 @@
 #  UnrolledUtilities.jl
 
-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined:
 - `unrolled_any(f, itr)`: similar to `any`
 - `unrolled_all(f, itr)`: similar to `all`
 - `unrolled_foreach(f, itrs...)`: similar to `foreach`
 - `unrolled_map(f, itrs...)`: similar to `map`
 - `unrolled_reduce(op, itr; [init])`: similar to `reduce`
 - `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
-- `unrolled_zip(itrs...)`: similar to `zip`
-- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
-  handle multiple iterators
+- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
+  but with an optional `transform` function applied to every accumulated value
+- `unrolled_mapaccumulate(f, op, itrs...; [init], [transform])`: a combination
+  of `unrolled_map` and `unrolled_accumulate`, analogous to `unrolled_mapreduce`
 - `unrolled_in(item, itr)`: similar to `in`
+- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
+- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
 - `unrolled_unique(itr)`: similar to `unique`
 - `unrolled_filter(f, itr)`: similar to `filter`
 - `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but
@@ -42,33 +48,80 @@ iterators have singleton element types (and when the result of calling `f`
 and/or `op` on these elements is inferrable). However, they can also be much
 more expensive to compile than their counterparts from `Base` and
 `Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than their non-static counterparts
+(e.g., `Iterators.take` and `Iterators.drop`).
+
+## Interface
+
+```@meta
+CurrentModule = UnrolledUtilities
+```
+
+These functions can be used to unroll loops over any iterators with statically
+inferrable lengths. To facilitate this, `UnrolledUtilities` provides the
+following interface:
+
+```@docs
+length_from_type
+target_output_type
+output_promote_rule
+eltype_restriction
+output_constructor
+empty_output
+```
+
+This interface is used to add built-in compatibility for `Tuple`s, `SVector`s,
+and some new types of lazy and low-storage iterators:
+
+```@docs
+StaticSequence
+LazyMap
+LazySequence
+BitSequence
+```
+
+## Lazy Functions
+
+In addition to the unrolled functions listed above, this package also exports
+several lazy functions, each of which stores its output in a `LazyMap`:
+- `lazy_map(f, itrs...)`: similar to `map`
+- `lazy_zip(itrs...)`: similar to `zip`
+- `lazy_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
+  handle multiple iterators
+
+When used in conjunction with a `LazySequence` or `BitSequence`, these functions
+can result in significantly lower register pressure than `unrolled_map` or
+similarly unrolled versions of `zip` and `enumerate`.
+
+## When to Unroll
 
 For a more precise indication of whether you should use `UnrolledUtilities`,
-please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+please consult the autogenerated [Comparison Table](@ref). For a comprehensive
+set of potential use cases, this table contains measurements that summarize
+performance, compilation, and allocations:
+- overall level of optimization (type stability, constant propagation, etc.)
+- run time (best of several trial measurements)
+- compilation time (as measured directly by the compiler)
+- memory allocations during compilation and first run (as measured by the
+  garbage collector and, when available, the operating system's resident set
+  size estimator)
 
 The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+- green indicates an improvement in performance and either an improvement or
+  no change in compilation and allocations
+- dark blue indicates an improvement in performance and either slower
+  compilation or more allocations
+- light blue indicates no change in performance and either faster compilation or
+  fewer allocations
+- magenta indicates no change in performance and either faster compilation with
+  more allocations or slower compilation with fewer allocations
+- yellow indicates no change in performance, compilation, or allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more allocations
 
 Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+whereas those highlighted in magenta, yellow, and red either have no clear
+advantage or have a clear disadvantage. It is recommended that you only unroll
+when your use case is similar to a row in the first category.
 
 The table is also printed out by this package's unit tests, so these
 measurements can be compared across different operating systems by checking the

diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,10 @@
+module UnrolledUtilitiesStaticArraysExt
+
+import UnrolledUtilities
+import StaticArrays: SVector
+
+# The statically known length of an `SVector` is its first type parameter.
+function UnrolledUtilities.length_from_type(::Type{<:SVector{N}}) where {N}
+    return N
+end
+
+# Report `SVector` as the target output type of any `SVector`.
+function UnrolledUtilities.target_output_type(::SVector)
+    return SVector
+end
+
+# Use `SVector` itself as the constructor for the `SVector` output type.
+function UnrolledUtilities.output_constructor(::Type{SVector})
+    return SVector
+end
+
+end
diff --git a/src/BitSequence.jl b/src/BitSequence.jl
@@ -0,0 +1,152 @@
+"""
+    BitSequence{N, [U]}(f)
+    BitSequence{N, [U]}([bit])
+
+A statically-sized counterpart of `BitVector` whose `N` bits are packed into
+`Unsigned` chunks of type `U`. It can be built either from a function `f(n)` or
+from a single constant `bit`. When they are not specified, `U` defaults to
+`UInt8` and `bit` defaults to `false`.
+
+`unrolled_map`, `unrolled_accumulate`, `unrolled_take`, and `unrolled_drop` have
+efficient methods for `BitSequence`s, but the ones for `unrolled_map` and
+`unrolled_accumulate` are only used when the output consists of `Bool`s. Every
+other unrolled function that needs to construct a non-empty iterator converts
+a `BitSequence` into a `Tuple`.
+"""
+struct BitSequence{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: StaticSequence{N}
+    ints::I # tuple of chunks of type `U` in which the bits are stored
+end
+
+# Infer the tuple type parameter from the chunks themselves.
+function BitSequence{N, U}(
+    ints::I,
+) where {N, U <: Unsigned, I <: NTuple{<:Any, U}}
+    return BitSequence{N, U, I}(ints)
+end
+
+# Fall back to `UInt8` chunks when no chunk type is specified.
+function BitSequence{N}(args...) where {N}
+    return BitSequence{N, UInt8}(args...)
+end
+
+# Construct a `BitSequence` in which every chunk has all of its bits set to
+# `bit` (`false` by default).
+function BitSequence{N, U}(bit::Bool = false) where {N, U}
+    chunk_count = cld(N, 8 * sizeof(U))
+    chunk = if bit
+        ~zero(U)
+    else
+        zero(U)
+    end
+    return BitSequence{N, U}(ntuple(_ -> chunk, Val(chunk_count)))
+end
+
+# Construct a `BitSequence` whose `n`-th bit is `f(n)` for every `n` from 1 to
+# `N`. Each chunk is assembled by combining its shifted bits with `|`, and `f`
+# must return a `Bool` for every index at which it is called.
+function BitSequence{N, U}(f) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = ntuple(Val(n_ints)) do int_index
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{n_bits_per_int}(0);
+            init = zero(U),
+        ) do int, bit_offset
+            index = first_index + bit_offset
+            # The last chunk may extend past N. Leave its padding bits unset
+            # instead of calling f with an index that is out of range.
+            bit = index <= N ? f(index)::Bool : false
+            int | U(bit) << bit_offset
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+# The target output type keeps the chunk type but leaves the length free.
+function target_output_type(::BitSequence{<:Any, U}) where {U}
+    return BitSequence{<:Any, U}
+end
+
+# Promoting a `BitSequence` type with another output type yields the other
+# type, except that promoting it with `LazySequence` yields the `BitSequence`
+# type.
+function output_promote_rule(::Type{B}, ::Type{O}) where {B <: BitSequence, O}
+    return O
+end
+function output_promote_rule(::Type{B}, ::Type{Tuple}) where {B <: BitSequence}
+    return Tuple
+end
+function output_promote_rule(
+    ::Type{B},
+    ::Type{LazySequence},
+) where {B <: BitSequence}
+    return B
+end
+
+function eltype_restriction(::Type{<:BitSequence})
+    return Bool
+end
+
+function empty_output(::Type{BitSequence{<:Any, U}}) where {U}
+    return BitSequence{0, U}()
+end
+
+# Map `f` over `itrs` lazily, and then pack the resulting values into a
+# `BitSequence` with chunks of type `U`.
+@inline function unrolled_map_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    f,
+    itrs...,
+) where {U}
+    mapped_itr = lazy_map(f, itrs...)
+    bit_at_index = Base.Fix1(getindex, mapped_itr)
+    return BitSequence{inferred_length(mapped_itr), U}(bit_at_index)
+end
+
+# Accumulate the items of `itr` with `op`, apply `transform` to every
+# accumulated value, and pack the resulting `Bool`s into a `BitSequence` with
+# chunks of type `U`. The outer accumulation runs over the chunks and passes
+# the latest accumulated value from each chunk to the next one; the inner
+# reduction fills in the bits of a single chunk. An error is thrown when `itr`
+# is empty and `init` is a `NoInit`.
+@inline function unrolled_accumulate_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    op,
+    itr,
+    init,
+    transform,
+) where {U}
+    N = inferred_length(itr)
+    (N == 0 && init isa NoInit) &&
+        error("unrolled_accumulate requires an init value for empty iterators")
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = unrolled_accumulate_into_tuple(
+        LazySequence{n_ints}();
+        init = (nothing, init),
+        transform = first,
+    ) do (_, init_value_for_new_int), int_index
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{n_bits_per_int}(0);
+            init = (zero(U), init_value_for_new_int),
+        ) do (int, prev_value), bit_offset
+            index = first_index + bit_offset
+            if index > N
+                # The last chunk may extend past N. Leave its padding bits
+                # unset instead of reading items that are out of range.
+                (int, prev_value)
+            else
+                item = itr[index]
+                new_value =
+                    index == 1 && prev_value isa NoInit ? item :
+                    op(prev_value, item)
+                (int | U(transform(new_value)::Bool) << bit_offset, new_value)
+            end
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+# Take the first `N` bits by keeping only the chunks required to store them.
+@inline function unrolled_take(
+    itr::BitSequence{<:Any, U},
+    ::Val{N},
+) where {N, U}
+    chunk_count = cld(N, 8 * sizeof(U))
+    kept_chunks = unrolled_take(itr.ints, Val(chunk_count))
+    return BitSequence{N, U}(kept_chunks)
+end
+
+# Drop the first `N` bits: discard every chunk that consists solely of dropped
+# bits, shift the remaining bits down to the start of the first kept chunk, and
+# trim any chunk that is left over at the end.
+@inline function unrolled_drop(
+    itr::BitSequence{N_old, U},
+    ::Val{N},
+) where {N_old, N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N_old - N, n_bits_per_int)
+    # Only whole chunks can be discarded, which keeps the offset in the range
+    # 0:(n_bits_per_int - 1). Deriving the number of dropped chunks from n_ints
+    # can discard a chunk that still holds kept bits and produce a negative
+    # offset (e.g., when dropping 7 of the 10 bits stored in two UInt8s).
+    n_dropped_ints = fld(N, n_bits_per_int)
+    bit_offset = N - n_bits_per_int * n_dropped_ints
+    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
+    shifted_ints = if bit_offset == 0
+        ints_without_offset
+    else
+        cur_ints = ints_without_offset
+        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
+        unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
+            isnothing(next_int) ? cur_int >> bit_offset :
+            cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
+        end
+    end
+    # After the shift, the last chunk may no longer hold any of the kept bits.
+    ints = unrolled_take(shifted_ints, Val(n_ints))
+    return BitSequence{N_old - N, U}(ints)
+end
+
+# Convert the bit index `n` into the index of the chunk that stores the bit and
+# the offset of the bit within that chunk.
+@inline function int_index_and_bit_offset(itr, n)
+    bits_per_chunk = 8 * sizeof(eltype(itr.ints))
+    chunk_offset, offset_in_chunk = divrem(n - 1, bits_per_chunk)
+    return (chunk_offset + 1, offset_in_chunk)
+end
+
+# Read the `n`-th bit by shifting it to the lowest position of its chunk and
+# masking out all of the other bits.
+Base.@propagate_inbounds function Base.getindex(itr::BitSequence, n::Integer)
+    chunk_index, shift = int_index_and_bit_offset(itr, n)
+    chunk = itr.ints[chunk_index]
+    return isone(chunk >> shift & one(chunk))
+end # TODO: Is @propagate_inbounds helpful here, or would @inbounds be enough?
+
+# Return a copy of `itr` in which the `n`-th bit is replaced by `bit`: the old
+# bit is cleared from its chunk, and then the new bit is inserted.
+Base.@propagate_inbounds function Base.setindex(
+    itr::BitSequence,
+    bit::Bool,
+    n::Integer,
+)
+    chunk_index, shift = int_index_and_bit_offset(itr, n)
+    old_chunk = itr.ints[chunk_index]
+    cleared_chunk = old_chunk & ~(one(old_chunk) << shift)
+    new_chunk = cleared_chunk | typeof(old_chunk)(bit) << shift
+    new_chunks = Base.setindex(itr.ints, new_chunk, chunk_index)
+    return typeof(itr)(new_chunks)
+end
+
+@inline function Base.eltype(::BitSequence)
+    return Bool
+end
diff --git a/src/LazyMap.jl b/src/LazyMap.jl
@@ -0,0 +1,55 @@
+"""
+    LazyMap{N}(f, itrs...)
+
+A lazy, statically-sized counterpart of a `Base.AbstractBroadcasted` object.
+Its values and its `output_type` are consistent with those of
+`unrolled_map(f, itrs...)`.
+
+`unrolled_take` and `unrolled_drop` have efficient methods for `LazyMap`s.
+Every other unrolled function that needs to construct a non-empty iterator
+converts a `LazyMap` into its `output_type`.
+"""
+struct LazyMap{N, F, I} <: StaticSequence{N}
+    f::F # function that is applied to the items of the iterators
+    itrs::I # tuple of iterators
+end
+
+# Infer the function and iterator type parameters from the arguments.
+function LazyMap{N}(f, itrs...) where {N}
+    return LazyMap{N, typeof(f), typeof(itrs)}(f, itrs)
+end
+
+function target_output_type(itr::LazyMap)
+    return output_type_of_map(itr.f, itr.itrs...)
+end
+
+# Ignore eltype restrictions during the promotion process, until the final step.
+function target_output_type_for_promotion(itr::LazyMap)
+    return promoted_target_output_type(itr.itrs)
+end
+
+# Call `f(itr, arg)` for every `itr` in `itrs`, and collect the results in a
+# `Tuple`.
+@inline function unrolled_fix2(f, arg, itrs)
+    return unrolled_map_into_tuple(itrs) do itr
+        @inline
+        f(itr, arg)
+    end
+end
+
+# Taking or dropping items from a `LazyMap` is done by taking or dropping them
+# from each of its iterators.
+@inline function unrolled_take(itr::LazyMap, ::Val{N}) where {N}
+    taken_itrs = unrolled_fix2(unrolled_take, Val(N), itr.itrs)
+    return LazyMap{N}(itr.f, taken_itrs...)
+end
+
+@inline function unrolled_drop(itr::LazyMap{N_old}, ::Val{N}) where {N_old, N}
+    remaining_itrs = unrolled_fix2(unrolled_drop, Val(N), itr.itrs)
+    return LazyMap{N_old - N}(itr.f, remaining_itrs...)
+end
+
+# Work around the recursion limit for getindex to handle chains of LazyMaps.
+@inline function Base.getindex(itr::LazyMap, n::Integer)
+    return lazy_map_getindex(itr, n)
+end
+@inline function lazy_map_getindex(itr, n)
+    return getindex(itr, n)
+end
+@inline function lazy_map_getindex(itr::LazyMap, n)
+    items = unrolled_fix2(lazy_map_getindex, n, itr.itrs)
+    return itr.f(items...)
+end
+# This loop must stay after the method definitions above, since it only
+# modifies the methods that have already been defined.
+@static if hasfield(Method, :recursion_relation)
+    for method in methods(lazy_map_getindex)
+        method.recursion_relation = (_...) -> true
+    end
+end
+
+# Infer the result type of calling `f` on items with the eltypes of the
+# iterators.
+@inline function Base.eltype(itr::LazyMap)
+    item_types = unrolled_map_into_tuple(eltype, itr.itrs)
+    return Base.promote_op(itr.f, item_types...)
+end
+
+################################################################################
+
+# The single-iterator method avoids the recursion
+# lazy_map → minimum_length → lazy_map.
+@inline function lazy_map(f, itr)
+    return LazyMap{inferred_length(itr)}(f, itr)
+end
+@inline function lazy_map(f, itrs...)
+    return LazyMap{minimum_length(itrs)}(f, itrs...)
+end
+
+@inline function lazy_zip(itrs...)
+    return lazy_map(tuple, itrs...)
+end
+
+@inline function lazy_enumerate(itrs...)
+    indices = LazySequence{minimum_length(itrs)}()
+    return lazy_zip(indices, itrs...)
+end
diff --git a/src/LazySequence.jl b/src/LazySequence.jl
@@ -0,0 +1,34 @@
+"""
+    LazySequence{N}(f)
+    LazySequence{N}([start])
+
+A lazy counterpart of `ntuple(f, Val(N))`, or a lazy, statically-sized
+counterpart of `start:(start - 1 + N)`. When `start` is not specified, it
+defaults to 1.
+
+`unrolled_take` and `unrolled_drop` have efficient methods for `LazySequence`s.
+Every other unrolled function that needs to construct a non-empty iterator
+converts a `LazySequence` into a `Tuple`.
+"""
+struct LazySequence{N, F} <: StaticSequence{N}
+    f::F # function that maps an index to the item at that index
+end
+
+# Infer the function type parameter from `f`, which defaults to `identity`.
+function LazySequence{N}(f = identity) where {N}
+    return LazySequence{N, typeof(f)}(f)
+end
+
+# A sequence that begins at `start` adds `start - 1` to every index.
+function LazySequence{N}(start::Number) where {N}
+    offset = start - one(start)
+    return LazySequence{N}(Base.Fix1(+, offset))
+end
+
+function target_output_type(::LazySequence)
+    return LazySequence
+end
+
+# Promoting `LazySequence` with another output type yields the other type.
+function output_promote_rule(::Type{LazySequence}, ::Type{O}) where {O}
+    return O
+end
+function output_promote_rule(::Type{LazySequence}, ::Type{Tuple})
+    return Tuple
+end
+
+function empty_output(::Type{LazySequence})
+    return LazySequence{0}()
+end
+
+# Taking items only changes the length, since the function is unchanged.
+@inline function unrolled_take(itr::LazySequence, ::Val{N}) where {N}
+    return LazySequence{N}(itr.f)
+end
+
+# Dropping `N` items shifts every index that is passed to the function by `N`.
+@inline function unrolled_drop(
+    itr::LazySequence{N_old},
+    ::Val{N},
+) where {N_old, N}
+    shifted_f = n -> itr.f(n + N)
+    return LazySequence{N_old - N}(shifted_f)
+end
+
+# The item at index `n` is computed on demand by calling the stored function.
+@inline function Base.getindex(itr::LazySequence, n::Integer)
+    return itr.f(n)
+end
+
+# Infer the result type of calling the stored function on an `Int` index.
+@inline function Base.eltype(itr::LazySequence)
+    return Base.promote_op(itr.f, Int)
+end